package cumulative

import (
	"fmt"

	"go.signoz.io/signoz/pkg/query-service/app/metrics/v4/helpers"
	"go.signoz.io/signoz/pkg/query-service/constants"
	v3 "go.signoz.io/signoz/pkg/query-service/model/v3"
	"go.signoz.io/signoz/pkg/query-service/utils"
)

// See https://clickhouse.com/docs/en/sql-reference/window-functions for more details on the `lagInFrame` function.
//
// Calculating the rate of change of a metric is a common use case.
// Requests and errors are two examples of metrics that are often expressed as a rate of change.
// The rate of change is the difference between the current value and the previous value divided by
// the time difference between the current and previous values (i.e. the time interval).
//
// The value of a cumulative counter always increases. However, the rate of change can be negative
// if the value decreases between two samples. This can happen if the counter is reset when the
// application restarts or if the counter is reset manually. In this case, the rate of change is
// not meaningful and should be ignored.
//
// The condition `(per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window) < 0`
// checks if the rate of change is negative. If it is negative, the value is replaced with `nan`.
//
// The condition `(ts - lagInFrame(ts, 1, toDate('1970-01-01')) OVER rate_window) >= 86400` checks
// if the time difference between the current and previous values is greater than or equal to 1 day.
// The first sample of a metric is always `nan` because there is no previous value to compare it to.
// When the first sample is encountered, the previous timestamp defaults to `1970-01-01`.
// Since any difference between the first sample's timestamp and that default will be
// greater than or equal to 1 day, the rate of change for the first sample will be `nan`.
//
// If neither of the above conditions is true, the rate of change is calculated as
// `(per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window) / (ts - lagInFrame(ts, 1, toDate('1970-01-01')) OVER rate_window)`
// where `rate_window` is a window that partitions the data by fingerprint and orders it by timestamp.
// We want to calculate the rate of change for each time series, so we partition the data by fingerprint.
//
// The `increase` function is similar to the `rate` function, except that it does not divide by the time interval.
const (
	rateWithoutNegative     = `If((per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window) < 0, nan, If((ts - lagInFrame(ts, 1, toDate('1970-01-01')) OVER rate_window) >= 86400, nan, (per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window) / (ts - lagInFrame(ts, 1, toDate('1970-01-01')) OVER rate_window)))`
	increaseWithoutNegative = `If((per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window) < 0, nan, If((ts - lagInFrame(ts, 1, toDate('1970-01-01')) OVER rate_window) >= 86400, nan, (per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window)))`
)
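
// For example, for one series with per-interval samples (ts, per_series_value) of
// (01:00:00, 100), (01:01:00, 160), (01:02:00, 40), the rate expression above evaluates to:
//
//	01:00:00 -> nan  (no previous sample; the default 1970-01-01 timestamp trips the 1-day guard)
//	01:01:00 -> (160 - 100) / 60 = 1 per second
//	01:02:00 -> nan  (40 - 160 < 0, i.e. the counter was reset)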

// prepareTimeAggregationSubQuery prepares the sub-query to be used for temporal
// aggregation of time series data.
//
// The following example illustrates how the sub-query is used to calculate the sum of
// values for each time series in a 15-second interval:
//
// ```
// timestamp  01.00  01.05  01.10  01.15  01.20  01.25  01.30  01.35  01.40
//           +------+------+------+------+------+------+------+------+------+
//           |      |      |      |      |      |      |      |      |      |
//           |  v1  |  v2  |  v3  |  v4  |  v5  |  v6  |  v7  |  v8  |  v9  |
//           |      |      |      |      |      |      |      |      |      |
//           +------+------+------+------+------+------+------+------+------+
//
//           +------+             +------+             +------+
//           | v1+  |             | v4+  |             | v7+  |
//           | v2+  |             | v5+  |             | v8+  |
//           | v3   |             | v6   |             | v9   |
//           +------+             +------+             +------+
//            01.00                01.15                01.30
// ```
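//
// Concretely, for a sum temporal aggregation with step = 15, the generated sub-query
// looks roughly like the following (a schematic sketch: label columns are elided, the
// metric name is a made-up example, and the table names assume the default constants):
//
//	SELECT fingerprint,
//	       toStartOfInterval(toDateTime(intDiv(unix_milli, 1000)), INTERVAL 15 SECOND) as ts,
//	       sum(value) as per_series_value
//	FROM signoz_metrics.distributed_samples_v4
//	INNER JOIN (<filtered time series sub-query>) as filtered_time_series USING fingerprint
//	WHERE metric_name = 'http_requests_total' AND unix_milli >= <start> AND unix_milli < <end>
//	GROUP BY fingerprint, ts
//	ORDER BY fingerprint, ts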
//
// Calculating the rate/increase involves an additional step. We first calculate the
// maximum value for each time series in a 15-second interval. Then, we calculate the
// difference between the current maximum value and the previous maximum value.
//
// The following example illustrates how the sub-query is used to calculate the rate of
// change for each time series in a 15-second interval:
//
// ```
// timestamp  01.00  01.05  01.10  01.15  01.20  01.25  01.30  01.35  01.40
//           +------+------+------+------+------+------+------+------+------+
//           |      |      |      |      |      |      |      |      |      |
//           |  v1  |  v2  |  v3  |  v4  |  v5  |  v6  |  v7  |  v8  |  v9  |
//           |      |      |      |      |      |      |      |      |      |
//           +------+------+------+------+------+------+------+------+------+
//
//           +------+             +------+             +------+
//       max(| v1,  |         max(| v4,  |         max(| v7,  |
//           | v2,  |             | v5,  |             | v8,  |
//           | v3   |)            | v6   |)            | v9   |)
//           +------+             +------+             +------+
//            01.00                01.15                01.30
//
//           +-------+            +-------+
//           | v6-v3 |            | v9-v6 |
//           +-------+            +-------+
//            01.00                01.15
// ```
//
// The rate of change is calculated as (Vy - Vx) / (Ty - Tx), where Vx and Vy are the
// values at times Tx and Ty respectively. In an ideal scenario, the last value of each
// interval could be used to calculate the rate of change. Instead, we use the maximum
// value of each interval, because any process restart can cause the value to be reset
// to 0, which would produce an inaccurate result. The max is the best approximation we
// can get. We don't expect the process to restart very often, so this should be a good
// approximation.
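//
// Schematically, the rate/increase variants then wrap the max-per-interval sub-query in
// a window query (a sketch; <inner sub-query> stands for the query shape shown above
// with max(value) as the aggregation):
//
//	SELECT <labels>, ts, <rateWithoutNegative expression> as per_series_value
//	FROM (<inner sub-query>)
//	WINDOW rate_window as (PARTITION BY fingerprint ORDER BY fingerprint, ts)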
func prepareTimeAggregationSubQuery(start, end, step int64, mq *v3.BuilderQuery) (string, error) {
	var subQuery string

	timeSeriesSubQuery, err := helpers.PrepareTimeseriesFilterQuery(start, end, mq)
	if err != nil {
		return "", err
	}

	samplesTableFilter := fmt.Sprintf("metric_name = %s AND unix_milli >= %d AND unix_milli < %d", utils.ClickHouseFormattedValue(mq.AggregateAttribute.Key), start, end)

	// Select the aggregate value for interval
	queryTmpl :=
		"SELECT fingerprint, %s" +
			" toStartOfInterval(toDateTime(intDiv(unix_milli, 1000)), INTERVAL %d SECOND) as ts," +
			" %s as per_series_value" +
			" FROM " + constants.SIGNOZ_METRIC_DBNAME + "." + constants.SIGNOZ_SAMPLES_V4_TABLENAME +
			" INNER JOIN" +
			" (%s) as filtered_time_series" +
			" USING fingerprint" +
			" WHERE " + samplesTableFilter +
			" GROUP BY fingerprint, ts" +
			" ORDER BY fingerprint, ts"

	selectLabelsAny := helpers.SelectLabelsAny(mq.GroupBy)
	selectLabels := helpers.SelectLabels(mq.GroupBy)

	switch mq.TimeAggregation {
	case v3.TimeAggregationAvg:
		op := "avg(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationSum:
		op := "sum(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationMin:
		op := "min(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationMax:
		op := "max(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationCount:
		op := "count(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationCountDistinct:
		op := "count(distinct(value))"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationAnyLast:
		op := "anyLast(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationRate:
		op := "max(value)"
		innerSubQuery := fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
		rateQueryTmpl :=
			"SELECT %s ts, " + rateWithoutNegative +
				" as per_series_value FROM (%s) WINDOW rate_window as (PARTITION BY fingerprint ORDER BY fingerprint, ts)"
		subQuery = fmt.Sprintf(rateQueryTmpl, selectLabels, innerSubQuery)
	case v3.TimeAggregationIncrease:
		op := "max(value)"
		innerSubQuery := fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
		rateQueryTmpl :=
			"SELECT %s ts, " + increaseWithoutNegative +
				" as per_series_value FROM (%s) WINDOW rate_window as (PARTITION BY fingerprint ORDER BY fingerprint, ts)"
		subQuery = fmt.Sprintf(rateQueryTmpl, selectLabels, innerSubQuery)
	}
	return subQuery, nil
}

// PrepareMetricQueryCumulativeTimeSeries prepares the query to be used for fetching metrics
func PrepareMetricQueryCumulativeTimeSeries(start, end, step int64, mq *v3.BuilderQuery) (string, error) {
	var query string

	temporalAggSubQuery, err := prepareTimeAggregationSubQuery(start, end, step, mq)
	if err != nil {
		return "", err
	}

	groupBy := helpers.GroupingSetsByAttributeKeyTags(mq.GroupBy...)
	orderBy := helpers.OrderByAttributeKeyTags(mq.OrderBy, mq.GroupBy)
	selectLabels := helpers.GroupByAttributeKeyTags(mq.GroupBy...)

	queryTmpl :=
		"SELECT %s," +
			" %s as value" +
			" FROM (%s)" +
			" WHERE isNaN(per_series_value) = 0" +
			" GROUP BY %s" +
			" ORDER BY %s"

	switch mq.SpaceAggregation {
	case v3.SpaceAggregationAvg:
		op := "avg(per_series_value)"
		query = fmt.Sprintf(queryTmpl, selectLabels, op, temporalAggSubQuery, groupBy, orderBy)
	case v3.SpaceAggregationSum:
		op := "sum(per_series_value)"
		query = fmt.Sprintf(queryTmpl, selectLabels, op, temporalAggSubQuery, groupBy, orderBy)
	case v3.SpaceAggregationMin:
		op := "min(per_series_value)"
		query = fmt.Sprintf(queryTmpl, selectLabels, op, temporalAggSubQuery, groupBy, orderBy)
	case v3.SpaceAggregationMax:
		op := "max(per_series_value)"
		query = fmt.Sprintf(queryTmpl, selectLabels, op, temporalAggSubQuery, groupBy, orderBy)
	case v3.SpaceAggregationCount:
		op := "count(per_series_value)"
		query = fmt.Sprintf(queryTmpl, selectLabels, op, temporalAggSubQuery, groupBy, orderBy)
	}

	return query, nil
}
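
// A minimal usage sketch (hypothetical field values; the real caller builds the
// *v3.BuilderQuery from the incoming query range request):
//
//	mq := &v3.BuilderQuery{
//		AggregateAttribute: v3.AttributeKey{Key: "http_requests_total"},
//		TimeAggregation:    v3.TimeAggregationRate,
//		SpaceAggregation:   v3.SpaceAggregationSum,
//	}
//	query, err := PrepareMetricQueryCumulativeTimeSeries(start, end, 60, mq)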