package cumulative

import (
	"fmt"

	"go.signoz.io/signoz/pkg/query-service/app/metrics/v4/helpers"
	"go.signoz.io/signoz/pkg/query-service/constants"
	v3 "go.signoz.io/signoz/pkg/query-service/model/v3"
	"go.signoz.io/signoz/pkg/query-service/utils"
)

// See https://clickhouse.com/docs/en/sql-reference/window-functions for more details on the `lagInFrame` function
//
// Calculating the rate of change of a metric is a common use case.
// Requests and errors are two examples of metrics that are often expressed as a rate of change.
// The rate of change is the difference between the current value and the previous value divided by
// the time difference between the current and previous values (i.e. the time interval).
//
// The value of a cumulative counter always increases. However, the rate of change can be negative
// if the value decreases between two samples. This can happen if the counter is reset when the
// application restarts or if the counter is reset manually. In this case, the rate of change is
// not meaningful and should be ignored.
//
// The condition `(per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window) < 0`
// checks if the rate of change is negative. If it is negative, the value is replaced with `nan`.
//
// The condition `(ts - lagInFrame(ts, 1, toDate('1970-01-01')) OVER rate_window) >= 86400` checks
// if the time difference between the current and previous values is greater than or equal to 1 day.
// The first sample of a metric is always `nan` because there is no previous value to compare it to.
// When the first sample is encountered, the previous timestamp defaults to `1970-01-01`.
// Since the difference between the first sample's timestamp and that default will always be
// greater than or equal to 1 day, the rate of change for the first sample is `nan`.
//
// If neither of the above conditions is true, the rate of change is calculated as
// `(per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window) / (ts - lagInFrame(ts, 1, toDate('1970-01-01')) OVER rate_window)`,
// where `rate_window` is a window that partitions the data by fingerprint and orders it by timestamp.
// We want to calculate the rate of change for each time series, so we partition the data by fingerprint.
//
// The `increase` function is similar to the `rate` function, except that it does not divide by the time interval.
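//
// To make the guarded expression concrete, here is a plain-Go sketch of the
// same logic for one pair of adjacent samples in a series. It is purely
// illustrative (`rateOrNaN` is a hypothetical name, not part of this package),
// uses math.NaN from the standard library, and assumes `ts` values are unix
// timestamps in seconds, as in the query above:
//
//	// rateOrNaN mirrors rateWithoutNegative: NaN on a counter reset,
//	// NaN when the gap is a day or more (which also covers the first
//	// sample, whose previous timestamp defaults to 1970-01-01), and
//	// the delta divided by the elapsed seconds otherwise.
//	func rateOrNaN(prevTs, ts int64, prevValue, value float64) float64 {
//		delta := value - prevValue
//		if delta < 0 {
//			return math.NaN() // counter reset
//		}
//		if ts-prevTs >= 86400 {
//			return math.NaN() // first sample or a gap of >= 1 day
//		}
//		return delta / float64(ts-prevTs)
//	}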
const (
	rateWithoutNegative = `If((per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window) < 0, nan, If((ts - lagInFrame(ts, 1, toDate('1970-01-01')) OVER rate_window) >= 86400, nan, (per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window) / (ts - lagInFrame(ts, 1, toDate('1970-01-01')) OVER rate_window)))`

	increaseWithoutNegative = `If((per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window) < 0, nan, If((ts - lagInFrame(ts, 1, toDate('1970-01-01')) OVER rate_window) >= 86400, nan, (per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window)))`
)

// prepareTimeAggregationSubQuery prepares the sub-query to be used for temporal aggregation
// of time series data.
// The following example illustrates how the sub-query is used to calculate the sum of values for each
// time series in a 15-second interval:
// ```
// timestamp  01.00  01.05  01.10  01.15  01.20  01.25  01.30  01.35  01.40
//           +------+------+------+------+------+------+------+------+------+
//           |      |      |      |      |      |      |      |      |      |
//           |  v1  |  v2  |  v3  |  v4  |  v5  |  v6  |  v7  |  v8  |  v9  |
//           |      |      |      |      |      |      |      |      |      |
//           +------+------+------+------+------+------+------+------+------+
//           |             |             |
//           +------+      +------+      +------+
//           | v1+  |      | v4+  |      | v7+  |
//           | v2+  |      | v5+  |      | v8+  |
//           | v3   |      | v6   |      | v9   |
//           +------+      +------+      +------+
//            01.00         01.15         01.30
// ```
// Calculating the rate/increase involves an additional step. We first calculate the maximum value
// for each time series in a 15-second interval. Then, we calculate the difference between the current
// maximum value and the previous maximum value.
// The following example illustrates how the sub-query is used to calculate the rate of change for each
// time series in a 15-second interval:
// ```
// timestamp  01.00  01.05  01.10  01.15  01.20  01.25  01.30  01.35  01.40
//           +------+------+------+------+------+------+------+------+------+
//           |      |      |      |      |      |      |      |      |      |
//           |  v1  |  v2  |  v3  |  v4  |  v5  |  v6  |  v7  |  v8  |  v9  |
//           |      |      |      |      |      |      |      |      |      |
//           +------+------+------+------+------+------+------+------+------+
//           |             |             |
//           +------+      +------+      +------+
//      max( | v1,  | max( | v4,  | max( | v7,  |
//           | v2,  |      | v5,  |      | v8,  |
//           | v3   | )    | v6   | )    | v9   | )
//           +------+      +------+      +------+
//            01.00         01.15         01.30
//
//                  +-------+      +-------+
//                  | v6-v3 |      | v9-v6 |
//                  +-------+      +-------+
//                   01.15          01.30
// ```
// The rate of change is calculated as (Vy - Vx) / (Ty - Tx) where Vx and Vy are the values at time Tx and Ty respectively.
// In an ideal scenario, the last value of each interval could be used to calculate the rate of change.
// Instead, we use the maximum value of each interval, because a process restart can reset the counter
// to 0 mid-interval, which would make the last value inaccurate. The max is the best approximation we
// can get. We don't expect the process to restart very often, so this should be a good approximation.
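//
// As a concrete worked example (with hypothetical numbers): suppose the
// per-interval maxima for three consecutive 15-second intervals are 15, 30,
// and 40. The increase for the second and third intervals is 30 - 15 = 15 and
// 40 - 30 = 10, and the rate divides each difference by the 15-second gap
// between interval timestamps: 15 / 15 = 1.0 and 10 / 15 ≈ 0.67 per second.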
func prepareTimeAggregationSubQuery(start, end, step int64, mq *v3.BuilderQuery) (string, error) {
	var subQuery string

	timeSeriesSubQuery, err := helpers.PrepareTimeseriesFilterQuery(start, end, mq)
	if err != nil {
		return "", err
	}

	samplesTableFilter := fmt.Sprintf("metric_name = %s AND unix_milli >= %d AND unix_milli < %d", utils.ClickHouseFormattedValue(mq.AggregateAttribute.Key), start, end)

	// Select the aggregate value for interval
	queryTmpl :=
		"SELECT fingerprint, %s" +
			" toStartOfInterval(toDateTime(intDiv(unix_milli, 1000)), INTERVAL %d SECOND) as ts," +
			" %s as per_series_value" +
			" FROM " + constants.SIGNOZ_METRIC_DBNAME + "." + constants.SIGNOZ_SAMPLES_V4_TABLENAME +
			" INNER JOIN" +
			" (%s) as filtered_time_series" +
			" USING fingerprint" +
			" WHERE " + samplesTableFilter +
			" GROUP BY fingerprint, ts" +
			" ORDER BY fingerprint, ts"

	selectLabelsAny := helpers.SelectLabelsAny(mq.GroupBy)
	selectLabels := helpers.SelectLabels(mq.GroupBy)

	switch mq.TimeAggregation {
	case v3.TimeAggregationAvg:
		op := "avg(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationSum:
		op := "sum(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationMin:
		op := "min(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationMax:
		op := "max(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationCount:
		op := "count(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationCountDistinct:
		op := "count(distinct(value))"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationAnyLast:
		op := "anyLast(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationRate:
		op := "max(value)"
		innerSubQuery := fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
		rateQueryTmpl :=
			"SELECT %s ts, " + rateWithoutNegative +
				" as per_series_value FROM (%s) WINDOW rate_window as (PARTITION BY fingerprint ORDER BY fingerprint, ts)"
		subQuery = fmt.Sprintf(rateQueryTmpl, selectLabels, innerSubQuery)
	case v3.TimeAggregationIncrease:
		op := "max(value)"
		innerSubQuery := fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
		rateQueryTmpl :=
			"SELECT %s ts, " + increaseWithoutNegative +
				" as per_series_value FROM (%s) WINDOW rate_window as (PARTITION BY fingerprint ORDER BY fingerprint, ts)"
		subQuery = fmt.Sprintf(rateQueryTmpl, selectLabels, innerSubQuery)
	}
	return subQuery, nil
}

// PrepareMetricQueryCumulativeTimeSeries prepares the query to be used for fetching metrics
func PrepareMetricQueryCumulativeTimeSeries(start, end, step int64, mq *v3.BuilderQuery) (string, error) {
	var query string

	temporalAggSubQuery, err := prepareTimeAggregationSubQuery(start, end, step, mq)
	if err != nil {
		return "", err
	}

	groupBy := helpers.GroupingSetsByAttributeKeyTags(mq.GroupBy...)
	orderBy := helpers.OrderByAttributeKeyTags(mq.OrderBy, mq.GroupBy)
	selectLabels := helpers.GroupByAttributeKeyTags(mq.GroupBy...)
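
	// For illustration only: with GroupBy = [service_name] and
	// SpaceAggregation = sum, the assembled outer query has roughly this
	// shape (a sketch, not verbatim output; the exact select, grouping,
	// and ordering clauses come from the helpers above):
	//
	//   SELECT service_name, ts, sum(per_series_value) as value
	//   FROM (<temporal aggregation sub-query>)
	//   WHERE isNaN(per_series_value) = 0
	//   GROUP BY service_name, ts
	//   ORDER BY service_name, ts
	//
	// The isNaN filter drops the counter-reset and first-sample rows that
	// rateWithoutNegative/increaseWithoutNegative marked as nan.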
queryTmpl := "SELECT %s," + " %s as value" + " FROM (%s)" + " WHERE isNaN(per_series_value) = 0" + " GROUP BY %s" + " ORDER BY %s" switch mq.SpaceAggregation { case v3.SpaceAggregationAvg: op := "avg(per_series_value)" query = fmt.Sprintf(queryTmpl, selectLabels, op, temporalAggSubQuery, groupBy, orderBy) case v3.SpaceAggregationSum: op := "sum(per_series_value)" query = fmt.Sprintf(queryTmpl, selectLabels, op, temporalAggSubQuery, groupBy, orderBy) case v3.SpaceAggregationMin: op := "min(per_series_value)" query = fmt.Sprintf(queryTmpl, selectLabels, op, temporalAggSubQuery, groupBy, orderBy) case v3.SpaceAggregationMax: op := "max(per_series_value)" query = fmt.Sprintf(queryTmpl, selectLabels, op, temporalAggSubQuery, groupBy, orderBy) case v3.SpaceAggregationCount: op := "count(per_series_value)" query = fmt.Sprintf(queryTmpl, selectLabels, op, temporalAggSubQuery, groupBy, orderBy) } return query, nil }