package cumulative

import (
	"fmt"

	"go.signoz.io/signoz/pkg/query-service/app/metrics/v4/helpers"
	"go.signoz.io/signoz/pkg/query-service/constants"
	v3 "go.signoz.io/signoz/pkg/query-service/model/v3"
	"go.signoz.io/signoz/pkg/query-service/utils"
)

// See https://clickhouse.com/docs/en/sql-reference/window-functions for more details on the `lagInFrame` function.
//
// Calculating the rate of change of a metric is a common use case.
// Requests and errors are two examples of metrics that are often expressed as a rate of change.
// The rate of change is the difference between the current value and the previous value divided by
// the time difference between the current and previous values (i.e. the time interval).
//
// The value of a cumulative counter always increases. However, the rate of change can be negative
// if the value decreases between two samples. This can happen if the counter is reset when the
// application restarts or if the counter is reset manually. In this case, the rate of change is
// not meaningful and should be ignored.
//
// The condition `(per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window) < 0`
// checks if the rate of change is negative. If it is negative, the value is replaced with `nan`.
//
// The condition `(ts - lagInFrame(ts, 1, toDate('1970-01-01')) OVER rate_window) >= 86400` checks
// if the time difference between the current and previous values is greater than or equal to 1 day.
// The first sample of a metric is always `nan` because there is no previous value to compare it to.
// When the first sample is encountered, the previous timestamp defaults to `1970-01-01`.
// Since any difference between the first sample's timestamp and that default will be
// greater than or equal to 1 day, the rate of change for the first sample will be `nan`.
//
// If neither of the above conditions is true, the rate of change is calculated as
// `(per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window) / (ts - lagInFrame(ts, 1, toDate('1970-01-01')) OVER rate_window)`
// where `rate_window` is a window that partitions the data by fingerprint and orders it by timestamp.
// We want to calculate the rate of change for each time series, so we partition the data by fingerprint.
//
// The `increase` function is similar to the `rate` function, except that it does not divide by the time interval.
const (
	rateWithoutNegative     = `If((per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window) < 0, nan, If((ts - lagInFrame(ts, 1, toDate('1970-01-01')) OVER rate_window) >= 86400, nan, (per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window) / (ts - lagInFrame(ts, 1, toDate('1970-01-01')) OVER rate_window)))`
	increaseWithoutNegative = `If((per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window) < 0, nan, If((ts - lagInFrame(ts, 1, toDate('1970-01-01')) OVER rate_window) >= 86400, nan, (per_series_value - lagInFrame(per_series_value, 1, 0) OVER rate_window)))`
)
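
// For example, for one series with per-interval samples (ts, per_series_value) of
// (01:00:00, 100), (01:01:00, 160), (01:02:00, 40), the rate expression above evaluates to:
//
//	01:00:00 -> nan  (no previous sample; the default 1970-01-01 timestamp trips the 1-day guard)
//	01:01:00 -> (160 - 100) / 60 = 1 per second
//	01:02:00 -> nan  (40 - 160 < 0, i.e. the counter was reset)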

// prepareTimeAggregationSubQuery prepares the sub-query to be used for temporal
// aggregation of time series data.
//
// The following example illustrates how the sub-query is used to calculate the sum of
// values for each time series in a 15-second interval:
//
// ```
// timestamp  01.00  01.05  01.10  01.15  01.20  01.25  01.30  01.35  01.40
//           +------+------+------+------+------+------+------+------+------+
//           |      |      |      |      |      |      |      |      |      |
//           |  v1  |  v2  |  v3  |  v4  |  v5  |  v6  |  v7  |  v8  |  v9  |
//           |      |      |      |      |      |      |      |      |      |
//           +------+------+------+------+------+------+------+------+------+
//
//           +------+             +------+             +------+
//           | v1+  |             | v4+  |             | v7+  |
//           | v2+  |             | v5+  |             | v8+  |
//           | v3   |             | v6   |             | v9   |
//           +------+             +------+             +------+
//            01.00                01.15                01.30
// ```
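//
// Concretely, for a sum temporal aggregation with step = 15, the generated sub-query
// looks roughly like the following (a schematic sketch: label columns are elided, the
// metric name is a made-up example, and the table names assume the default constants):
//
//	SELECT fingerprint,
//	       toStartOfInterval(toDateTime(intDiv(unix_milli, 1000)), INTERVAL 15 SECOND) as ts,
//	       sum(value) as per_series_value
//	FROM signoz_metrics.distributed_samples_v4
//	INNER JOIN (<filtered time series sub-query>) as filtered_time_series USING fingerprint
//	WHERE metric_name = 'http_requests_total' AND unix_milli >= <start> AND unix_milli < <end>
//	GROUP BY fingerprint, ts
//	ORDER BY fingerprint, ts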
//
// Calculating the rate/increase involves an additional step. We first calculate the
// maximum value for each time series in a 15-second interval. Then, we calculate the
// difference between the current maximum value and the previous maximum value.
//
// The following example illustrates how the sub-query is used to calculate the rate of
// change for each time series in a 15-second interval:
//
// ```
// timestamp  01.00  01.05  01.10  01.15  01.20  01.25  01.30  01.35  01.40
//           +------+------+------+------+------+------+------+------+------+
//           |      |      |      |      |      |      |      |      |      |
//           |  v1  |  v2  |  v3  |  v4  |  v5  |  v6  |  v7  |  v8  |  v9  |
//           |      |      |      |      |      |      |      |      |      |
//           +------+------+------+------+------+------+------+------+------+
//
//           +------+             +------+             +------+
//       max(| v1,  |         max(| v4,  |         max(| v7,  |
//           | v2,  |             | v5,  |             | v8,  |
//           | v3   |)            | v6   |)            | v9   |)
//           +------+             +------+             +------+
//            01.00                01.15                01.30
//
//           +-------+            +-------+
//           | v6-v3 |            | v9-v6 |
//           +-------+            +-------+
//            01.00                01.15
// ```
//
// The rate of change is calculated as (Vy - Vx) / (Ty - Tx), where Vx and Vy are the
// values at times Tx and Ty respectively. In an ideal scenario, the last value of each
// interval could be used to calculate the rate of change. Instead, we use the maximum
// value of each interval, because any process restart can cause the value to be reset
// to 0, which would produce an inaccurate result. The max is the best approximation we
// can get. We don't expect the process to restart very often, so this should be a good
// approximation.
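//
// Schematically, the rate/increase variants then wrap the max-per-interval sub-query in
// a window query (a sketch; <inner sub-query> stands for the query shape shown above
// with max(value) as the aggregation):
//
//	SELECT <labels>, ts, <rateWithoutNegative expression> as per_series_value
//	FROM (<inner sub-query>)
//	WINDOW rate_window as (PARTITION BY fingerprint ORDER BY fingerprint, ts)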
func prepareTimeAggregationSubQuery(start, end, step int64, mq *v3.BuilderQuery) (string, error) {
	var subQuery string

	timeSeriesSubQuery, err := helpers.PrepareTimeseriesFilterQuery(start, end, mq)
	if err != nil {
		return "", err
	}

	samplesTableFilter := fmt.Sprintf("metric_name = %s AND unix_milli >= %d AND unix_milli < %d", utils.ClickHouseFormattedValue(mq.AggregateAttribute.Key), start, end)

	// Select the aggregate value for interval
	queryTmpl :=
		"SELECT fingerprint, %s" +
			" toStartOfInterval(toDateTime(intDiv(unix_milli, 1000)), INTERVAL %d SECOND) as ts," +
			" %s as per_series_value" +
			" FROM " + constants.SIGNOZ_METRIC_DBNAME + "." + constants.SIGNOZ_SAMPLES_V4_TABLENAME +
			" INNER JOIN" +
			" (%s) as filtered_time_series" +
			" USING fingerprint" +
			" WHERE " + samplesTableFilter +
			" GROUP BY fingerprint, ts" +
			" ORDER BY fingerprint, ts"

	selectLabelsAny := helpers.SelectLabelsAny(mq.GroupBy)
	selectLabels := helpers.SelectLabels(mq.GroupBy)

	switch mq.TimeAggregation {
	case v3.TimeAggregationAvg:
		op := "avg(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationSum:
		op := "sum(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationMin:
		op := "min(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationMax:
		op := "max(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationCount:
		op := "count(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationCountDistinct:
		op := "count(distinct(value))"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationAnyLast:
		op := "anyLast(value)"
		subQuery = fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
	case v3.TimeAggregationRate:
		op := "max(value)"
		innerSubQuery := fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
		rateQueryTmpl :=
			"SELECT %s ts, " + rateWithoutNegative +
				" as per_series_value FROM (%s) WINDOW rate_window as (PARTITION BY fingerprint ORDER BY fingerprint, ts)"
		subQuery = fmt.Sprintf(rateQueryTmpl, selectLabels, innerSubQuery)
	case v3.TimeAggregationIncrease:
		op := "max(value)"
		innerSubQuery := fmt.Sprintf(queryTmpl, selectLabelsAny, step, op, timeSeriesSubQuery)
		rateQueryTmpl :=
			"SELECT %s ts, " + increaseWithoutNegative +
				" as per_series_value FROM (%s) WINDOW rate_window as (PARTITION BY fingerprint ORDER BY fingerprint, ts)"
		subQuery = fmt.Sprintf(rateQueryTmpl, selectLabels, innerSubQuery)
	}
	return subQuery, nil
}

// PrepareMetricQueryCumulativeTimeSeries prepares the query to be used for fetching metrics
func PrepareMetricQueryCumulativeTimeSeries(start, end, step int64, mq *v3.BuilderQuery) (string, error) {
	var query string

	temporalAggSubQuery, err := prepareTimeAggregationSubQuery(start, end, step, mq)
	if err != nil {
		return "", err
	}

	groupBy := helpers.GroupingSetsByAttributeKeyTags(mq.GroupBy...)
	orderBy := helpers.OrderByAttributeKeyTags(mq.OrderBy, mq.GroupBy)
	selectLabels := helpers.GroupByAttributeKeyTags(mq.GroupBy...)

	queryTmpl :=
		"SELECT %s," +
			" %s as value" +
			" FROM (%s)" +
			" WHERE isNaN(per_series_value) = 0" +
			" GROUP BY %s" +
			" ORDER BY %s"

	switch mq.SpaceAggregation {
	case v3.SpaceAggregationAvg:
		op := "avg(per_series_value)"
		query = fmt.Sprintf(queryTmpl, selectLabels, op, temporalAggSubQuery, groupBy, orderBy)
	case v3.SpaceAggregationSum:
		op := "sum(per_series_value)"
		query = fmt.Sprintf(queryTmpl, selectLabels, op, temporalAggSubQuery, groupBy, orderBy)
	case v3.SpaceAggregationMin:
		op := "min(per_series_value)"
		query = fmt.Sprintf(queryTmpl, selectLabels, op, temporalAggSubQuery, groupBy, orderBy)
	case v3.SpaceAggregationMax:
		op := "max(per_series_value)"
		query = fmt.Sprintf(queryTmpl, selectLabels, op, temporalAggSubQuery, groupBy, orderBy)
	case v3.SpaceAggregationCount:
		op := "count(per_series_value)"
		query = fmt.Sprintf(queryTmpl, selectLabels, op, temporalAggSubQuery, groupBy, orderBy)
	}

	return query, nil
}
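
// A minimal usage sketch (hypothetical field values; the real caller builds the
// *v3.BuilderQuery from the incoming query range request):
//
//	mq := &v3.BuilderQuery{
//		AggregateAttribute: v3.AttributeKey{Key: "http_requests_total"},
//		TimeAggregation:    v3.TimeAggregationRate,
//		SpaceAggregation:   v3.SpaceAggregationSum,
//	}
//	query, err := PrepareMetricQueryCumulativeTimeSeries(start, end, 60, mq)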