package rules

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"math"
	"net/url"
	"regexp"
	"sort"
	"sync"
	"text/template"
	"time"
	"unicode"

	"go.uber.org/zap"

	"github.com/ClickHouse/clickhouse-go/v2"
	"github.com/ClickHouse/clickhouse-go/v2/lib/driver"
	"go.signoz.io/signoz/pkg/query-service/common"
	"go.signoz.io/signoz/pkg/query-service/converter"
	"go.signoz.io/signoz/pkg/query-service/postprocess"

	"go.signoz.io/signoz/pkg/query-service/app/querier"
	querierV2 "go.signoz.io/signoz/pkg/query-service/app/querier/v2"
	"go.signoz.io/signoz/pkg/query-service/app/queryBuilder"
	"go.signoz.io/signoz/pkg/query-service/constants"
	"go.signoz.io/signoz/pkg/query-service/interfaces"
	v3 "go.signoz.io/signoz/pkg/query-service/model/v3"
	"go.signoz.io/signoz/pkg/query-service/utils/labels"
	querytemplate "go.signoz.io/signoz/pkg/query-service/utils/queryTemplate"
	"go.signoz.io/signoz/pkg/query-service/utils/times"
	"go.signoz.io/signoz/pkg/query-service/utils/timestamp"

	logsv3 "go.signoz.io/signoz/pkg/query-service/app/logs/v3"
	"go.signoz.io/signoz/pkg/query-service/formatter"

	yaml "gopkg.in/yaml.v2"
)

// ThresholdRule is an alerting rule that compares the result of a query
// against a target value and tracks the alerts produced by that comparison.
type ThresholdRule struct {
	id            string
	name          string
	source        string
	ruleCondition *RuleCondition
	evalWindow    time.Duration
	holdDuration  time.Duration
	labels        labels.Labels
	annotations   labels.Labels

	preferredChannels []string

	// mtx is held by the accessors of health, lastError, the evaluation
	// duration/timestamp and the active alert map.
	mtx                 sync.Mutex
	evaluationDuration  time.Duration
	evaluationTimestamp time.Time

	health RuleHealth

	lastError error

	// active holds the alerts currently tracked by the rule, keyed by the
	// hash of their label set.
	active map[uint64]*Alert

	version string

	// temporalityMap caches, per metric name, the temporalities seen for it
	// so that the temporality does not have to be fetched repeatedly for the
	// same metric. Querying the v4 table on the low cardinality temporality
	// column should be fast, but the query can be avoided entirely when the
	// data is already in memory.
	temporalityMap map[string]map[v3.Temporality]bool

	opts ThresholdRuleOpts

	lastTimestampWithDatapoints time.Time
	typ                         string

	querier   interfaces.Querier
	querierV2 interfaces.Querier
}

// ThresholdRuleOpts carries optional switches for a ThresholdRule.
type ThresholdRuleOpts struct {
	// SendUnmatched sends observed metric values even if they don't match
	// the rule condition. This is useful when testing the rule.
	SendUnmatched bool

	// SendAlways sends the alert irrespective of resendDelay or other
	// params.
	SendAlways bool
}

// NewThresholdRule builds a ThresholdRule with the given id from the postable
// rule p. It returns an error when p has no rule condition or the condition
// is not valid. An evaluation window of zero is replaced by five minutes.
// Both the v1 and the v2 querier are created on top of reader, without a
// cache.
func NewThresholdRule(
	id string,
	p *PostableRule,
	opts ThresholdRuleOpts,
	featureFlags interfaces.FeatureLookup,
	reader interfaces.Reader,
) (*ThresholdRule, error) {

	if p.RuleCondition == nil {
		return nil, fmt.Errorf("no rule condition")
	}
	if !p.RuleCondition.IsValid() {
		return nil, fmt.Errorf("invalid rule condition")
	}

	rule := &ThresholdRule{
		id:                id,
		name:              p.AlertName,
		source:            p.Source,
		ruleCondition:     p.RuleCondition,
		evalWindow:        time.Duration(p.EvalWindow),
		labels:            labels.FromMap(p.Labels),
		annotations:       labels.FromMap(p.Annotations),
		preferredChannels: p.PreferredChannels,
		health:            HealthUnknown,
		active:            map[uint64]*Alert{},
		opts:              opts,
		typ:               p.AlertType,
		version:           p.Version,
		temporalityMap:    make(map[string]map[v3.Temporality]bool),
	}

	// fall back to a five minute window when none was supplied
	if rule.evalWindow == 0 {
		rule.evalWindow = 5 * time.Minute
	}

	v1Options := querier.QuerierOptions{
		Reader:        reader,
		Cache:         nil,
		KeyGenerator:  queryBuilder.NewKeyGenerator(),
		FeatureLookup: featureFlags,
	}
	v2Options := querierV2.QuerierOptions{
		Reader:        reader,
		Cache:         nil,
		KeyGenerator:  queryBuilder.NewKeyGenerator(),
		FeatureLookup: featureFlags,
	}

	rule.querier = querier.NewQuerier(v1Options)
	rule.querierV2 = querierV2.NewQuerier(v2Options)

	zap.L().Info("creating new ThresholdRule", zap.String("name", rule.name), zap.String("id", rule.id))

	return rule, nil
}

// Name returns the alert name of the rule.
func (r *ThresholdRule) Name() string {
	return r.name
}

// ID returns the identifier of the rule.
func (r *ThresholdRule) ID() string {
	return r.id
}

// Condition returns the rule condition the rule evaluates.
func (r *ThresholdRule) Condition() *RuleCondition {
	return r.ruleCondition
}

// GeneratorURL returns the URL built by prepareRuleGeneratorURL from the
// rule id and source.
func (r *ThresholdRule) GeneratorURL() string {
	return prepareRuleGeneratorURL(r.ID(), r.source)
}

// PreferredChannels returns the channels configured for the rule.
func (r *ThresholdRule) PreferredChannels() []string {
	return r.preferredChannels
}

// targetVal returns the target value for the rule condition
// when the y-axis and target units are
non-empty, it // converts the target value to the y-axis unit func (r *ThresholdRule) targetVal() float64 { if r.ruleCondition == nil || r.ruleCondition.Target == nil { return 0 } // get the converter for the target unit unitConverter := converter.FromUnit(converter.Unit(r.ruleCondition.TargetUnit)) // convert the target value to the y-axis unit value := unitConverter.Convert(converter.Value{ F: *r.ruleCondition.Target, U: converter.Unit(r.ruleCondition.TargetUnit), }, converter.Unit(r.Unit())) return value.F } func (r *ThresholdRule) matchType() MatchType { if r.ruleCondition == nil { return AtleastOnce } return r.ruleCondition.MatchType } func (r *ThresholdRule) compareOp() CompareOp { if r.ruleCondition == nil { return ValueIsEq } return r.ruleCondition.CompareOp } func (r *ThresholdRule) Type() RuleType { return RuleTypeThreshold } func (r *ThresholdRule) SetLastError(err error) { r.mtx.Lock() defer r.mtx.Unlock() r.lastError = err } func (r *ThresholdRule) LastError() error { r.mtx.Lock() defer r.mtx.Unlock() return r.lastError } func (r *ThresholdRule) SetHealth(health RuleHealth) { r.mtx.Lock() defer r.mtx.Unlock() r.health = health } func (r *ThresholdRule) Health() RuleHealth { r.mtx.Lock() defer r.mtx.Unlock() return r.health } // SetEvaluationDuration updates evaluationDuration to the duration it took to evaluate the rule on its last evaluation. func (r *ThresholdRule) SetEvaluationDuration(dur time.Duration) { r.mtx.Lock() defer r.mtx.Unlock() r.evaluationDuration = dur } func (r *ThresholdRule) HoldDuration() time.Duration { return r.holdDuration } func (r *ThresholdRule) EvalWindow() time.Duration { return r.evalWindow } // Labels returns the labels of the alerting rule. func (r *ThresholdRule) Labels() labels.BaseLabels { return r.labels } // Annotations returns the annotations of the alerting rule. 
func (r *ThresholdRule) Annotations() labels.BaseLabels { return r.annotations } // GetEvaluationDuration returns the time in seconds it took to evaluate the alerting rule. func (r *ThresholdRule) GetEvaluationDuration() time.Duration { r.mtx.Lock() defer r.mtx.Unlock() return r.evaluationDuration } // SetEvaluationTimestamp updates evaluationTimestamp to the timestamp of when the rule was last evaluated. func (r *ThresholdRule) SetEvaluationTimestamp(ts time.Time) { r.mtx.Lock() defer r.mtx.Unlock() r.evaluationTimestamp = ts } // GetEvaluationTimestamp returns the time the evaluation took place. func (r *ThresholdRule) GetEvaluationTimestamp() time.Time { r.mtx.Lock() defer r.mtx.Unlock() return r.evaluationTimestamp } // State returns the maximum state of alert instances for this rule. // StateFiring > StatePending > StateInactive func (r *ThresholdRule) State() AlertState { r.mtx.Lock() defer r.mtx.Unlock() maxState := StateInactive for _, a := range r.active { if a.State > maxState { maxState = a.State } } return maxState } func (r *ThresholdRule) currentAlerts() []*Alert { r.mtx.Lock() defer r.mtx.Unlock() alerts := make([]*Alert, 0, len(r.active)) for _, a := range r.active { anew := *a alerts = append(alerts, &anew) } return alerts } func (r *ThresholdRule) ActiveAlerts() []*Alert { var res []*Alert for _, a := range r.currentAlerts() { if a.ResolvedAt.IsZero() { res = append(res, a) } } return res } func (r *ThresholdRule) FetchTemporality(ctx context.Context, metricNames []string, ch driver.Conn) (map[string]map[v3.Temporality]bool, error) { metricNameToTemporality := make(map[string]map[v3.Temporality]bool) query := fmt.Sprintf(`SELECT DISTINCT metric_name, temporality FROM %s.%s WHERE metric_name IN $1`, constants.SIGNOZ_METRIC_DBNAME, constants.SIGNOZ_TIMESERIES_v4_1DAY_TABLENAME) rows, err := ch.Query(ctx, query, metricNames) if err != nil { return nil, err } defer rows.Close() for rows.Next() { var metricName, temporality string err := 
rows.Scan(&metricName, &temporality) if err != nil { return nil, err } if _, ok := metricNameToTemporality[metricName]; !ok { metricNameToTemporality[metricName] = make(map[v3.Temporality]bool) } metricNameToTemporality[metricName][v3.Temporality(temporality)] = true } return metricNameToTemporality, nil } // populateTemporality same as addTemporality but for v4 and better func (r *ThresholdRule) populateTemporality(ctx context.Context, qp *v3.QueryRangeParamsV3, ch driver.Conn) error { missingTemporality := make([]string, 0) metricNameToTemporality := make(map[string]map[v3.Temporality]bool) if qp.CompositeQuery != nil && len(qp.CompositeQuery.BuilderQueries) > 0 { for _, query := range qp.CompositeQuery.BuilderQueries { // if there is no temporality specified in the query but we have it in the map // then use the value from the map if query.Temporality == "" && r.temporalityMap[query.AggregateAttribute.Key] != nil { // We prefer delta if it is available if r.temporalityMap[query.AggregateAttribute.Key][v3.Delta] { query.Temporality = v3.Delta } else if r.temporalityMap[query.AggregateAttribute.Key][v3.Cumulative] { query.Temporality = v3.Cumulative } else { query.Temporality = v3.Unspecified } } // we don't have temporality for this metric if query.DataSource == v3.DataSourceMetrics && query.Temporality == "" { missingTemporality = append(missingTemporality, query.AggregateAttribute.Key) } if _, ok := metricNameToTemporality[query.AggregateAttribute.Key]; !ok { metricNameToTemporality[query.AggregateAttribute.Key] = make(map[v3.Temporality]bool) } } } nameToTemporality, err := r.FetchTemporality(ctx, missingTemporality, ch) if err != nil { return err } if qp.CompositeQuery != nil && len(qp.CompositeQuery.BuilderQueries) > 0 { for name := range qp.CompositeQuery.BuilderQueries { query := qp.CompositeQuery.BuilderQueries[name] if query.DataSource == v3.DataSourceMetrics && query.Temporality == "" { if nameToTemporality[query.AggregateAttribute.Key][v3.Delta] { 
query.Temporality = v3.Delta } else if nameToTemporality[query.AggregateAttribute.Key][v3.Cumulative] { query.Temporality = v3.Cumulative } else { query.Temporality = v3.Unspecified } r.temporalityMap[query.AggregateAttribute.Key] = nameToTemporality[query.AggregateAttribute.Key] } } } return nil } // ForEachActiveAlert runs the given function on each alert. // This should be used when you want to use the actual alerts from the ThresholdRule // and not on its copy. // If you want to run on a copy of alerts then don't use this, get the alerts from 'ActiveAlerts()'. func (r *ThresholdRule) ForEachActiveAlert(f func(*Alert)) { r.mtx.Lock() defer r.mtx.Unlock() for _, a := range r.active { f(a) } } func (r *ThresholdRule) SendAlerts(ctx context.Context, ts time.Time, resendDelay time.Duration, interval time.Duration, notifyFunc NotifyFunc) { zap.L().Info("sending alerts", zap.String("rule", r.Name())) alerts := []*Alert{} r.ForEachActiveAlert(func(alert *Alert) { if r.opts.SendAlways || alert.needsSending(ts, resendDelay) { alert.LastSentAt = ts // Allow for two Eval or Alertmanager send failures. delta := resendDelay if interval > resendDelay { delta = interval } alert.ValidUntil = ts.Add(4 * delta) anew := *alert alerts = append(alerts, &anew) } else { zap.L().Debug("skipping send alert due to resend delay", zap.String("rule", r.Name()), zap.Any("alert", alert.Labels)) } }) notifyFunc(ctx, "", alerts...) } func (r *ThresholdRule) Unit() string { if r.ruleCondition != nil && r.ruleCondition.CompositeQuery != nil { return r.ruleCondition.CompositeQuery.Unit } return "" } func (r *ThresholdRule) prepareQueryRange(ts time.Time) *v3.QueryRangeParamsV3 { // todo(srikanthccv): make this configurable // 2 minutes is reasonable time to wait for data to be available // 60 seconds (SDK) + 10 seconds (batch) + rest for n/w + serialization + write to disk etc.. 
start := ts.Add(-time.Duration(r.evalWindow)).UnixMilli() - 2*60*1000 end := ts.UnixMilli() - 2*60*1000 // round to minute otherwise we could potentially miss data start = start - (start % (60 * 1000)) end = end - (end % (60 * 1000)) if r.ruleCondition.QueryType() == v3.QueryTypeClickHouseSQL { params := &v3.QueryRangeParamsV3{ Start: start, End: end, Step: int64(math.Max(float64(common.MinAllowedStepInterval(start, end)), 60)), CompositeQuery: &v3.CompositeQuery{ QueryType: r.ruleCondition.CompositeQuery.QueryType, PanelType: r.ruleCondition.CompositeQuery.PanelType, BuilderQueries: make(map[string]*v3.BuilderQuery), ClickHouseQueries: make(map[string]*v3.ClickHouseQuery), PromQueries: make(map[string]*v3.PromQuery), Unit: r.ruleCondition.CompositeQuery.Unit, }, Variables: make(map[string]interface{}, 0), NoCache: true, } querytemplate.AssignReservedVarsV3(params) for name, chQuery := range r.ruleCondition.CompositeQuery.ClickHouseQueries { if chQuery.Disabled { continue } tmpl := template.New("clickhouse-query") tmpl, err := tmpl.Parse(chQuery.Query) if err != nil { zap.L().Error("failed to parse clickhouse query to populate vars", zap.String("ruleid", r.ID()), zap.Error(err)) r.SetHealth(HealthBad) return params } var query bytes.Buffer err = tmpl.Execute(&query, params.Variables) if err != nil { zap.L().Error("failed to populate clickhouse query", zap.String("ruleid", r.ID()), zap.Error(err)) r.SetHealth(HealthBad) return params } params.CompositeQuery.ClickHouseQueries[name] = &v3.ClickHouseQuery{ Query: query.String(), Disabled: chQuery.Disabled, Legend: chQuery.Legend, } } return params } if r.ruleCondition.CompositeQuery != nil && r.ruleCondition.CompositeQuery.BuilderQueries != nil { for _, q := range r.ruleCondition.CompositeQuery.BuilderQueries { // If the step interval is less than the minimum allowed step interval, set it to the minimum allowed step interval if minStep := common.MinAllowedStepInterval(start, end); q.StepInterval < minStep { 
q.StepInterval = minStep } } } if r.ruleCondition.CompositeQuery.PanelType != v3.PanelTypeGraph { r.ruleCondition.CompositeQuery.PanelType = v3.PanelTypeGraph } // default mode return &v3.QueryRangeParamsV3{ Start: start, End: end, Step: int64(math.Max(float64(common.MinAllowedStepInterval(start, end)), 60)), CompositeQuery: r.ruleCondition.CompositeQuery, Variables: make(map[string]interface{}, 0), NoCache: true, } } // The following function is used to prepare the where clause for the query // `lbls` contains the key value pairs of the labels from the result of the query // We iterate over the where clause and replace the labels with the actual values // There are two cases: // 1. The label is present in the where clause // 2. The label is not present in the where clause // // Example for case 2: // Latency by serviceName without any filter // In this case, for each service with latency > threshold we send a notification // The expectation will be that clicking on the related traces for service A, will // take us to the traces page with the filter serviceName=A // So for all the missing labels in the where clause, we add them as key = value // // Example for case 1: // Severity text IN (WARN, ERROR) // In this case, the Severity text will appear in the `lbls` if it were part of the group // by clause, in which case we replace it with the actual value for the notification // i.e Severity text = WARN // If the Severity text is not part of the group by clause, then we add it as it is func (r *ThresholdRule) fetchFilters(selectedQuery string, lbls labels.Labels) []v3.FilterItem { var filterItems []v3.FilterItem added := make(map[string]struct{}) if r.ruleCondition.CompositeQuery.QueryType == v3.QueryTypeBuilder && r.ruleCondition.CompositeQuery.BuilderQueries[selectedQuery] != nil && r.ruleCondition.CompositeQuery.BuilderQueries[selectedQuery].Filters != nil { for _, item := range r.ruleCondition.CompositeQuery.BuilderQueries[selectedQuery].Filters.Items { exists := 
false for _, label := range lbls { if item.Key.Key == label.Name { // if the label is present in the where clause, replace it with key = value filterItems = append(filterItems, v3.FilterItem{ Key: item.Key, Operator: v3.FilterOperatorEqual, Value: label.Value, }) exists = true added[label.Name] = struct{}{} break } } if !exists { // if the label is not present in the where clause, add it as it is filterItems = append(filterItems, item) } } } // add the labels which are not present in the where clause for _, label := range lbls { if _, ok := added[label.Name]; !ok { filterItems = append(filterItems, v3.FilterItem{ Key: v3.AttributeKey{Key: label.Name}, Operator: v3.FilterOperatorEqual, Value: label.Value, }) } } return filterItems } func (r *ThresholdRule) prepareLinksToLogs(ts time.Time, lbls labels.Labels) string { selectedQuery := r.GetSelectedQuery() // TODO(srikanthccv): handle formula queries if selectedQuery < "A" || selectedQuery > "Z" { return "" } q := r.prepareQueryRange(ts) // Logs list view expects time in milliseconds tr := timeRange{ Start: q.Start, End: q.End, PageSize: 100, } options := Options{ MaxLines: 2, Format: "list", SelectColumns: []v3.AttributeKey{}, } period, _ := json.Marshal(tr) urlEncodedTimeRange := url.QueryEscape(string(period)) filterItems := r.fetchFilters(selectedQuery, lbls) urlData := urlShareableCompositeQuery{ QueryType: string(v3.QueryTypeBuilder), Builder: builderQuery{ QueryData: []v3.BuilderQuery{ { DataSource: v3.DataSourceLogs, QueryName: "A", AggregateOperator: v3.AggregateOperatorNoOp, AggregateAttribute: v3.AttributeKey{}, Filters: &v3.FilterSet{ Items: filterItems, Operator: "AND", }, Expression: "A", Disabled: false, Having: []v3.Having{}, StepInterval: 60, OrderBy: []v3.OrderBy{ { ColumnName: "timestamp", Order: "desc", }, }, }, }, QueryFormulas: make([]string, 0), }, } data, _ := json.Marshal(urlData) compositeQuery := url.QueryEscape(string(data)) optionsData, _ := json.Marshal(options) urlEncodedOptions := 
url.QueryEscape(string(optionsData)) return fmt.Sprintf("compositeQuery=%s&timeRange=%s&startTime=%d&endTime=%d&options=%s", compositeQuery, urlEncodedTimeRange, tr.Start, tr.End, urlEncodedOptions) } func (r *ThresholdRule) prepareLinksToTraces(ts time.Time, lbls labels.Labels) string { selectedQuery := r.GetSelectedQuery() // TODO(srikanthccv): handle formula queries if selectedQuery < "A" || selectedQuery > "Z" { return "" } q := r.prepareQueryRange(ts) // Traces list view expects time in nanoseconds tr := timeRange{ Start: q.Start * time.Second.Microseconds(), End: q.End * time.Second.Microseconds(), PageSize: 100, } options := Options{ MaxLines: 2, Format: "list", SelectColumns: constants.TracesListViewDefaultSelectedColumns, } period, _ := json.Marshal(tr) urlEncodedTimeRange := url.QueryEscape(string(period)) filterItems := r.fetchFilters(selectedQuery, lbls) urlData := urlShareableCompositeQuery{ QueryType: string(v3.QueryTypeBuilder), Builder: builderQuery{ QueryData: []v3.BuilderQuery{ { DataSource: v3.DataSourceTraces, QueryName: "A", AggregateOperator: v3.AggregateOperatorNoOp, AggregateAttribute: v3.AttributeKey{}, Filters: &v3.FilterSet{ Items: filterItems, Operator: "AND", }, Expression: "A", Disabled: false, Having: []v3.Having{}, StepInterval: 60, OrderBy: []v3.OrderBy{ { ColumnName: "timestamp", Order: "desc", }, }, }, }, QueryFormulas: make([]string, 0), }, } data, _ := json.Marshal(urlData) compositeQuery := url.QueryEscape(string(data)) optionsData, _ := json.Marshal(options) urlEncodedOptions := url.QueryEscape(string(optionsData)) return fmt.Sprintf("compositeQuery=%s&timeRange=%s&startTime=%d&endTime=%d&options=%s", compositeQuery, urlEncodedTimeRange, tr.Start, tr.End, urlEncodedOptions) } func (r *ThresholdRule) hostFromSource() string { parsedUrl, err := url.Parse(r.source) if err != nil { return "" } if parsedUrl.Port() != "" { return fmt.Sprintf("%s://%s:%s", parsedUrl.Scheme, parsedUrl.Hostname(), parsedUrl.Port()) } return 
fmt.Sprintf("%s://%s", parsedUrl.Scheme, parsedUrl.Hostname()) } func (r *ThresholdRule) GetSelectedQuery() string { if r.ruleCondition != nil { if r.ruleCondition.SelectedQuery != "" { return r.ruleCondition.SelectedQuery } queryNames := map[string]struct{}{} if r.ruleCondition.CompositeQuery != nil { if r.ruleCondition.QueryType() == v3.QueryTypeBuilder { for name := range r.ruleCondition.CompositeQuery.BuilderQueries { queryNames[name] = struct{}{} } } else if r.ruleCondition.QueryType() == v3.QueryTypeClickHouseSQL { for name := range r.ruleCondition.CompositeQuery.ClickHouseQueries { queryNames[name] = struct{}{} } } } // The following logic exists for backward compatibility // If there is no selected query, then // - check if F1 is present, if yes, return F1 // - else return the query with max ascii value // this logic is not really correct. we should be considering // whether the query is enabled or not. but this is a temporary // fix to support backward compatibility if _, ok := queryNames["F1"]; ok { return "F1" } keys := make([]string, 0, len(queryNames)) for k := range queryNames { keys = append(keys, k) } sort.Strings(keys) return keys[len(keys)-1] } // This should never happen return "" } func (r *ThresholdRule) buildAndRunQuery(ctx context.Context, ts time.Time, ch clickhouse.Conn) (Vector, error) { if r.ruleCondition == nil || r.ruleCondition.CompositeQuery == nil { r.SetHealth(HealthBad) r.SetLastError(fmt.Errorf("no rule condition")) return nil, fmt.Errorf("invalid rule condition") } params := r.prepareQueryRange(ts) err := r.populateTemporality(ctx, params, ch) if err != nil { r.SetHealth(HealthBad) zap.L().Error("failed to set temporality", zap.String("rule", r.Name()), zap.Error(err)) return nil, fmt.Errorf("internal error while setting temporality") } if params.CompositeQuery.QueryType == v3.QueryTypeBuilder { // check if any enrichment is required for logs if yes then enrich them if logsv3.EnrichmentRequired(params) { // Note: Sending empty 
fields key because enrichment is only needed for json // TODO: Add support for attribute enrichment later logsv3.Enrich(params, map[string]v3.AttributeKey{}) } } var results []*v3.Result var errQuriesByName map[string]error if r.version == "v4" { results, errQuriesByName, err = r.querierV2.QueryRange(ctx, params, map[string]v3.AttributeKey{}) } else { results, errQuriesByName, err = r.querier.QueryRange(ctx, params, map[string]v3.AttributeKey{}) } if err != nil { zap.L().Error("failed to get alert query result", zap.String("rule", r.Name()), zap.Error(err), zap.Any("queries", errQuriesByName)) r.SetHealth(HealthBad) return nil, fmt.Errorf("internal error while querying") } if params.CompositeQuery.QueryType == v3.QueryTypeBuilder { results, err = postprocess.PostProcessResult(results, params) if err != nil { r.SetHealth(HealthBad) zap.L().Error("failed to post process result", zap.String("rule", r.Name()), zap.Error(err)) return nil, fmt.Errorf("internal error while post processing") } } selectedQuery := r.GetSelectedQuery() var queryResult *v3.Result for _, res := range results { if res.QueryName == selectedQuery { queryResult = res break } } if queryResult != nil && len(queryResult.Series) > 0 { r.lastTimestampWithDatapoints = time.Now() } var resultVector Vector // if the data is missing for `For` duration then we should send alert if r.ruleCondition.AlertOnAbsent && r.lastTimestampWithDatapoints.Add(time.Duration(r.Condition().AbsentFor)*time.Minute).Before(time.Now()) { zap.L().Info("no data found for rule condition", zap.String("ruleid", r.ID())) lbls := labels.NewBuilder(labels.Labels{}) if !r.lastTimestampWithDatapoints.IsZero() { lbls.Set("lastSeen", r.lastTimestampWithDatapoints.Format(constants.AlertTimeFormat)) } resultVector = append(resultVector, Sample{ Metric: lbls.Labels(), IsMissing: true, }) return resultVector, nil } for _, series := range queryResult.Series { smpl, shouldAlert := r.shouldAlert(*series) if shouldAlert { resultVector = 
append(resultVector, smpl) } } return resultVector, nil } func normalizeLabelName(name string) string { // See https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels // Regular expression to match non-alphanumeric characters except underscores reg := regexp.MustCompile(`[^a-zA-Z0-9_]`) // Replace all non-alphanumeric characters except underscores with underscores normalized := reg.ReplaceAllString(name, "_") // If the first character is not a letter or an underscore, prepend an underscore if len(normalized) > 0 && !unicode.IsLetter(rune(normalized[0])) && normalized[0] != '_' { normalized = "_" + normalized } return normalized } func (r *ThresholdRule) Eval(ctx context.Context, ts time.Time, queriers *Queriers) (interface{}, error) { valueFormatter := formatter.FromUnit(r.Unit()) res, err := r.buildAndRunQuery(ctx, ts, queriers.Ch) if err != nil { r.SetHealth(HealthBad) r.SetLastError(err) zap.L().Error("failure in buildAndRunQuery", zap.String("ruleid", r.ID()), zap.Error(err)) return nil, err } r.mtx.Lock() defer r.mtx.Unlock() resultFPs := map[uint64]struct{}{} var alerts = make(map[uint64]*Alert, len(res)) for _, smpl := range res { l := make(map[string]string, len(smpl.Metric)) for _, lbl := range smpl.Metric { l[lbl.Name] = lbl.Value } value := valueFormatter.Format(smpl.V, r.Unit()) threshold := valueFormatter.Format(r.targetVal(), r.Unit()) zap.L().Debug("Alert template data for rule", zap.String("name", r.Name()), zap.String("formatter", valueFormatter.Name()), zap.String("value", value), zap.String("threshold", threshold)) tmplData := AlertTemplateData(l, value, threshold) // Inject some convenience variables that are easier to remember for users // who are not used to Go's templating system. 
defs := "{{$labels := .Labels}}{{$value := .Value}}{{$threshold := .Threshold}}" // utility function to apply go template on labels and annotations expand := func(text string) string { tmpl := NewTemplateExpander( ctx, defs+text, "__alert_"+r.Name(), tmplData, times.Time(timestamp.FromTime(ts)), nil, ) result, err := tmpl.Expand() if err != nil { result = fmt.Sprintf("", err) zap.L().Error("Expanding alert template failed", zap.Error(err), zap.Any("data", tmplData)) } return result } lb := labels.NewBuilder(smpl.Metric).Del(labels.MetricNameLabel).Del(labels.TemporalityLabel) for _, l := range r.labels { lb.Set(l.Name, expand(l.Value)) } lb.Set(labels.AlertNameLabel, r.Name()) lb.Set(labels.AlertRuleIdLabel, r.ID()) lb.Set(labels.RuleSourceLabel, r.GeneratorURL()) annotations := make(labels.Labels, 0, len(r.annotations)) for _, a := range r.annotations { if smpl.IsMissing { if a.Name == labels.AlertDescriptionLabel || a.Name == labels.AlertSummaryLabel { a.Value = labels.AlertMissingData } } annotations = append(annotations, labels.Label{Name: normalizeLabelName(a.Name), Value: expand(a.Value)}) } // Links with timestamps should go in annotations since labels // is used alert grouping, and we want to group alerts with the same // label set, but different timestamps, together. 
if r.typ == "TRACES_BASED_ALERT" { link := r.prepareLinksToTraces(ts, smpl.MetricOrig) if link != "" && r.hostFromSource() != "" { annotations = append(annotations, labels.Label{Name: "related_traces", Value: fmt.Sprintf("%s/traces-explorer?%s", r.hostFromSource(), link)}) } } else if r.typ == "LOGS_BASED_ALERT" { link := r.prepareLinksToLogs(ts, smpl.MetricOrig) if link != "" && r.hostFromSource() != "" { annotations = append(annotations, labels.Label{Name: "related_logs", Value: fmt.Sprintf("%s/logs/logs-explorer?%s", r.hostFromSource(), link)}) } } lbs := lb.Labels() h := lbs.Hash() resultFPs[h] = struct{}{} if _, ok := alerts[h]; ok { zap.L().Error("the alert query returns duplicate records", zap.String("ruleid", r.ID()), zap.Any("alert", alerts[h])) err = fmt.Errorf("duplicate alert found, vector contains metrics with the same labelset after applying alert labels") // We have already acquired the lock above hence using SetHealth and // SetLastError will deadlock. r.health = HealthBad r.lastError = err return nil, err } alerts[h] = &Alert{ Labels: lbs, Annotations: annotations, ActiveAt: ts, State: StatePending, Value: smpl.V, GeneratorURL: r.GeneratorURL(), Receivers: r.preferredChannels, } } zap.L().Info("alerts found", zap.String("name", r.Name()), zap.Int("count", len(alerts))) // alerts[h] is ready, add or update active list now for h, a := range alerts { // Check whether we already have alerting state for the identifying label set. // Update the last value and annotations if so, create a new alert entry otherwise. if alert, ok := r.active[h]; ok && alert.State != StateInactive { alert.Value = a.Value alert.Annotations = a.Annotations alert.Receivers = r.preferredChannels continue } r.active[h] = a } // Check if any pending alerts should be removed or fire now. Write out alert timeseries. 
for fp, a := range r.active { if _, ok := resultFPs[fp]; !ok { // If the alert was previously firing, keep it around for a given // retention time so it is reported as resolved to the AlertManager. if a.State == StatePending || (!a.ResolvedAt.IsZero() && ts.Sub(a.ResolvedAt) > resolvedRetention) { delete(r.active, fp) } if a.State != StateInactive { a.State = StateInactive a.ResolvedAt = ts } continue } if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration { a.State = StateFiring a.FiredAt = ts } } r.health = HealthGood r.lastError = err return len(r.active), nil } func (r *ThresholdRule) String() string { ar := PostableRule{ AlertName: r.name, RuleCondition: r.ruleCondition, EvalWindow: Duration(r.evalWindow), Labels: r.labels.Map(), Annotations: r.annotations.Map(), PreferredChannels: r.preferredChannels, } byt, err := yaml.Marshal(ar) if err != nil { return fmt.Sprintf("error marshaling alerting rule: %s", err.Error()) } return string(byt) } func removeGroupinSetPoints(series v3.Series) []v3.Point { var result []v3.Point for _, s := range series.Points { if s.Timestamp >= 0 && !math.IsNaN(s.Value) && !math.IsInf(s.Value, 0) { result = append(result, s) } } return result } func (r *ThresholdRule) shouldAlert(series v3.Series) (Sample, bool) { var alertSmpl Sample var shouldAlert bool var lbls labels.Labels var lblsNormalized labels.Labels for name, value := range series.Labels { lbls = append(lbls, labels.Label{Name: name, Value: value}) lblsNormalized = append(lblsNormalized, labels.Label{Name: normalizeLabelName(name), Value: value}) } series.Points = removeGroupinSetPoints(series) // nothing to evaluate if len(series.Points) == 0 { return alertSmpl, false } switch r.matchType() { case AtleastOnce: // If any sample matches the condition, the rule is firing. 
if r.compareOp() == ValueIsAbove { for _, smpl := range series.Points { if smpl.Value > r.targetVal() { alertSmpl = Sample{Point: Point{V: smpl.Value}, Metric: lblsNormalized, MetricOrig: lbls} shouldAlert = true break } } } else if r.compareOp() == ValueIsBelow { for _, smpl := range series.Points { if smpl.Value < r.targetVal() { alertSmpl = Sample{Point: Point{V: smpl.Value}, Metric: lblsNormalized, MetricOrig: lbls} shouldAlert = true break } } } else if r.compareOp() == ValueIsEq { for _, smpl := range series.Points { if smpl.Value == r.targetVal() { alertSmpl = Sample{Point: Point{V: smpl.Value}, Metric: lblsNormalized, MetricOrig: lbls} shouldAlert = true break } } } else if r.compareOp() == ValueIsNotEq { for _, smpl := range series.Points { if smpl.Value != r.targetVal() { alertSmpl = Sample{Point: Point{V: smpl.Value}, Metric: lblsNormalized, MetricOrig: lbls} shouldAlert = true break } } } case AllTheTimes: // If all samples match the condition, the rule is firing. shouldAlert = true alertSmpl = Sample{Point: Point{V: r.targetVal()}, Metric: lblsNormalized, MetricOrig: lbls} if r.compareOp() == ValueIsAbove { for _, smpl := range series.Points { if smpl.Value <= r.targetVal() { shouldAlert = false break } } } else if r.compareOp() == ValueIsBelow { for _, smpl := range series.Points { if smpl.Value >= r.targetVal() { shouldAlert = false break } } } else if r.compareOp() == ValueIsEq { for _, smpl := range series.Points { if smpl.Value != r.targetVal() { shouldAlert = false break } } } else if r.compareOp() == ValueIsNotEq { for _, smpl := range series.Points { if smpl.Value == r.targetVal() { shouldAlert = false break } } } case OnAverage: // If the average of all samples matches the condition, the rule is firing. 
var sum, count float64 for _, smpl := range series.Points { if math.IsNaN(smpl.Value) || math.IsInf(smpl.Value, 0) { continue } sum += smpl.Value count++ } avg := sum / count alertSmpl = Sample{Point: Point{V: avg}, Metric: lblsNormalized, MetricOrig: lbls} if r.compareOp() == ValueIsAbove { if avg > r.targetVal() { shouldAlert = true } } else if r.compareOp() == ValueIsBelow { if avg < r.targetVal() { shouldAlert = true } } else if r.compareOp() == ValueIsEq { if avg == r.targetVal() { shouldAlert = true } } else if r.compareOp() == ValueIsNotEq { if avg != r.targetVal() { shouldAlert = true } } case InTotal: // If the sum of all samples matches the condition, the rule is firing. var sum float64 for _, smpl := range series.Points { if math.IsNaN(smpl.Value) || math.IsInf(smpl.Value, 0) { continue } sum += smpl.Value } alertSmpl = Sample{Point: Point{V: sum}, Metric: lblsNormalized, MetricOrig: lbls} if r.compareOp() == ValueIsAbove { if sum > r.targetVal() { shouldAlert = true } } else if r.compareOp() == ValueIsBelow { if sum < r.targetVal() { shouldAlert = true } } else if r.compareOp() == ValueIsEq { if sum == r.targetVal() { shouldAlert = true } } else if r.compareOp() == ValueIsNotEq { if sum != r.targetVal() { shouldAlert = true } } } return alertSmpl, shouldAlert }