937 lines
24 KiB
Go
937 lines
24 KiB
Go
|
package rules
|
||
|
|
||
|
import (
|
||
|
"context"
|
||
|
"encoding/json"
|
||
|
"fmt"
|
||
|
"sort"
|
||
|
"strconv"
|
||
|
"strings"
|
||
|
"sync"
|
||
|
"time"
|
||
|
|
||
|
"github.com/google/uuid"
|
||
|
|
||
|
"github.com/go-kit/log"
|
||
|
|
||
|
"go.uber.org/zap"
|
||
|
|
||
|
"errors"
|
||
|
|
||
|
"github.com/jmoiron/sqlx"
|
||
|
|
||
|
// opentracing "github.com/opentracing/opentracing-go"
|
||
|
am "go.signoz.io/signoz/pkg/query-service/integrations/alertManager"
|
||
|
"go.signoz.io/signoz/pkg/query-service/interfaces"
|
||
|
"go.signoz.io/signoz/pkg/query-service/model"
|
||
|
v3 "go.signoz.io/signoz/pkg/query-service/model/v3"
|
||
|
"go.signoz.io/signoz/pkg/query-service/telemetry"
|
||
|
"go.signoz.io/signoz/pkg/query-service/utils/labels"
|
||
|
)
|
||
|
|
||
|
// taskNamesuffix is the fixed suffix handed to newTask, together with the
// task name, for every rule task built by prepareTask.
const taskNamesuffix = "webAppEditor"
|
||
|
|
||
|
// ruleIdFromTaskName recovers the rule id from a task name by returning the
// text that precedes the first "-groupname" marker. A name that does not
// contain the marker is returned unchanged.
func ruleIdFromTaskName(n string) string {
	id, _, _ := strings.Cut(n, "-groupname")
	return id
}
|
||
|
|
||
|
// prepareTaskName builds the task name for a rule id by appending
// "-groupname" to the id's default textual form. Integer and string ids are
// the expected inputs; any other value is rendered with its default format.
func prepareTaskName(ruleId interface{}) string {
	return fmt.Sprint(ruleId) + "-groupname"
}
|
||
|
|
||
|
// ManagerOptions bundles options for the Manager.
|
||
|
type ManagerOptions struct {
|
||
|
NotifierOpts am.NotifierOptions
|
||
|
Queriers *Queriers
|
||
|
|
||
|
// RepoURL is used to generate a backlink in sent alert messages
|
||
|
RepoURL string
|
||
|
|
||
|
// rule db conn
|
||
|
DBConn *sqlx.DB
|
||
|
|
||
|
Context context.Context
|
||
|
Logger log.Logger
|
||
|
ResendDelay time.Duration
|
||
|
DisableRules bool
|
||
|
FeatureFlags interfaces.FeatureLookup
|
||
|
Reader interfaces.Reader
|
||
|
}
|
||
|
|
||
|
// The Manager manages recording and alerting rules.
|
||
|
type Manager struct {
|
||
|
opts *ManagerOptions
|
||
|
tasks map[string]Task
|
||
|
rules map[string]Rule
|
||
|
mtx sync.RWMutex
|
||
|
block chan struct{}
|
||
|
// Notifier sends messages through alert manager
|
||
|
notifier *am.Notifier
|
||
|
|
||
|
// datastore to store alert definitions
|
||
|
ruleDB RuleDB
|
||
|
|
||
|
logger log.Logger
|
||
|
|
||
|
featureFlags interfaces.FeatureLookup
|
||
|
reader interfaces.Reader
|
||
|
}
|
||
|
|
||
|
func defaultOptions(o *ManagerOptions) *ManagerOptions {
|
||
|
if o.NotifierOpts.QueueCapacity == 0 {
|
||
|
o.NotifierOpts.QueueCapacity = 10000
|
||
|
}
|
||
|
if o.NotifierOpts.Timeout == 0 {
|
||
|
o.NotifierOpts.Timeout = 10 * time.Second
|
||
|
}
|
||
|
if o.ResendDelay == time.Duration(0) {
|
||
|
o.ResendDelay = 1 * time.Minute
|
||
|
}
|
||
|
return o
|
||
|
}
|
||
|
|
||
|
// NewManager returns an implementation of Manager, ready to be started
|
||
|
// by calling the Run method.
|
||
|
func NewManager(o *ManagerOptions) (*Manager, error) {
|
||
|
|
||
|
o = defaultOptions(o)
|
||
|
// here we just initiate notifier, it will be started
|
||
|
// in run()
|
||
|
notifier, err := am.NewNotifier(&o.NotifierOpts, nil)
|
||
|
if err != nil {
|
||
|
// todo(amol): rethink on this, the query service
|
||
|
// should not be down because alert manager is not available
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
db := NewRuleDB(o.DBConn)
|
||
|
|
||
|
telemetry.GetInstance().SetAlertsInfoCallback(db.GetAlertsInfo)
|
||
|
|
||
|
m := &Manager{
|
||
|
tasks: map[string]Task{},
|
||
|
rules: map[string]Rule{},
|
||
|
notifier: notifier,
|
||
|
ruleDB: db,
|
||
|
opts: o,
|
||
|
block: make(chan struct{}),
|
||
|
logger: o.Logger,
|
||
|
featureFlags: o.FeatureFlags,
|
||
|
reader: o.Reader,
|
||
|
}
|
||
|
return m, nil
|
||
|
}
|
||
|
|
||
|
func (m *Manager) Start() {
|
||
|
if err := m.initiate(); err != nil {
|
||
|
zap.L().Error("failed to initialize alerting rules manager", zap.Error(err))
|
||
|
}
|
||
|
m.run()
|
||
|
}
|
||
|
|
||
|
func (m *Manager) RuleDB() RuleDB {
|
||
|
return m.ruleDB
|
||
|
}
|
||
|
|
||
|
func (m *Manager) Pause(b bool) {
|
||
|
m.mtx.Lock()
|
||
|
defer m.mtx.Unlock()
|
||
|
for _, t := range m.tasks {
|
||
|
t.Pause(b)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (m *Manager) initiate() error {
|
||
|
storedRules, err := m.ruleDB.GetStoredRules(context.Background())
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
if len(storedRules) == 0 {
|
||
|
return nil
|
||
|
}
|
||
|
var loadErrors []error
|
||
|
|
||
|
for _, rec := range storedRules {
|
||
|
taskName := fmt.Sprintf("%d-groupname", rec.Id)
|
||
|
parsedRule, errs := ParsePostableRule([]byte(rec.Data))
|
||
|
|
||
|
if len(errs) > 0 {
|
||
|
if errs[0].Error() == "failed to load json" {
|
||
|
zap.L().Info("failed to load rule in json format, trying yaml now:", zap.String("name", taskName))
|
||
|
|
||
|
// see if rule is stored in yaml format
|
||
|
parsedRule, errs = parsePostableRule([]byte(rec.Data), "yaml")
|
||
|
|
||
|
if parsedRule == nil {
|
||
|
zap.L().Error("failed to parse and initialize yaml rule", zap.String("name", taskName), zap.Error(err))
|
||
|
// just one rule is being parsed so expect just one error
|
||
|
loadErrors = append(loadErrors, errs[0])
|
||
|
continue
|
||
|
} else {
|
||
|
// rule stored in yaml, so migrate it to json
|
||
|
zap.L().Info("migrating rule from JSON to yaml", zap.String("name", taskName))
|
||
|
ruleJSON, err := json.Marshal(parsedRule)
|
||
|
if err == nil {
|
||
|
taskName, _, err := m.ruleDB.EditRuleTx(context.Background(), string(ruleJSON), fmt.Sprintf("%d", rec.Id))
|
||
|
if err != nil {
|
||
|
zap.L().Error("failed to migrate rule", zap.String("name", taskName), zap.Error(err))
|
||
|
} else {
|
||
|
zap.L().Info("migrated rule from yaml to json", zap.String("name", taskName))
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
zap.L().Error("failed to parse and initialize rule", zap.String("name", taskName), zap.Error(err))
|
||
|
// just one rule is being parsed so expect just one error
|
||
|
loadErrors = append(loadErrors, err)
|
||
|
continue
|
||
|
}
|
||
|
}
|
||
|
if !parsedRule.Disabled {
|
||
|
err := m.addTask(parsedRule, taskName)
|
||
|
if err != nil {
|
||
|
zap.L().Error("failed to load the rule definition", zap.String("name", taskName), zap.Error(err))
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if len(loadErrors) > 0 {
|
||
|
return errors.Join(loadErrors...)
|
||
|
}
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// Run starts processing of the rule manager.
|
||
|
func (m *Manager) run() {
|
||
|
// initiate notifier
|
||
|
go m.notifier.Run()
|
||
|
|
||
|
// initiate blocked tasks
|
||
|
close(m.block)
|
||
|
}
|
||
|
|
||
|
// Stop the rule manager's rule evaluation cycles.
|
||
|
func (m *Manager) Stop() {
|
||
|
m.mtx.Lock()
|
||
|
defer m.mtx.Unlock()
|
||
|
|
||
|
zap.L().Info("Stopping rule manager...")
|
||
|
|
||
|
for _, t := range m.tasks {
|
||
|
t.Stop()
|
||
|
}
|
||
|
|
||
|
zap.L().Info("Rule manager stopped")
|
||
|
}
|
||
|
|
||
|
// EditRuleDefinition writes the rule definition to the
|
||
|
// datastore and also updates the rule executor
|
||
|
func (m *Manager) EditRule(ctx context.Context, ruleStr string, id string) error {
|
||
|
|
||
|
parsedRule, errs := ParsePostableRule([]byte(ruleStr))
|
||
|
|
||
|
currentRule, err := m.GetRule(ctx, id)
|
||
|
if err != nil {
|
||
|
zap.L().Error("failed to get the rule from rule db", zap.String("id", id), zap.Error(err))
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
if !checkIfTraceOrLogQB(¤tRule.PostableRule) {
|
||
|
// check if the new rule uses any feature that is not enabled
|
||
|
err = m.checkFeatureUsage(parsedRule)
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if len(errs) > 0 {
|
||
|
zap.L().Error("failed to parse rules", zap.Errors("errors", errs))
|
||
|
// just one rule is being parsed so expect just one error
|
||
|
return errs[0]
|
||
|
}
|
||
|
|
||
|
taskName, _, err := m.ruleDB.EditRuleTx(ctx, ruleStr, id)
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
if !m.opts.DisableRules {
|
||
|
err = m.syncRuleStateWithTask(taskName, parsedRule)
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// update feature usage if the current rule is not a trace or log query builder
|
||
|
if !checkIfTraceOrLogQB(¤tRule.PostableRule) {
|
||
|
err = m.updateFeatureUsage(parsedRule, 1)
|
||
|
if err != nil {
|
||
|
zap.L().Error("error updating feature usage", zap.Error(err))
|
||
|
}
|
||
|
// update feature usage if the new rule is not a trace or log query builder and the current rule is
|
||
|
} else if !checkIfTraceOrLogQB(parsedRule) {
|
||
|
err = m.updateFeatureUsage(¤tRule.PostableRule, -1)
|
||
|
if err != nil {
|
||
|
zap.L().Error("error updating feature usage", zap.Error(err))
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (m *Manager) editTask(rule *PostableRule, taskName string) error {
|
||
|
m.mtx.Lock()
|
||
|
defer m.mtx.Unlock()
|
||
|
|
||
|
zap.L().Debug("editing a rule task", zap.String("name", taskName))
|
||
|
|
||
|
newTask, err := m.prepareTask(false, rule, taskName)
|
||
|
|
||
|
if err != nil {
|
||
|
zap.L().Error("loading tasks failed", zap.Error(err))
|
||
|
return errors.New("error preparing rule with given parameters, previous rule set restored")
|
||
|
}
|
||
|
|
||
|
// If there is an old task with the same identifier, stop it and wait for
|
||
|
// it to finish the current iteration. Then copy it into the new group.
|
||
|
oldTask, ok := m.tasks[taskName]
|
||
|
if !ok {
|
||
|
zap.L().Warn("rule task not found, a new task will be created", zap.String("name", taskName))
|
||
|
}
|
||
|
|
||
|
delete(m.tasks, taskName)
|
||
|
|
||
|
if ok {
|
||
|
oldTask.Stop()
|
||
|
newTask.CopyState(oldTask)
|
||
|
}
|
||
|
go func() {
|
||
|
// Wait with starting evaluation until the rule manager
|
||
|
// is told to run. This is necessary to avoid running
|
||
|
// queries against a bootstrapping storage.
|
||
|
<-m.block
|
||
|
newTask.Run(m.opts.Context)
|
||
|
}()
|
||
|
|
||
|
m.tasks[taskName] = newTask
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (m *Manager) DeleteRule(ctx context.Context, id string) error {
|
||
|
|
||
|
idInt, err := strconv.Atoi(id)
|
||
|
if err != nil {
|
||
|
zap.L().Error("delete rule received an rule id in invalid format, must be a number", zap.String("id", id), zap.Error(err))
|
||
|
return fmt.Errorf("delete rule received an rule id in invalid format, must be a number")
|
||
|
}
|
||
|
|
||
|
// update feature usage
|
||
|
rule, err := m.GetRule(ctx, id)
|
||
|
if err != nil {
|
||
|
zap.L().Error("failed to get the rule from rule db", zap.String("id", id), zap.Error(err))
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
taskName := prepareTaskName(int64(idInt))
|
||
|
if !m.opts.DisableRules {
|
||
|
m.deleteTask(taskName)
|
||
|
}
|
||
|
|
||
|
if _, _, err := m.ruleDB.DeleteRuleTx(ctx, id); err != nil {
|
||
|
zap.L().Error("failed to delete the rule from rule db", zap.String("id", id), zap.Error(err))
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
err = m.updateFeatureUsage(&rule.PostableRule, -1)
|
||
|
if err != nil {
|
||
|
zap.L().Error("error updating feature usage", zap.Error(err))
|
||
|
}
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (m *Manager) deleteTask(taskName string) {
|
||
|
m.mtx.Lock()
|
||
|
defer m.mtx.Unlock()
|
||
|
zap.L().Debug("deleting a rule task", zap.String("name", taskName))
|
||
|
|
||
|
oldg, ok := m.tasks[taskName]
|
||
|
if ok {
|
||
|
oldg.Stop()
|
||
|
delete(m.tasks, taskName)
|
||
|
delete(m.rules, ruleIdFromTaskName(taskName))
|
||
|
zap.L().Debug("rule task deleted", zap.String("name", taskName))
|
||
|
} else {
|
||
|
zap.L().Info("rule not found for deletion", zap.String("name", taskName))
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// CreateRule stores rule def into db and also
|
||
|
// starts an executor for the rule
|
||
|
func (m *Manager) CreateRule(ctx context.Context, ruleStr string) (*GettableRule, error) {
|
||
|
parsedRule, errs := ParsePostableRule([]byte(ruleStr))
|
||
|
|
||
|
// check if the rule uses any feature that is not enabled
|
||
|
err := m.checkFeatureUsage(parsedRule)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
if len(errs) > 0 {
|
||
|
zap.L().Error("failed to parse rules", zap.Errors("errors", errs))
|
||
|
// just one rule is being parsed so expect just one error
|
||
|
return nil, errs[0]
|
||
|
}
|
||
|
|
||
|
lastInsertId, tx, err := m.ruleDB.CreateRuleTx(ctx, ruleStr)
|
||
|
taskName := prepareTaskName(lastInsertId)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
if !m.opts.DisableRules {
|
||
|
if err := m.addTask(parsedRule, taskName); err != nil {
|
||
|
tx.Rollback()
|
||
|
return nil, err
|
||
|
}
|
||
|
}
|
||
|
err = tx.Commit()
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
// update feature usage
|
||
|
err = m.updateFeatureUsage(parsedRule, 1)
|
||
|
if err != nil {
|
||
|
zap.L().Error("error updating feature usage", zap.Error(err))
|
||
|
}
|
||
|
gettableRule := &GettableRule{
|
||
|
Id: fmt.Sprintf("%d", lastInsertId),
|
||
|
PostableRule: *parsedRule,
|
||
|
}
|
||
|
return gettableRule, nil
|
||
|
}
|
||
|
|
||
|
func (m *Manager) updateFeatureUsage(parsedRule *PostableRule, usage int64) error {
|
||
|
isTraceOrLogQB := checkIfTraceOrLogQB(parsedRule)
|
||
|
if isTraceOrLogQB {
|
||
|
feature, err := m.featureFlags.GetFeatureFlag(model.QueryBuilderAlerts)
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
feature.Usage += usage
|
||
|
if feature.Usage == feature.UsageLimit && feature.UsageLimit != -1 {
|
||
|
feature.Active = false
|
||
|
}
|
||
|
if feature.Usage < feature.UsageLimit || feature.UsageLimit == -1 {
|
||
|
feature.Active = true
|
||
|
}
|
||
|
err = m.featureFlags.UpdateFeatureFlag(feature)
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (m *Manager) checkFeatureUsage(parsedRule *PostableRule) error {
|
||
|
isTraceOrLogQB := checkIfTraceOrLogQB(parsedRule)
|
||
|
if isTraceOrLogQB {
|
||
|
err := m.featureFlags.CheckFeature(model.QueryBuilderAlerts)
|
||
|
if err != nil {
|
||
|
switch err.(type) {
|
||
|
case model.ErrFeatureUnavailable:
|
||
|
zap.L().Error("feature unavailable", zap.String("featureKey", model.QueryBuilderAlerts), zap.Error(err))
|
||
|
return model.BadRequest(err)
|
||
|
default:
|
||
|
zap.L().Error("feature check failed", zap.String("featureKey", model.QueryBuilderAlerts), zap.Error(err))
|
||
|
return model.BadRequest(err)
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func checkIfTraceOrLogQB(parsedRule *PostableRule) bool {
|
||
|
if parsedRule != nil {
|
||
|
if parsedRule.RuleCondition.QueryType() == v3.QueryTypeBuilder {
|
||
|
for _, query := range parsedRule.RuleCondition.CompositeQuery.BuilderQueries {
|
||
|
if query.DataSource == v3.DataSourceTraces || query.DataSource == v3.DataSourceLogs {
|
||
|
return true
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
func (m *Manager) addTask(rule *PostableRule, taskName string) error {
|
||
|
m.mtx.Lock()
|
||
|
defer m.mtx.Unlock()
|
||
|
|
||
|
zap.L().Debug("adding a new rule task", zap.String("name", taskName))
|
||
|
newTask, err := m.prepareTask(false, rule, taskName)
|
||
|
|
||
|
if err != nil {
|
||
|
zap.L().Error("creating rule task failed", zap.String("name", taskName), zap.Error(err))
|
||
|
return errors.New("error loading rules, previous rule set restored")
|
||
|
}
|
||
|
|
||
|
// If there is an another task with the same identifier, raise an error
|
||
|
_, ok := m.tasks[taskName]
|
||
|
if ok {
|
||
|
return fmt.Errorf("a rule with the same name already exists")
|
||
|
}
|
||
|
|
||
|
go func() {
|
||
|
// Wait with starting evaluation until the rule manager
|
||
|
// is told to run. This is necessary to avoid running
|
||
|
// queries against a bootstrapping storage.
|
||
|
<-m.block
|
||
|
newTask.Run(m.opts.Context)
|
||
|
}()
|
||
|
|
||
|
m.tasks[taskName] = newTask
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// prepareTask prepares a rule task from postable rule
|
||
|
func (m *Manager) prepareTask(acquireLock bool, r *PostableRule, taskName string) (Task, error) {
|
||
|
|
||
|
if acquireLock {
|
||
|
m.mtx.Lock()
|
||
|
defer m.mtx.Unlock()
|
||
|
}
|
||
|
|
||
|
rules := make([]Rule, 0)
|
||
|
var task Task
|
||
|
|
||
|
if r.AlertName == "" {
|
||
|
zap.L().Error("task load failed, at least one rule must be set", zap.String("name", taskName))
|
||
|
return task, fmt.Errorf("task load failed, at least one rule must be set")
|
||
|
}
|
||
|
|
||
|
ruleId := ruleIdFromTaskName(taskName)
|
||
|
if r.RuleType == RuleTypeThreshold {
|
||
|
// create a threshold rule
|
||
|
tr, err := NewThresholdRule(
|
||
|
ruleId,
|
||
|
r,
|
||
|
ThresholdRuleOpts{},
|
||
|
m.featureFlags,
|
||
|
m.reader,
|
||
|
)
|
||
|
|
||
|
if err != nil {
|
||
|
return task, err
|
||
|
}
|
||
|
|
||
|
rules = append(rules, tr)
|
||
|
|
||
|
// create ch rule task for evalution
|
||
|
task = newTask(TaskTypeCh, taskName, taskNamesuffix, time.Duration(r.Frequency), rules, m.opts, m.prepareNotifyFunc(), m.ruleDB)
|
||
|
|
||
|
// add rule to memory
|
||
|
m.rules[ruleId] = tr
|
||
|
|
||
|
} else if r.RuleType == RuleTypeProm {
|
||
|
|
||
|
// create promql rule
|
||
|
pr, err := NewPromRule(
|
||
|
ruleId,
|
||
|
r,
|
||
|
log.With(m.logger, "alert", r.AlertName),
|
||
|
PromRuleOpts{},
|
||
|
)
|
||
|
|
||
|
if err != nil {
|
||
|
return task, err
|
||
|
}
|
||
|
|
||
|
rules = append(rules, pr)
|
||
|
|
||
|
// create promql rule task for evalution
|
||
|
task = newTask(TaskTypeProm, taskName, taskNamesuffix, time.Duration(r.Frequency), rules, m.opts, m.prepareNotifyFunc(), m.ruleDB)
|
||
|
|
||
|
// add rule to memory
|
||
|
m.rules[ruleId] = pr
|
||
|
|
||
|
} else {
|
||
|
return nil, fmt.Errorf(fmt.Sprintf("unsupported rule type. Supported types: %s, %s", RuleTypeProm, RuleTypeThreshold))
|
||
|
}
|
||
|
|
||
|
return task, nil
|
||
|
}
|
||
|
|
||
|
// RuleTasks returns the list of manager's rule tasks.
|
||
|
func (m *Manager) RuleTasks() []Task {
|
||
|
m.mtx.RLock()
|
||
|
defer m.mtx.RUnlock()
|
||
|
|
||
|
rgs := make([]Task, 0, len(m.tasks))
|
||
|
for _, g := range m.tasks {
|
||
|
rgs = append(rgs, g)
|
||
|
}
|
||
|
|
||
|
sort.Slice(rgs, func(i, j int) bool {
|
||
|
return rgs[i].Name() < rgs[j].Name()
|
||
|
})
|
||
|
|
||
|
return rgs
|
||
|
}
|
||
|
|
||
|
// RuleTasks returns the list of manager's rule tasks.
|
||
|
func (m *Manager) RuleTasksWithoutLock() []Task {
|
||
|
|
||
|
rgs := make([]Task, 0, len(m.tasks))
|
||
|
for _, g := range m.tasks {
|
||
|
rgs = append(rgs, g)
|
||
|
}
|
||
|
|
||
|
sort.Slice(rgs, func(i, j int) bool {
|
||
|
return rgs[i].Name() < rgs[j].Name()
|
||
|
})
|
||
|
|
||
|
return rgs
|
||
|
}
|
||
|
|
||
|
// Rules returns the list of the manager's rules.
|
||
|
func (m *Manager) Rules() []Rule {
|
||
|
m.mtx.RLock()
|
||
|
defer m.mtx.RUnlock()
|
||
|
|
||
|
rules := []Rule{}
|
||
|
for _, r := range m.rules {
|
||
|
rules = append(rules, r)
|
||
|
}
|
||
|
|
||
|
return rules
|
||
|
}
|
||
|
|
||
|
// TriggeredAlerts returns the list of the manager's rules.
|
||
|
func (m *Manager) TriggeredAlerts() []*NamedAlert {
|
||
|
// m.mtx.RLock()
|
||
|
// defer m.mtx.RUnlock()
|
||
|
|
||
|
namedAlerts := []*NamedAlert{}
|
||
|
|
||
|
for _, r := range m.rules {
|
||
|
active := r.ActiveAlerts()
|
||
|
|
||
|
for _, a := range active {
|
||
|
awn := &NamedAlert{
|
||
|
Alert: a,
|
||
|
Name: r.Name(),
|
||
|
}
|
||
|
namedAlerts = append(namedAlerts, awn)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return namedAlerts
|
||
|
}
|
||
|
|
||
|
// NotifyFunc sends notifications about a set of alerts generated by the given expression.
|
||
|
type NotifyFunc func(ctx context.Context, expr string, alerts ...*Alert)
|
||
|
|
||
|
// prepareNotifyFunc implements the NotifyFunc for a Notifier.
|
||
|
func (m *Manager) prepareNotifyFunc() NotifyFunc {
|
||
|
return func(ctx context.Context, expr string, alerts ...*Alert) {
|
||
|
var res []*am.Alert
|
||
|
|
||
|
for _, alert := range alerts {
|
||
|
generatorURL := alert.GeneratorURL
|
||
|
if generatorURL == "" {
|
||
|
generatorURL = m.opts.RepoURL
|
||
|
}
|
||
|
|
||
|
a := &am.Alert{
|
||
|
StartsAt: alert.FiredAt,
|
||
|
Labels: alert.Labels,
|
||
|
Annotations: alert.Annotations,
|
||
|
GeneratorURL: generatorURL,
|
||
|
Receivers: alert.Receivers,
|
||
|
}
|
||
|
if !alert.ResolvedAt.IsZero() {
|
||
|
a.EndsAt = alert.ResolvedAt
|
||
|
} else {
|
||
|
a.EndsAt = alert.ValidUntil
|
||
|
}
|
||
|
res = append(res, a)
|
||
|
}
|
||
|
|
||
|
if len(alerts) > 0 {
|
||
|
m.notifier.Send(res...)
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (m *Manager) ListActiveRules() ([]Rule, error) {
|
||
|
ruleList := []Rule{}
|
||
|
|
||
|
for _, r := range m.rules {
|
||
|
ruleList = append(ruleList, r)
|
||
|
}
|
||
|
|
||
|
return ruleList, nil
|
||
|
}
|
||
|
|
||
|
func (m *Manager) ListRuleStates(ctx context.Context) (*GettableRules, error) {
|
||
|
|
||
|
// fetch rules from DB
|
||
|
storedRules, err := m.ruleDB.GetStoredRules(ctx)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
// initiate response object
|
||
|
resp := make([]*GettableRule, 0)
|
||
|
|
||
|
for _, s := range storedRules {
|
||
|
|
||
|
ruleResponse := &GettableRule{}
|
||
|
if err := json.Unmarshal([]byte(s.Data), ruleResponse); err != nil { // Parse []byte to go struct pointer
|
||
|
zap.L().Error("failed to unmarshal rule from db", zap.Int("id", s.Id), zap.Error(err))
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
ruleResponse.Id = fmt.Sprintf("%d", s.Id)
|
||
|
|
||
|
// fetch state of rule from memory
|
||
|
if rm, ok := m.rules[ruleResponse.Id]; !ok {
|
||
|
ruleResponse.State = StateDisabled.String()
|
||
|
ruleResponse.Disabled = true
|
||
|
} else {
|
||
|
ruleResponse.State = rm.State().String()
|
||
|
}
|
||
|
ruleResponse.CreatedAt = s.CreatedAt
|
||
|
ruleResponse.CreatedBy = s.CreatedBy
|
||
|
ruleResponse.UpdatedAt = s.UpdatedAt
|
||
|
ruleResponse.UpdatedBy = s.UpdatedBy
|
||
|
resp = append(resp, ruleResponse)
|
||
|
}
|
||
|
|
||
|
return &GettableRules{Rules: resp}, nil
|
||
|
}
|
||
|
|
||
|
func (m *Manager) GetRule(ctx context.Context, id string) (*GettableRule, error) {
|
||
|
s, err := m.ruleDB.GetStoredRule(ctx, id)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
r := &GettableRule{}
|
||
|
if err := json.Unmarshal([]byte(s.Data), r); err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
r.Id = fmt.Sprintf("%d", s.Id)
|
||
|
// fetch state of rule from memory
|
||
|
if rm, ok := m.rules[r.Id]; !ok {
|
||
|
r.State = StateDisabled.String()
|
||
|
r.Disabled = true
|
||
|
} else {
|
||
|
r.State = rm.State().String()
|
||
|
}
|
||
|
r.CreatedAt = s.CreatedAt
|
||
|
r.CreatedBy = s.CreatedBy
|
||
|
r.UpdatedAt = s.UpdatedAt
|
||
|
r.UpdatedBy = s.UpdatedBy
|
||
|
|
||
|
return r, nil
|
||
|
}
|
||
|
|
||
|
// syncRuleStateWithTask ensures that the state of a stored rule matches
|
||
|
// the task state. For example - if a stored rule is disabled, then
|
||
|
// there is no task running against it.
|
||
|
func (m *Manager) syncRuleStateWithTask(taskName string, rule *PostableRule) error {
|
||
|
|
||
|
if rule.Disabled {
|
||
|
// check if rule has any task running
|
||
|
if _, ok := m.tasks[taskName]; ok {
|
||
|
// delete task from memory
|
||
|
m.deleteTask(taskName)
|
||
|
}
|
||
|
} else {
|
||
|
// check if rule has a task running
|
||
|
if _, ok := m.tasks[taskName]; !ok {
|
||
|
// rule has not task, start one
|
||
|
if err := m.addTask(rule, taskName); err != nil {
|
||
|
return err
|
||
|
}
|
||
|
} else {
|
||
|
if err := m.editTask(rule, taskName); err != nil {
|
||
|
return err
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// PatchRule supports attribute level changes to the rule definition unlike
|
||
|
// EditRule, which updates entire rule definition in the DB.
|
||
|
// the process:
|
||
|
// - get the latest rule from db
|
||
|
// - over write the patch attributes received in input (ruleStr)
|
||
|
// - re-deploy or undeploy task as necessary
|
||
|
// - update the patched rule in the DB
|
||
|
func (m *Manager) PatchRule(ctx context.Context, ruleStr string, ruleId string) (*GettableRule, error) {
|
||
|
|
||
|
if ruleId == "" {
|
||
|
return nil, fmt.Errorf("id is mandatory for patching rule")
|
||
|
}
|
||
|
|
||
|
taskName := prepareTaskName(ruleId)
|
||
|
|
||
|
// retrieve rule from DB
|
||
|
storedJSON, err := m.ruleDB.GetStoredRule(ctx, ruleId)
|
||
|
if err != nil {
|
||
|
zap.L().Error("failed to get stored rule with given id", zap.String("id", ruleId), zap.Error(err))
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
// storedRule holds the current stored rule from DB
|
||
|
storedRule := PostableRule{}
|
||
|
if err := json.Unmarshal([]byte(storedJSON.Data), &storedRule); err != nil {
|
||
|
zap.L().Error("failed to unmarshal stored rule with given id", zap.String("id", ruleId), zap.Error(err))
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
// patchedRule is combo of stored rule and patch received in the request
|
||
|
patchedRule, errs := parseIntoRule(storedRule, []byte(ruleStr), "json")
|
||
|
if len(errs) > 0 {
|
||
|
zap.L().Error("failed to parse rules", zap.Errors("errors", errs))
|
||
|
// just one rule is being parsed so expect just one error
|
||
|
return nil, errs[0]
|
||
|
}
|
||
|
|
||
|
// deploy or un-deploy task according to patched (new) rule state
|
||
|
if err := m.syncRuleStateWithTask(taskName, patchedRule); err != nil {
|
||
|
zap.L().Error("failed to sync stored rule state with the task", zap.String("taskName", taskName), zap.Error(err))
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
// prepare rule json to write to update db
|
||
|
patchedRuleBytes, err := json.Marshal(patchedRule)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
// write updated rule to db
|
||
|
if _, _, err = m.ruleDB.EditRuleTx(ctx, string(patchedRuleBytes), ruleId); err != nil {
|
||
|
// write failed, rollback task state
|
||
|
|
||
|
// restore task state from the stored rule
|
||
|
if err := m.syncRuleStateWithTask(taskName, &storedRule); err != nil {
|
||
|
zap.L().Error("failed to restore rule after patch failure", zap.String("taskName", taskName), zap.Error(err))
|
||
|
}
|
||
|
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
// prepare http response
|
||
|
response := GettableRule{
|
||
|
Id: ruleId,
|
||
|
PostableRule: *patchedRule,
|
||
|
}
|
||
|
|
||
|
// fetch state of rule from memory
|
||
|
if rm, ok := m.rules[ruleId]; !ok {
|
||
|
response.State = StateDisabled.String()
|
||
|
response.Disabled = true
|
||
|
} else {
|
||
|
response.State = rm.State().String()
|
||
|
}
|
||
|
|
||
|
return &response, nil
|
||
|
}
|
||
|
|
||
|
// TestNotification prepares a dummy rule for given rule parameters and
|
||
|
// sends a test notification. returns alert count and error (if any)
|
||
|
func (m *Manager) TestNotification(ctx context.Context, ruleStr string) (int, *model.ApiError) {
|
||
|
|
||
|
parsedRule, errs := ParsePostableRule([]byte(ruleStr))
|
||
|
|
||
|
if len(errs) > 0 {
|
||
|
zap.L().Error("failed to parse rule from request", zap.Errors("errors", errs))
|
||
|
return 0, newApiErrorBadData(errs[0])
|
||
|
}
|
||
|
|
||
|
var alertname = parsedRule.AlertName
|
||
|
if alertname == "" {
|
||
|
// alertname is not mandatory for testing, so picking
|
||
|
// a random string here
|
||
|
alertname = uuid.New().String()
|
||
|
}
|
||
|
|
||
|
// append name to indicate this is test alert
|
||
|
parsedRule.AlertName = fmt.Sprintf("%s%s", alertname, TestAlertPostFix)
|
||
|
|
||
|
var rule Rule
|
||
|
var err error
|
||
|
|
||
|
if parsedRule.RuleType == RuleTypeThreshold {
|
||
|
|
||
|
// add special labels for test alerts
|
||
|
parsedRule.Annotations[labels.AlertSummaryLabel] = fmt.Sprintf("The rule threshold is set to %.4f, and the observed metric value is {{$value}}.", *parsedRule.RuleCondition.Target)
|
||
|
parsedRule.Labels[labels.RuleSourceLabel] = ""
|
||
|
parsedRule.Labels[labels.AlertRuleIdLabel] = ""
|
||
|
|
||
|
// create a threshold rule
|
||
|
rule, err = NewThresholdRule(
|
||
|
alertname,
|
||
|
parsedRule,
|
||
|
ThresholdRuleOpts{
|
||
|
SendUnmatched: true,
|
||
|
SendAlways: true,
|
||
|
},
|
||
|
m.featureFlags,
|
||
|
m.reader,
|
||
|
)
|
||
|
|
||
|
if err != nil {
|
||
|
zap.L().Error("failed to prepare a new threshold rule for test", zap.String("name", rule.Name()), zap.Error(err))
|
||
|
return 0, newApiErrorBadData(err)
|
||
|
}
|
||
|
|
||
|
} else if parsedRule.RuleType == RuleTypeProm {
|
||
|
|
||
|
// create promql rule
|
||
|
rule, err = NewPromRule(
|
||
|
alertname,
|
||
|
parsedRule,
|
||
|
log.With(m.logger, "alert", alertname),
|
||
|
PromRuleOpts{
|
||
|
SendAlways: true,
|
||
|
},
|
||
|
)
|
||
|
|
||
|
if err != nil {
|
||
|
zap.L().Error("failed to prepare a new promql rule for test", zap.String("name", rule.Name()), zap.Error(err))
|
||
|
return 0, newApiErrorBadData(err)
|
||
|
}
|
||
|
} else {
|
||
|
return 0, newApiErrorBadData(fmt.Errorf("failed to derive ruletype with given information"))
|
||
|
}
|
||
|
|
||
|
// set timestamp to current utc time
|
||
|
ts := time.Now().UTC()
|
||
|
|
||
|
count, err := rule.Eval(ctx, ts, m.opts.Queriers)
|
||
|
if err != nil {
|
||
|
zap.L().Error("evaluating rule failed", zap.String("rule", rule.Name()), zap.Error(err))
|
||
|
return 0, newApiErrorInternal(fmt.Errorf("rule evaluation failed"))
|
||
|
}
|
||
|
alertsFound, ok := count.(int)
|
||
|
if !ok {
|
||
|
return 0, newApiErrorInternal(fmt.Errorf("something went wrong"))
|
||
|
}
|
||
|
rule.SendAlerts(ctx, ts, 0, time.Duration(1*time.Minute), m.prepareNotifyFunc())
|
||
|
|
||
|
return alertsFound, nil
|
||
|
}
|