From ce55e5074d05f570ddc619619e877e011dc270e2 Mon Sep 17 00:00:00 2001
From: Julien Pivotto
Date: Mon, 9 Jan 2023 12:21:38 +0100
Subject: [PATCH] Add 'keep_firing_for' field to alerting rules

This commit adds a new 'keep_firing_for' field to Prometheus alerting
rules. The 'keep_firing_for' field specifies the minimum amount of time
that an alert should remain firing, even if the expression does not
return any results.

This feature was discussed at a previous dev summit, where it was agreed
that it would be useful: it gives the expression time to stabilize and
prevents confusing resolved messages from being propagated through
Alertmanager. This approach is simpler than the two-PromQL-query
alternative that has sometimes been discussed, and it should be easy to
implement.

This commit intentionally does not include tests for the
'keep_firing_for' field: the goal is first to gather comments on the
proposed design. Once the design has been finalized, a follow-up commit
will add tests.

A minimal example rule that uses the new field is sketched after the
patch.

See https://github.com/prometheus/prometheus/issues/11570

Signed-off-by: Julien Pivotto
---
 docs/configuration/recording_rules.md              |  4 ++
 model/rulefmt/rulefmt.go                           | 32 +++++++++------
 rules/alerting.go                                  | 40 +++++++++++++++----
 rules/alerting_test.go                             | 15 ++++++-
 rules/manager.go                                   |  1 +
 rules/manager_test.go                              | 18 +++++----
 web/api/v1/api.go                                  | 24 ++++++-----
 web/api/v1/api_test.go                             |  2 +
 .../pages/alerts/CollapsibleAlertPanel.tsx         |  5 +++
 .../src/pages/rules/RulesContent.tsx               |  5 +++
 web/ui/react-app/src/types/types.ts                |  1 +
 11 files changed, 108 insertions(+), 39 deletions(-)

diff --git a/docs/configuration/recording_rules.md b/docs/configuration/recording_rules.md
index d70ffa0cbb..eda0214b35 100644
--- a/docs/configuration/recording_rules.md
+++ b/docs/configuration/recording_rules.md
@@ -123,6 +123,10 @@ expr: <string>
 # Alerts which have not yet fired for long enough are considered pending.
 [ for: <duration> | default = 0s ]
 
+# How long an alert will continue firing after the condition that triggered it
+# has cleared.
+[ keep_firing_for: <duration> | default = 0s ]
+
 # Labels to add or overwrite for each alert.
 labels:
   [ <labelname>: <tmpl_string> ]
diff --git a/model/rulefmt/rulefmt.go b/model/rulefmt/rulefmt.go
index f1d5f39257..30b3face0d 100644
--- a/model/rulefmt/rulefmt.go
+++ b/model/rulefmt/rulefmt.go
@@ -143,22 +143,24 @@ type RuleGroup struct {
 
 // Rule describes an alerting or recording rule.
 type Rule struct {
-	Record      string            `yaml:"record,omitempty"`
-	Alert       string            `yaml:"alert,omitempty"`
-	Expr        string            `yaml:"expr"`
-	For         model.Duration    `yaml:"for,omitempty"`
-	Labels      map[string]string `yaml:"labels,omitempty"`
-	Annotations map[string]string `yaml:"annotations,omitempty"`
+	Record        string            `yaml:"record,omitempty"`
+	Alert         string            `yaml:"alert,omitempty"`
+	Expr          string            `yaml:"expr"`
+	For           model.Duration    `yaml:"for,omitempty"`
+	KeepFiringFor model.Duration    `yaml:"keep_firing_for,omitempty"`
+	Labels        map[string]string `yaml:"labels,omitempty"`
+	Annotations   map[string]string `yaml:"annotations,omitempty"`
 }
 
 // RuleNode adds yaml.v3 layer to support line and column outputs for invalid rules.
 type RuleNode struct {
-	Record      yaml.Node         `yaml:"record,omitempty"`
-	Alert       yaml.Node         `yaml:"alert,omitempty"`
-	Expr        yaml.Node         `yaml:"expr"`
-	For         model.Duration    `yaml:"for,omitempty"`
-	Labels      map[string]string `yaml:"labels,omitempty"`
-	Annotations map[string]string `yaml:"annotations,omitempty"`
+	Record        yaml.Node         `yaml:"record,omitempty"`
+	Alert         yaml.Node         `yaml:"alert,omitempty"`
+	Expr          yaml.Node         `yaml:"expr"`
+	For           model.Duration    `yaml:"for,omitempty"`
+	KeepFiringFor model.Duration    `yaml:"keep_firing_for,omitempty"`
+	Labels        map[string]string `yaml:"labels,omitempty"`
+	Annotations   map[string]string `yaml:"annotations,omitempty"`
 }
 
 // Validate the rule and return a list of encountered errors.
@@ -208,6 +210,12 @@ func (r *RuleNode) Validate() (nodes []WrappedError) {
 			node: &r.Record,
 		})
 	}
+	if r.KeepFiringFor != 0 {
+		nodes = append(nodes, WrappedError{
+			err:  fmt.Errorf("invalid field 'keep_firing_for' in recording rule"),
+			node: &r.Record,
+		})
+	}
 	if !model.IsValidMetricName(model.LabelValue(r.Record.Value)) {
 		nodes = append(nodes, WrappedError{
 			err: fmt.Errorf("invalid recording rule name: %s", r.Record.Value),
diff --git a/rules/alerting.go b/rules/alerting.go
index d456662669..9ff3e8fc32 100644
--- a/rules/alerting.go
+++ b/rules/alerting.go
@@ -83,11 +83,12 @@ type Alert struct {
 	Value float64
 	// The interval during which the condition of this alert held true.
 	// ResolvedAt will be 0 to indicate a still active alert.
-	ActiveAt   time.Time
-	FiredAt    time.Time
-	ResolvedAt time.Time
-	LastSentAt time.Time
-	ValidUntil time.Time
+	ActiveAt        time.Time
+	FiredAt         time.Time
+	ResolvedAt      time.Time
+	LastSentAt      time.Time
+	ValidUntil      time.Time
+	KeepFiringSince time.Time
 }
 
 func (a *Alert) needsSending(ts time.Time, resendDelay time.Duration) bool {
@@ -112,6 +113,9 @@ type AlertingRule struct {
 	// The duration for which a labelset needs to persist in the expression
 	// output vector before an alert transitions from Pending to Firing state.
 	holdDuration time.Duration
+	// The amount of time that the alert should remain firing after the
+	// resolution.
+	keepFiringFor time.Duration
 	// Extra labels to attach to the resulting alert sample vectors.
 	labels labels.Labels
 	// Non-identifying key/value pairs.
@@ -142,7 +146,7 @@ type AlertingRule struct {
 
 // NewAlertingRule constructs a new AlertingRule.
 func NewAlertingRule(
-	name string, vec parser.Expr, hold time.Duration,
+	name string, vec parser.Expr, hold, keepFiringFor time.Duration,
 	labels, annotations, externalLabels labels.Labels, externalURL string,
 	restored bool, logger log.Logger,
 ) *AlertingRule {
@@ -152,6 +156,7 @@ func NewAlertingRule(
 		name:           name,
 		vector:         vec,
 		holdDuration:   hold,
+		keepFiringFor:  keepFiringFor,
 		labels:         labels,
 		annotations:    annotations,
 		externalLabels: el,
@@ -201,6 +206,12 @@ func (r *AlertingRule) HoldDuration() time.Duration {
 	return r.holdDuration
 }
 
+// KeepFiringFor returns the duration an alerting rule should keep firing for
+// after resolution.
+func (r *AlertingRule) KeepFiringFor() time.Duration {
+	return r.keepFiringFor
+}
+
 // Labels returns the labels of the alerting rule.
 func (r *AlertingRule) Labels() labels.Labels {
 	return r.labels
@@ -404,16 +415,29 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
 	// Check if any pending alerts should be removed or fire now. Write out alert timeseries.
 	for fp, a := range r.active {
 		if _, ok := resultFPs[fp]; !ok {
+			var keepFiring bool
+			if a.State == StateFiring && r.keepFiringFor > 0 {
+				if a.KeepFiringSince.IsZero() {
+					a.KeepFiringSince = ts
+				}
+				if ts.Sub(a.KeepFiringSince) < r.keepFiringFor {
+					keepFiring = true
+				}
+			}
 			// If the alert was previously firing, keep it around for a given
 			// retention time so it is reported as resolved to the AlertManager.
 			if a.State == StatePending || (!a.ResolvedAt.IsZero() && ts.Sub(a.ResolvedAt) > resolvedRetention) {
 				delete(r.active, fp)
 			}
-			if a.State != StateInactive {
+			if a.State != StateInactive && !keepFiring {
 				a.State = StateInactive
 				a.ResolvedAt = ts
 			}
-			continue
+			if !keepFiring {
+				continue
+			}
+		} else {
+			a.KeepFiringSince = time.Time{}
 		}
 
 		numActivePending++
diff --git a/rules/alerting_test.go b/rules/alerting_test.go
index 4f5f5e683a..d95610c273 100644
--- a/rules/alerting_test.go
+++ b/rules/alerting_test.go
@@ -66,7 +66,7 @@ func TestAlertingRuleState(t *testing.T) {
 	}
 
 	for i, test := range tests {
-		rule := NewAlertingRule(test.name, nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil)
+		rule := NewAlertingRule(test.name, nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil)
 		rule.active = test.active
 		got := rule.State()
 		require.Equal(t, test.want, got, "test case %d unexpected AlertState, want:%d got:%d", i, test.want, got)
@@ -90,6 +90,7 @@ func TestAlertingRuleLabelsUpdate(t *testing.T) {
 		"HTTPRequestRateLow",
 		expr,
 		time.Minute,
+		0,
 		// Basing alerting rule labels off of a value that can change is a very bad idea.
 		// If an alert is going back and forth between two label values it will never fire.
 		// Instead, you should write two alerts with constant labels.
@@ -192,6 +193,7 @@ func TestAlertingRuleExternalLabelsInTemplate(t *testing.T) {
 		"ExternalLabelDoesNotExist",
 		expr,
 		time.Minute,
+		0,
 		labels.FromStrings("templated_label", "There are {{ len $externalLabels }} external Labels, of which foo is {{ $externalLabels.foo }}."),
 		labels.EmptyLabels(),
 		labels.EmptyLabels(),
@@ -202,6 +204,7 @@
 		"ExternalLabelExists",
 		expr,
 		time.Minute,
+		0,
 		labels.FromStrings("templated_label", "There are {{ len $externalLabels }} external Labels, of which foo is {{ $externalLabels.foo }}."),
 		labels.EmptyLabels(),
 		labels.FromStrings("foo", "bar", "dings", "bums"),
@@ -286,6 +289,7 @@ func TestAlertingRuleExternalURLInTemplate(t *testing.T) {
 		"ExternalURLDoesNotExist",
 		expr,
 		time.Minute,
+		0,
 		labels.FromStrings("templated_label", "The external URL is {{ $externalURL }}."),
 		labels.EmptyLabels(),
 		labels.EmptyLabels(),
@@ -296,6 +300,7 @@
 		"ExternalURLExists",
 		expr,
 		time.Minute,
+		0,
 		labels.FromStrings("templated_label", "The external URL is {{ $externalURL }}."),
 		labels.EmptyLabels(),
 		labels.EmptyLabels(),
@@ -380,6 +385,7 @@ func TestAlertingRuleEmptyLabelFromTemplate(t *testing.T) {
 		"EmptyLabel",
 		expr,
 		time.Minute,
+		0,
 		labels.FromStrings("empty_label", ""),
 		labels.EmptyLabels(),
 		labels.EmptyLabels(),
@@ -436,6 +442,7 @@ func TestAlertingRuleQueryInTemplate(t *testing.T) {
 		"ruleWithQueryInTemplate",
 		expr,
 		time.Minute,
+		0,
 		labels.FromStrings("label", "value"),
 		labels.FromStrings("templated_label", `{{- with "sort(sum(http_requests) by (instance))" | query -}}
{{- range $i,$v := .
-}} @@ -480,7 +487,7 @@ instance: {{ $v.Labels.instance }}, value: {{ printf "%.0f" $v.Value }}; func BenchmarkAlertingRuleAtomicField(b *testing.B) { b.ReportAllocs() - rule := NewAlertingRule("bench", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil) + rule := NewAlertingRule("bench", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil) done := make(chan struct{}) go func() { for i := 0; i < b.N; i++ { @@ -518,6 +525,7 @@ func TestAlertingRuleDuplicate(t *testing.T) { "foo", expr, time.Minute, + 0, labels.FromStrings("test", "test"), labels.EmptyLabels(), labels.EmptyLabels(), @@ -564,6 +572,7 @@ func TestAlertingRuleLimit(t *testing.T) { "foo", expr, time.Minute, + 0, labels.FromStrings("test", "test"), labels.EmptyLabels(), labels.EmptyLabels(), @@ -636,6 +645,7 @@ func TestQueryForStateSeries(t *testing.T) { "TestRule", nil, time.Minute, + 0, labels.FromStrings("severity", "critical"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil, ) @@ -669,6 +679,7 @@ func TestSendAlertsDontAffectActiveAlerts(t *testing.T) { "TestRule", nil, time.Minute, + 0, labels.FromStrings("severity", "critical"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil, ) diff --git a/rules/manager.go b/rules/manager.go index d1ad8afdc5..6f6ce2cfe4 100644 --- a/rules/manager.go +++ b/rules/manager.go @@ -1119,6 +1119,7 @@ func (m *Manager) LoadGroups( r.Alert.Value, expr, time.Duration(r.For), + time.Duration(r.KeepFiringFor), labels.FromMap(r.Labels), labels.FromMap(r.Annotations), externalLabels, diff --git a/rules/manager_test.go b/rules/manager_test.go index 788aa0af38..6f0dd0ddaa 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -66,6 +66,7 @@ func TestAlertingRule(t *testing.T) { "HTTPRequestRateLow", expr, time.Minute, + 0, labels.FromStrings("severity", "{{\"c\"}}ritical"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil, ) @@ -209,6 +210,7 @@ func TestForStateAddSamples(t *testing.T) { "HTTPRequestRateLow", expr, time.Minute, + 0, labels.FromStrings("severity", "{{\"c\"}}ritical"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil, ) @@ -383,6 +385,7 @@ func TestForStateRestore(t *testing.T) { "HTTPRequestRateLow", expr, alertForDuration, + 0, labels.FromStrings("severity", "critical"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil, ) @@ -449,6 +452,7 @@ func TestForStateRestore(t *testing.T) { "HTTPRequestRateLow", expr, alertForDuration, + 0, labels.FromStrings("severity", "critical"), labels.EmptyLabels(), labels.EmptyLabels(), "", false, nil, ) @@ -615,13 +619,13 @@ func readSeriesSet(ss storage.SeriesSet) (map[string][]promql.Point, error) { func TestCopyState(t *testing.T) { oldGroup := &Group{ rules: []Rule{ - NewAlertingRule("alert", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), + NewAlertingRule("alert", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), NewRecordingRule("rule1", nil, labels.EmptyLabels()), NewRecordingRule("rule2", nil, labels.EmptyLabels()), NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v1")), NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v2")), NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v3")), - NewAlertingRule("alert2", nil, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), + NewAlertingRule("alert2", nil, 0, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), 
labels.EmptyLabels(), "", true, nil), }, seriesInPreviousEval: []map[string]labels.Labels{ {}, @@ -640,10 +644,10 @@ func TestCopyState(t *testing.T) { NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v0")), NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v1")), NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v2")), - NewAlertingRule("alert", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), + NewAlertingRule("alert", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), NewRecordingRule("rule1", nil, labels.EmptyLabels()), - NewAlertingRule("alert2", nil, 0, labels.FromStrings("l2", "v0"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), - NewAlertingRule("alert2", nil, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), + NewAlertingRule("alert2", nil, 0, 0, labels.FromStrings("l2", "v0"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), + NewAlertingRule("alert2", nil, 0, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), NewRecordingRule("rule4", nil, labels.EmptyLabels()), }, seriesInPreviousEval: make([]map[string]labels.Labels, 8), @@ -875,7 +879,7 @@ func TestNotify(t *testing.T) { expr, err := parser.ParseExpr("a > 1") require.NoError(t, err) - rule := NewAlertingRule("aTooHigh", expr, 0, labels.Labels{}, labels.Labels{}, labels.EmptyLabels(), "", true, log.NewNopLogger()) + rule := NewAlertingRule("aTooHigh", expr, 0, 0, labels.Labels{}, labels.Labels{}, labels.EmptyLabels(), "", true, log.NewNopLogger()) group := NewGroup(GroupOptions{ Name: "alert", Interval: time.Second, @@ -1147,7 +1151,7 @@ func TestGroupHasAlertingRules(t *testing.T) { group: &Group{ name: "HasAlertingRule", rules: []Rule{ - NewAlertingRule("alert", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), + NewAlertingRule("alert", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), NewRecordingRule("record", nil, labels.EmptyLabels()), }, }, diff --git a/web/api/v1/api.go b/web/api/v1/api.go index 894a8666a6..29532bceb3 100644 --- a/web/api/v1/api.go +++ b/web/api/v1/api.go @@ -1111,11 +1111,12 @@ type AlertDiscovery struct { // Alert has info for an alert. 
type Alert struct { - Labels labels.Labels `json:"labels"` - Annotations labels.Labels `json:"annotations"` - State string `json:"state"` - ActiveAt *time.Time `json:"activeAt,omitempty"` - Value string `json:"value"` + Labels labels.Labels `json:"labels"` + Annotations labels.Labels `json:"annotations"` + State string `json:"state"` + ActiveAt *time.Time `json:"activeAt,omitempty"` + KeepFiringSince *time.Time `json:"keep_firing_since,omitempty"` + Value string `json:"value"` } func (api *API) alerts(r *http.Request) apiFuncResult { @@ -1138,11 +1139,12 @@ func rulesAlertsToAPIAlerts(rulesAlerts []*rules.Alert) []*Alert { apiAlerts := make([]*Alert, len(rulesAlerts)) for i, ruleAlert := range rulesAlerts { apiAlerts[i] = &Alert{ - Labels: ruleAlert.Labels, - Annotations: ruleAlert.Annotations, - State: ruleAlert.State.String(), - ActiveAt: &ruleAlert.ActiveAt, - Value: strconv.FormatFloat(ruleAlert.Value, 'e', -1, 64), + Labels: ruleAlert.Labels, + Annotations: ruleAlert.Annotations, + State: ruleAlert.State.String(), + ActiveAt: &ruleAlert.ActiveAt, + KeepFiringSince: &ruleAlert.KeepFiringSince, + Value: strconv.FormatFloat(ruleAlert.Value, 'e', -1, 64), } } @@ -1241,6 +1243,7 @@ type AlertingRule struct { Name string `json:"name"` Query string `json:"query"` Duration float64 `json:"duration"` + KeepFiringFor float64 `json:"keepFiringFor"` Labels labels.Labels `json:"labels"` Annotations labels.Labels `json:"annotations"` Alerts []*Alert `json:"alerts"` @@ -1303,6 +1306,7 @@ func (api *API) rules(r *http.Request) apiFuncResult { Name: rule.Name(), Query: rule.Query().String(), Duration: rule.HoldDuration().Seconds(), + KeepFiringFor: rule.KeepFiringFor().Seconds(), Labels: rule.Labels(), Annotations: rule.Annotations(), Alerts: rulesAlertsToAPIAlerts(rule.ActiveAlerts()), diff --git a/web/api/v1/api_test.go b/web/api/v1/api_test.go index 7e2dcbd8bb..919fad34b8 100644 --- a/web/api/v1/api_test.go +++ b/web/api/v1/api_test.go @@ -209,6 +209,7 @@ func (m rulesRetrieverMock) AlertingRules() []*rules.AlertingRule { "test_metric3", expr1, time.Second, + 0, labels.Labels{}, labels.Labels{}, labels.Labels{}, @@ -220,6 +221,7 @@ func (m rulesRetrieverMock) AlertingRules() []*rules.AlertingRule { "test_metric4", expr2, time.Second, + 0, labels.Labels{}, labels.Labels{}, labels.Labels{}, diff --git a/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx b/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx index ef45c205f9..1951f0f202 100644 --- a/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx +++ b/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx @@ -43,6 +43,11 @@ const CollapsibleAlertPanel: FC = ({ rule, showAnnot
for: {formatDuration(rule.duration * 1000)}
)} + {rule.keepFiringFor > 0 && ( +
+
keep_firing_for: {formatDuration(rule.keepFiringFor * 1000)}
+
+ )} {rule.labels && Object.keys(rule.labels).length > 0 && (
labels:
diff --git a/web/ui/react-app/src/pages/rules/RulesContent.tsx b/web/ui/react-app/src/pages/rules/RulesContent.tsx index e7adfee39a..ef4a7ad8f8 100644 --- a/web/ui/react-app/src/pages/rules/RulesContent.tsx +++ b/web/ui/react-app/src/pages/rules/RulesContent.tsx @@ -96,6 +96,11 @@ export const RulesContent: FC = ({ response }) => { for: {formatDuration(r.duration * 1000)}
)} + {r.keepFiringFor > 0 && ( +
+ keep_firing_for: {formatDuration(r.keepFiringFor * 1000)} +
+ )} {r.labels && Object.keys(r.labels).length > 0 && (
labels: diff --git a/web/ui/react-app/src/types/types.ts b/web/ui/react-app/src/types/types.ts index 21f52b5fa2..a30439c77e 100644 --- a/web/ui/react-app/src/types/types.ts +++ b/web/ui/react-app/src/types/types.ts @@ -26,6 +26,7 @@ export type Rule = { alerts: Alert[]; annotations: Record; duration: number; + keepFiringFor: number; evaluationTime: string; health: string; labels: Record;
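
For reviewers who want to try the patch, here is a minimal sketch of a rule file that exercises the new field. The group name, alert name, expression, threshold, and durations are illustrative only and are not part of this patch; only the `keep_firing_for` line relies on the behaviour added here.

```yaml
groups:
  - name: example
    rules:
      - alert: HighErrorRate                 # hypothetical alert name
        expr: job:request_errors:rate5m > 0.05
        for: 10m                             # condition must hold before firing
        # Added by this patch: once firing, the alert stays in the firing
        # state for up to 15 minutes after the expression stops returning
        # results, instead of resolving on the first empty evaluation.
        keep_firing_for: 15m
        labels:
          severity: critical
        annotations:
          summary: Request error rate has been above 5% for 10 minutes.
```

With this in place, a briefly flapping expression produces one long-lived alert rather than a burst of firing/resolved notifications through Alertmanager, which is the behaviour the commit message describes.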
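The state handling added to `(*AlertingRule).Eval` is easier to see outside the diff. The following standalone sketch restates only the timing decision; `shouldKeepFiring` and its parameters are hypothetical names introduced for illustration and do not exist in the patch, which instead tracks the same information in `Alert.KeepFiringSince`.

```go
package main

import (
	"fmt"
	"time"
)

// shouldKeepFiring mirrors the check added in Eval: a firing alert whose
// expression no longer returns results stays in the firing state until
// keepFiringFor has elapsed since the first empty evaluation.
// keepFiringSince is the zero time until the first empty evaluation is seen.
func shouldKeepFiring(keepFiringSince *time.Time, now time.Time, keepFiringFor time.Duration) bool {
	if keepFiringFor <= 0 {
		return false
	}
	if keepFiringSince.IsZero() {
		// First evaluation with no result: start the clock.
		*keepFiringSince = now
	}
	return now.Sub(*keepFiringSince) < keepFiringFor
}

func main() {
	var since time.Time
	start := time.Now()
	keepFiringFor := 15 * time.Minute

	// Simulate three evaluations after the expression stops returning results.
	for _, elapsed := range []time.Duration{0, 10 * time.Minute, 20 * time.Minute} {
		fmt.Printf("t+%v keep firing: %v\n",
			elapsed, shouldKeepFiring(&since, start.Add(elapsed), keepFiringFor))
	}
	// Expected output: true, true, false; the alert only resolves once
	// 15 minutes have passed since the first empty evaluation.
}
```

In the patch itself the check only applies to alerts already in the firing state, and `KeepFiringSince` is reset to the zero time as soon as the expression returns results for that alert again.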