From ce55e5074d05f570ddc619619e877e011dc270e2 Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Mon, 9 Jan 2023 12:21:38 +0100 Subject: [PATCH 1/6] Add 'keep_firing_for' field to alerting rules This commit adds a new 'keep_firing_for' field to Prometheus alerting rules. The 'keep_firing_for' field specifies the minimum amount of time that an alert should remain firing, even if the expression no longer returns any results. This feature was discussed at a previous dev summit, where it was agreed that it would be useful to give the expression time to stabilize and to prevent confusing resolved messages from being propagated through Alertmanager. This approach is simpler than having two PromQL queries, as was sometimes discussed, and it should be easy to implement. This commit intentionally does not include tests for the 'keep_firing_for' field: the goal is to gather comments on the proposed design first. Once the design has been finalized, a follow-up commit will add tests. See https://github.com/prometheus/prometheus/issues/11570 Signed-off-by: Julien Pivotto --- docs/configuration/recording_rules.md | 4 ++ model/rulefmt/rulefmt.go | 32 +++++++++------ rules/alerting.go | 40 +++++++++++++++---- rules/alerting_test.go | 15 ++++++- rules/manager.go | 1 + rules/manager_test.go | 18 +++++---- web/api/v1/api.go | 24 ++++++----- web/api/v1/api_test.go | 2 + .../pages/alerts/CollapsibleAlertPanel.tsx | 5 +++ .../src/pages/rules/RulesContent.tsx | 5 +++ web/ui/react-app/src/types/types.ts | 1 + 11 files changed, 108 insertions(+), 39 deletions(-) diff --git a/docs/configuration/recording_rules.md b/docs/configuration/recording_rules.md index d70ffa0cbb..eda0214b35 100644 --- a/docs/configuration/recording_rules.md +++ b/docs/configuration/recording_rules.md @@ -123,6 +123,10 @@ expr: <string> # Alerts which have not yet fired for long enough are considered pending. [ for: <duration> | default = 0s ] +# How long an alert will continue firing after the condition that triggered it +# has cleared. +[ keep_firing_for: <duration> | default = 0s ] + # Labels to add or overwrite for each alert. labels: [ <labelname>: <tmpl_string> ] diff --git a/model/rulefmt/rulefmt.go b/model/rulefmt/rulefmt.go index f1d5f39257..30b3face0d 100644 --- a/model/rulefmt/rulefmt.go +++ b/model/rulefmt/rulefmt.go @@ -143,22 +143,24 @@ type RuleGroup struct { // Rule describes an alerting or recording rule. type Rule struct { - Record string `yaml:"record,omitempty"` - Alert string `yaml:"alert,omitempty"` - Expr string `yaml:"expr"` - For model.Duration `yaml:"for,omitempty"` - Labels map[string]string `yaml:"labels,omitempty"` - Annotations map[string]string `yaml:"annotations,omitempty"` + Record string `yaml:"record,omitempty"` + Alert string `yaml:"alert,omitempty"` + Expr string `yaml:"expr"` + For model.Duration `yaml:"for,omitempty"` + KeepFiringFor model.Duration `yaml:"keep_firing_for,omitempty"` + Labels map[string]string `yaml:"labels,omitempty"` + Annotations map[string]string `yaml:"annotations,omitempty"` } // RuleNode adds yaml.v3 layer to support line and column outputs for invalid rules.
type RuleNode struct { - Record yaml.Node `yaml:"record,omitempty"` - Alert yaml.Node `yaml:"alert,omitempty"` - Expr yaml.Node `yaml:"expr"` - For model.Duration `yaml:"for,omitempty"` - Labels map[string]string `yaml:"labels,omitempty"` - Annotations map[string]string `yaml:"annotations,omitempty"` + Record yaml.Node `yaml:"record,omitempty"` + Alert yaml.Node `yaml:"alert,omitempty"` + Expr yaml.Node `yaml:"expr"` + For model.Duration `yaml:"for,omitempty"` + KeepFiringFor model.Duration `yaml:"keep_firing_for,omitempty"` + Labels map[string]string `yaml:"labels,omitempty"` + Annotations map[string]string `yaml:"annotations,omitempty"` } // Validate the rule and return a list of encountered errors. @@ -208,6 +210,12 @@ func (r *RuleNode) Validate() (nodes []WrappedError) { node: &r.Record, }) } + if r.KeepFiringFor != 0 { + nodes = append(nodes, WrappedError{ + err: fmt.Errorf("invalid field 'keep_firing_for' in recording rule"), + node: &r.Record, + }) + } if !model.IsValidMetricName(model.LabelValue(r.Record.Value)) { nodes = append(nodes, WrappedError{ err: fmt.Errorf("invalid recording rule name: %s", r.Record.Value), diff --git a/rules/alerting.go b/rules/alerting.go index d456662669..9ff3e8fc32 100644 --- a/rules/alerting.go +++ b/rules/alerting.go @@ -83,11 +83,12 @@ type Alert struct { Value float64 // The interval during which the condition of this alert held true. // ResolvedAt will be 0 to indicate a still active alert. - ActiveAt time.Time - FiredAt time.Time - ResolvedAt time.Time - LastSentAt time.Time - ValidUntil time.Time + ActiveAt time.Time + FiredAt time.Time + ResolvedAt time.Time + LastSentAt time.Time + ValidUntil time.Time + KeepFiringSince time.Time } func (a *Alert) needsSending(ts time.Time, resendDelay time.Duration) bool { @@ -112,6 +113,9 @@ type AlertingRule struct { // The duration for which a labelset needs to persist in the expression // output vector before an alert transitions from Pending to Firing state. holdDuration time.Duration + // The amount of time that the alert should remain firing after the + // resolution. + keepFiringFor time.Duration // Extra labels to attach to the resulting alert sample vectors. labels labels.Labels // Non-identifying key/value pairs. @@ -142,7 +146,7 @@ type AlertingRule struct { // NewAlertingRule constructs a new AlertingRule. func NewAlertingRule( - name string, vec parser.Expr, hold time.Duration, + name string, vec parser.Expr, hold, keepFiringFor time.Duration, labels, annotations, externalLabels labels.Labels, externalURL string, restored bool, logger log.Logger, ) *AlertingRule { @@ -152,6 +156,7 @@ func NewAlertingRule( name: name, vector: vec, holdDuration: hold, + keepFiringFor: keepFiringFor, labels: labels, annotations: annotations, externalLabels: el, @@ -201,6 +206,12 @@ func (r *AlertingRule) HoldDuration() time.Duration { return r.holdDuration } +// KeepFiringFor returns the duration an alerting rule should keep firing for +// after resolution. +func (r *AlertingRule) KeepFiringFor() time.Duration { + return r.keepFiringFor +} + // Labels returns the labels of the alerting rule. func (r *AlertingRule) Labels() labels.Labels { return r.labels @@ -404,16 +415,29 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, // Check if any pending alerts should be removed or fire now. Write out alert timeseries. 
for fp, a := range r.active { if _, ok := resultFPs[fp]; !ok { + var keepFiring bool + if a.State == StateFiring && r.keepFiringFor > 0 { + if a.KeepFiringSince.IsZero() { + a.KeepFiringSince = ts + } + if ts.Sub(a.KeepFiringSince) < r.keepFiringFor { + keepFiring = true + } + } // If the alert was previously firing, keep it around for a given // retention time so it is reported as resolved to the AlertManager. if a.State == StatePending || (!a.ResolvedAt.IsZero() && ts.Sub(a.ResolvedAt) > resolvedRetention) { delete(r.active, fp) } - if a.State != StateInactive { + if a.State != StateInactive && !keepFiring { a.State = StateInactive a.ResolvedAt = ts } - continue + if !keepFiring { + continue + } + } else { + a.KeepFiringSince = time.Time{} } numActivePending++ diff --git a/rules/alerting_test.go b/rules/alerting_test.go index 4f5f5e683a..d95610c273 100644 --- a/rules/alerting_test.go +++ b/rules/alerting_test.go @@ -66,7 +66,7 @@ func TestAlertingRuleState(t *testing.T) { } for i, test := range tests { - rule := NewAlertingRule(test.name, nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil) + rule := NewAlertingRule(test.name, nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil) rule.active = test.active got := rule.State() require.Equal(t, test.want, got, "test case %d unexpected AlertState, want:%d got:%d", i, test.want, got) @@ -90,6 +90,7 @@ func TestAlertingRuleLabelsUpdate(t *testing.T) { "HTTPRequestRateLow", expr, time.Minute, + 0, // Basing alerting rule labels off of a value that can change is a very bad idea. // If an alert is going back and forth between two label values it will never fire. // Instead, you should write two alerts with constant labels. @@ -192,6 +193,7 @@ func TestAlertingRuleExternalLabelsInTemplate(t *testing.T) { "ExternalLabelDoesNotExist", expr, time.Minute, + 0, labels.FromStrings("templated_label", "There are {{ len $externalLabels }} external Labels, of which foo is {{ $externalLabels.foo }}."), labels.EmptyLabels(), labels.EmptyLabels(), @@ -202,6 +204,7 @@ func TestAlertingRuleExternalLabelsInTemplate(t *testing.T) { "ExternalLabelExists", expr, time.Minute, + 0, labels.FromStrings("templated_label", "There are {{ len $externalLabels }} external Labels, of which foo is {{ $externalLabels.foo }}."), labels.EmptyLabels(), labels.FromStrings("foo", "bar", "dings", "bums"), @@ -286,6 +289,7 @@ func TestAlertingRuleExternalURLInTemplate(t *testing.T) { "ExternalURLDoesNotExist", expr, time.Minute, + 0, labels.FromStrings("templated_label", "The external URL is {{ $externalURL }}."), labels.EmptyLabels(), labels.EmptyLabels(), @@ -296,6 +300,7 @@ func TestAlertingRuleExternalURLInTemplate(t *testing.T) { "ExternalURLExists", expr, time.Minute, + 0, labels.FromStrings("templated_label", "The external URL is {{ $externalURL }}."), labels.EmptyLabels(), labels.EmptyLabels(), @@ -380,6 +385,7 @@ func TestAlertingRuleEmptyLabelFromTemplate(t *testing.T) { "EmptyLabel", expr, time.Minute, + 0, labels.FromStrings("empty_label", ""), labels.EmptyLabels(), labels.EmptyLabels(), @@ -436,6 +442,7 @@ func TestAlertingRuleQueryInTemplate(t *testing.T) { "ruleWithQueryInTemplate", expr, time.Minute, + 0, labels.FromStrings("label", "value"), labels.FromStrings("templated_label", `{{- with "sort(sum(http_requests) by (instance))" | query -}} {{- range $i,$v := . 
-}} @@ -480,7 +487,7 @@ instance: {{ $v.Labels.instance }}, value: {{ printf "%.0f" $v.Value }}; func BenchmarkAlertingRuleAtomicField(b *testing.B) { b.ReportAllocs() - rule := NewAlertingRule("bench", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil) + rule := NewAlertingRule("bench", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil) done := make(chan struct{}) go func() { for i := 0; i < b.N; i++ { @@ -518,6 +525,7 @@ func TestAlertingRuleDuplicate(t *testing.T) { "foo", expr, time.Minute, + 0, labels.FromStrings("test", "test"), labels.EmptyLabels(), labels.EmptyLabels(), @@ -564,6 +572,7 @@ func TestAlertingRuleLimit(t *testing.T) { "foo", expr, time.Minute, + 0, labels.FromStrings("test", "test"), labels.EmptyLabels(), labels.EmptyLabels(), @@ -636,6 +645,7 @@ func TestQueryForStateSeries(t *testing.T) { "TestRule", nil, time.Minute, + 0, labels.FromStrings("severity", "critical"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil, ) @@ -669,6 +679,7 @@ func TestSendAlertsDontAffectActiveAlerts(t *testing.T) { "TestRule", nil, time.Minute, + 0, labels.FromStrings("severity", "critical"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil, ) diff --git a/rules/manager.go b/rules/manager.go index d1ad8afdc5..6f6ce2cfe4 100644 --- a/rules/manager.go +++ b/rules/manager.go @@ -1119,6 +1119,7 @@ func (m *Manager) LoadGroups( r.Alert.Value, expr, time.Duration(r.For), + time.Duration(r.KeepFiringFor), labels.FromMap(r.Labels), labels.FromMap(r.Annotations), externalLabels, diff --git a/rules/manager_test.go b/rules/manager_test.go index 788aa0af38..6f0dd0ddaa 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -66,6 +66,7 @@ func TestAlertingRule(t *testing.T) { "HTTPRequestRateLow", expr, time.Minute, + 0, labels.FromStrings("severity", "{{\"c\"}}ritical"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil, ) @@ -209,6 +210,7 @@ func TestForStateAddSamples(t *testing.T) { "HTTPRequestRateLow", expr, time.Minute, + 0, labels.FromStrings("severity", "{{\"c\"}}ritical"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil, ) @@ -383,6 +385,7 @@ func TestForStateRestore(t *testing.T) { "HTTPRequestRateLow", expr, alertForDuration, + 0, labels.FromStrings("severity", "critical"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil, ) @@ -449,6 +452,7 @@ func TestForStateRestore(t *testing.T) { "HTTPRequestRateLow", expr, alertForDuration, + 0, labels.FromStrings("severity", "critical"), labels.EmptyLabels(), labels.EmptyLabels(), "", false, nil, ) @@ -615,13 +619,13 @@ func readSeriesSet(ss storage.SeriesSet) (map[string][]promql.Point, error) { func TestCopyState(t *testing.T) { oldGroup := &Group{ rules: []Rule{ - NewAlertingRule("alert", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), + NewAlertingRule("alert", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), NewRecordingRule("rule1", nil, labels.EmptyLabels()), NewRecordingRule("rule2", nil, labels.EmptyLabels()), NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v1")), NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v2")), NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v3")), - NewAlertingRule("alert2", nil, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), + NewAlertingRule("alert2", nil, 0, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), 
labels.EmptyLabels(), "", true, nil), }, seriesInPreviousEval: []map[string]labels.Labels{ {}, @@ -640,10 +644,10 @@ func TestCopyState(t *testing.T) { NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v0")), NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v1")), NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v2")), - NewAlertingRule("alert", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), + NewAlertingRule("alert", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), NewRecordingRule("rule1", nil, labels.EmptyLabels()), - NewAlertingRule("alert2", nil, 0, labels.FromStrings("l2", "v0"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), - NewAlertingRule("alert2", nil, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), + NewAlertingRule("alert2", nil, 0, 0, labels.FromStrings("l2", "v0"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), + NewAlertingRule("alert2", nil, 0, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), NewRecordingRule("rule4", nil, labels.EmptyLabels()), }, seriesInPreviousEval: make([]map[string]labels.Labels, 8), @@ -875,7 +879,7 @@ func TestNotify(t *testing.T) { expr, err := parser.ParseExpr("a > 1") require.NoError(t, err) - rule := NewAlertingRule("aTooHigh", expr, 0, labels.Labels{}, labels.Labels{}, labels.EmptyLabels(), "", true, log.NewNopLogger()) + rule := NewAlertingRule("aTooHigh", expr, 0, 0, labels.Labels{}, labels.Labels{}, labels.EmptyLabels(), "", true, log.NewNopLogger()) group := NewGroup(GroupOptions{ Name: "alert", Interval: time.Second, @@ -1147,7 +1151,7 @@ func TestGroupHasAlertingRules(t *testing.T) { group: &Group{ name: "HasAlertingRule", rules: []Rule{ - NewAlertingRule("alert", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), + NewAlertingRule("alert", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil), NewRecordingRule("record", nil, labels.EmptyLabels()), }, }, diff --git a/web/api/v1/api.go b/web/api/v1/api.go index 894a8666a6..29532bceb3 100644 --- a/web/api/v1/api.go +++ b/web/api/v1/api.go @@ -1111,11 +1111,12 @@ type AlertDiscovery struct { // Alert has info for an alert. 
type Alert struct { - Labels labels.Labels `json:"labels"` - Annotations labels.Labels `json:"annotations"` - State string `json:"state"` - ActiveAt *time.Time `json:"activeAt,omitempty"` - Value string `json:"value"` + Labels labels.Labels `json:"labels"` + Annotations labels.Labels `json:"annotations"` + State string `json:"state"` + ActiveAt *time.Time `json:"activeAt,omitempty"` + KeepFiringSince *time.Time `json:"keep_firing_since,omitempty"` + Value string `json:"value"` } func (api *API) alerts(r *http.Request) apiFuncResult { @@ -1138,11 +1139,12 @@ func rulesAlertsToAPIAlerts(rulesAlerts []*rules.Alert) []*Alert { apiAlerts := make([]*Alert, len(rulesAlerts)) for i, ruleAlert := range rulesAlerts { apiAlerts[i] = &Alert{ - Labels: ruleAlert.Labels, - Annotations: ruleAlert.Annotations, - State: ruleAlert.State.String(), - ActiveAt: &ruleAlert.ActiveAt, - Value: strconv.FormatFloat(ruleAlert.Value, 'e', -1, 64), + Labels: ruleAlert.Labels, + Annotations: ruleAlert.Annotations, + State: ruleAlert.State.String(), + ActiveAt: &ruleAlert.ActiveAt, + KeepFiringSince: &ruleAlert.KeepFiringSince, + Value: strconv.FormatFloat(ruleAlert.Value, 'e', -1, 64), } } @@ -1241,6 +1243,7 @@ type AlertingRule struct { Name string `json:"name"` Query string `json:"query"` Duration float64 `json:"duration"` + KeepFiringFor float64 `json:"keepFiringFor"` Labels labels.Labels `json:"labels"` Annotations labels.Labels `json:"annotations"` Alerts []*Alert `json:"alerts"` @@ -1303,6 +1306,7 @@ func (api *API) rules(r *http.Request) apiFuncResult { Name: rule.Name(), Query: rule.Query().String(), Duration: rule.HoldDuration().Seconds(), + KeepFiringFor: rule.KeepFiringFor().Seconds(), Labels: rule.Labels(), Annotations: rule.Annotations(), Alerts: rulesAlertsToAPIAlerts(rule.ActiveAlerts()), diff --git a/web/api/v1/api_test.go b/web/api/v1/api_test.go index 7e2dcbd8bb..919fad34b8 100644 --- a/web/api/v1/api_test.go +++ b/web/api/v1/api_test.go @@ -209,6 +209,7 @@ func (m rulesRetrieverMock) AlertingRules() []*rules.AlertingRule { "test_metric3", expr1, time.Second, + 0, labels.Labels{}, labels.Labels{}, labels.Labels{}, @@ -220,6 +221,7 @@ func (m rulesRetrieverMock) AlertingRules() []*rules.AlertingRule { "test_metric4", expr2, time.Second, + 0, labels.Labels{}, labels.Labels{}, labels.Labels{}, diff --git a/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx b/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx index ef45c205f9..1951f0f202 100644 --- a/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx +++ b/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx @@ -43,6 +43,11 @@ const CollapsibleAlertPanel: FC = ({ rule, showAnnot
for: {formatDuration(rule.duration * 1000)}
)} + {rule.keepFiringFor > 0 && ( +
+
keep_firing_for: {formatDuration(rule.keepFiringFor * 1000)}
+
+ )} {rule.labels && Object.keys(rule.labels).length > 0 && (
labels:
diff --git a/web/ui/react-app/src/pages/rules/RulesContent.tsx b/web/ui/react-app/src/pages/rules/RulesContent.tsx index e7adfee39a..ef4a7ad8f8 100644 --- a/web/ui/react-app/src/pages/rules/RulesContent.tsx +++ b/web/ui/react-app/src/pages/rules/RulesContent.tsx @@ -96,6 +96,11 @@ export const RulesContent: FC = ({ response }) => { for: {formatDuration(r.duration * 1000)}
)} + {r.keepFiringFor > 0 && ( +
+ keep_firing_for: {formatDuration(r.keepFiringFor * 1000)} +
+ )} {r.labels && Object.keys(r.labels).length > 0 && (
labels: diff --git a/web/ui/react-app/src/types/types.ts b/web/ui/react-app/src/types/types.ts index 21f52b5fa2..a30439c77e 100644 --- a/web/ui/react-app/src/types/types.ts +++ b/web/ui/react-app/src/types/types.ts @@ -26,6 +26,7 @@ export type Rule = { alerts: Alert[]; annotations: Record; duration: number; + keepFiringFor: number; evaluationTime: string; health: string; labels: Record; From 5ad74e6e718e7c4809205c7b7eb499f8af65b07d Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Thu, 19 Jan 2023 10:36:01 +0100 Subject: [PATCH 2/6] Add tests Signed-off-by: Julien Pivotto --- rules/alerting_test.go | 119 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 118 insertions(+), 1 deletion(-) diff --git a/rules/alerting_test.go b/rules/alerting_test.go index d95610c273..3f623169ee 100644 --- a/rules/alerting_test.go +++ b/rules/alerting_test.go @@ -76,7 +76,7 @@ func TestAlertingRuleState(t *testing.T) { func TestAlertingRuleLabelsUpdate(t *testing.T) { suite, err := promql.NewTest(t, ` load 1m - http_requests{job="app-server", instance="0"} 75 85 70 70 + http_requests{job="app-server", instance="0"} 75 85 70 70 stale `) require.NoError(t, err) defer suite.Close() @@ -174,6 +174,10 @@ func TestAlertingRuleLabelsUpdate(t *testing.T) { require.Equal(t, result, filteredRes) } + evalTime := baseTime.Add(time.Duration(len(results)) * time.Minute) + res, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0) + require.NoError(t, err) + require.Equal(t, 0, len(res)) } func TestAlertingRuleExternalLabelsInTemplate(t *testing.T) { @@ -723,3 +727,116 @@ func TestSendAlertsDontAffectActiveAlerts(t *testing.T) { // But the labels with the AlertingRule should not be changed. require.Equal(t, labels.FromStrings("a1", "1"), rule.active[h].Labels) } + +func TestKeepFiringFor(t *testing.T) { + suite, err := promql.NewTest(t, ` + load 1m + http_requests{job="app-server", instance="0"} 75 85 70 70 10x5 + `) + require.NoError(t, err) + defer suite.Close() + + require.NoError(t, suite.Run()) + + expr, err := parser.ParseExpr(`http_requests > 50`) + require.NoError(t, err) + + rule := NewAlertingRule( + "HTTPRequestRateHigh", + expr, + time.Minute, + time.Minute, + labels.EmptyLabels(), + labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil, + ) + + results := []promql.Vector{ + { + promql.Sample{ + Metric: labels.FromStrings( + "__name__", "ALERTS", + "alertname", "HTTPRequestRateHigh", + "alertstate", "pending", + "instance", "0", + "job", "app-server", + ), + Point: promql.Point{V: 1}, + }, + }, + { + promql.Sample{ + Metric: labels.FromStrings( + "__name__", "ALERTS", + "alertname", "HTTPRequestRateHigh", + "alertstate", "firing", + "instance", "0", + "job", "app-server", + ), + Point: promql.Point{V: 1}, + }, + }, + { + promql.Sample{ + Metric: labels.FromStrings( + "__name__", "ALERTS", + "alertname", "HTTPRequestRateHigh", + "alertstate", "firing", + "instance", "0", + "job", "app-server", + ), + Point: promql.Point{V: 1}, + }, + }, + { + promql.Sample{ + Metric: labels.FromStrings( + "__name__", "ALERTS", + "alertname", "HTTPRequestRateHigh", + "alertstate", "firing", + "instance", "0", + "job", "app-server", + ), + Point: promql.Point{V: 1}, + }, + }, + // From now on the alert should keep firing. 
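// Note: in the series loaded above, http_requests drops to 10 at t=4m, so the
// expression stops matching from this point on. Because keep_firing_for is one
// minute, this evaluation (t=4m) still expects an ALERTS sample with
// alertstate="firing"; the extra evaluation after the loop (t=5m) then expects
// the alert to have resolved, i.e. no samples to be returned.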
+ { + promql.Sample{ + Metric: labels.FromStrings( + "__name__", "ALERTS", + "alertname", "HTTPRequestRateHigh", + "alertstate", "firing", + "instance", "0", + "job", "app-server", + ), + Point: promql.Point{V: 1}, + }, + }, + } + + baseTime := time.Unix(0, 0) + for i, result := range results { + t.Logf("case %d", i) + evalTime := baseTime.Add(time.Duration(i) * time.Minute) + result[0].Point.T = timestamp.FromTime(evalTime) + res, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0) + require.NoError(t, err) + + var filteredRes promql.Vector // After removing 'ALERTS_FOR_STATE' samples. + for _, smpl := range res { + smplName := smpl.Metric.Get("__name__") + if smplName == "ALERTS" { + filteredRes = append(filteredRes, smpl) + } else { + // If not 'ALERTS', it has to be 'ALERTS_FOR_STATE'. + require.Equal(t, "ALERTS_FOR_STATE", smplName) + } + } + + require.Equal(t, result, filteredRes) + } + evalTime := baseTime.Add(time.Duration(len(results)) * time.Minute) + res, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0) + require.NoError(t, err) + require.Equal(t, 0, len(res)) +} From 8e500dbd3982261df2b9ced3effd239d8ffdc38c Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Thu, 19 Jan 2023 10:44:35 +0100 Subject: [PATCH 3/6] Add rulefmt tests Signed-off-by: Julien Pivotto --- model/rulefmt/rulefmt_test.go | 11 ++++++++++- model/rulefmt/testdata/record_and_for.bad.yaml | 6 ++++++ .../testdata/record_and_keep_firing_for.bad.yaml | 6 ++++++ 3 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 model/rulefmt/testdata/record_and_for.bad.yaml create mode 100644 model/rulefmt/testdata/record_and_keep_firing_for.bad.yaml diff --git a/model/rulefmt/rulefmt_test.go b/model/rulefmt/rulefmt_test.go index d79ee94e29..d6499538e2 100644 --- a/model/rulefmt/rulefmt_test.go +++ b/model/rulefmt/rulefmt_test.go @@ -73,12 +73,21 @@ func TestParseFileFailure(t *testing.T) { filename: "invalid_label_name.bad.yaml", errMsg: "invalid label name", }, + { + filename: "record_and_for.bad.yaml", + errMsg: "invalid field 'for' in recording rule", + }, + { + filename: "record_and_keep_firing_for.bad.yaml", + errMsg: "invalid field 'keep_firing_for' in recording rule", + }, } for _, c := range table { _, errs := ParseFile(filepath.Join("testdata", c.filename)) require.NotNil(t, errs, "Expected error parsing %s but got none", c.filename) - require.Error(t, errs[0], c.errMsg, "Expected error for %s.", c.filename) + require.Error(t, errs[0]) + require.Containsf(t, errs[0].Error(), c.errMsg, "Expected error for %s.", c.filename) } } diff --git a/model/rulefmt/testdata/record_and_for.bad.yaml b/model/rulefmt/testdata/record_and_for.bad.yaml new file mode 100644 index 0000000000..a15d428a08 --- /dev/null +++ b/model/rulefmt/testdata/record_and_for.bad.yaml @@ -0,0 +1,6 @@ +groups: + - name: yolo + rules: + - record: Hello + expr: 1 + for: 1m diff --git a/model/rulefmt/testdata/record_and_keep_firing_for.bad.yaml b/model/rulefmt/testdata/record_and_keep_firing_for.bad.yaml new file mode 100644 index 0000000000..eb8192f05c --- /dev/null +++ b/model/rulefmt/testdata/record_and_keep_firing_for.bad.yaml @@ -0,0 +1,6 @@ +groups: + - name: yolo + rules: + - record: Hello + expr: 1 + keep_firing_for: 1m From 2c408289f8358e83301ebaf9bface058f1a6a9b4 Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Thu, 19 Jan 2023 11:33:54 +0100 Subject: [PATCH 4/6] Add stabilizing to UI Signed-off-by: Julien Pivotto --- 
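For reference, an alert sitting inside its keep-firing window would surface through the v1 API roughly as sketched below. This is an illustrative sketch only: the field names match the JSON tags in the API after this series, but the values are invented, and it is rendered as YAML for readability even though the API itself serves JSON.

    state: firing
    activeAt: "2023-01-19T10:30:00Z"
    keepFiringSince: "2023-01-19T10:40:00Z"
    value: "1e+01"
    labels:
      alertname: HTTPRequestRateHigh
      instance: "0"
      job: app-server
    annotations: {}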
rules/alerting.go | 11 ++++++----- web/api/v1/api.go | 16 +++++++++------- .../react-app/src/pages/alerts/AlertContents.tsx | 1 + .../src/pages/alerts/CollapsibleAlertPanel.tsx | 7 ++++++- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/rules/alerting.go b/rules/alerting.go index 9ff3e8fc32..ac89049e74 100644 --- a/rules/alerting.go +++ b/rules/alerting.go @@ -536,11 +536,12 @@ func (r *AlertingRule) sendAlerts(ctx context.Context, ts time.Time, resendDelay func (r *AlertingRule) String() string { ar := rulefmt.Rule{ - Alert: r.name, - Expr: r.vector.String(), - For: model.Duration(r.holdDuration), - Labels: r.labels.Map(), - Annotations: r.annotations.Map(), + Alert: r.name, + Expr: r.vector.String(), + For: model.Duration(r.holdDuration), + KeepFiringFor: model.Duration(r.keepFiringFor), + Labels: r.labels.Map(), + Annotations: r.annotations.Map(), } byt, err := yaml.Marshal(ar) diff --git a/web/api/v1/api.go b/web/api/v1/api.go index 29532bceb3..f560693d40 100644 --- a/web/api/v1/api.go +++ b/web/api/v1/api.go @@ -1115,7 +1115,7 @@ type Alert struct { Annotations labels.Labels `json:"annotations"` State string `json:"state"` ActiveAt *time.Time `json:"activeAt,omitempty"` - KeepFiringSince *time.Time `json:"keep_firing_since,omitempty"` + KeepFiringSince *time.Time `json:"keepFiringSince,omitempty"` Value string `json:"value"` } @@ -1139,12 +1139,14 @@ func rulesAlertsToAPIAlerts(rulesAlerts []*rules.Alert) []*Alert { apiAlerts := make([]*Alert, len(rulesAlerts)) for i, ruleAlert := range rulesAlerts { apiAlerts[i] = &Alert{ - Labels: ruleAlert.Labels, - Annotations: ruleAlert.Annotations, - State: ruleAlert.State.String(), - ActiveAt: &ruleAlert.ActiveAt, - KeepFiringSince: &ruleAlert.KeepFiringSince, - Value: strconv.FormatFloat(ruleAlert.Value, 'e', -1, 64), + Labels: ruleAlert.Labels, + Annotations: ruleAlert.Annotations, + State: ruleAlert.State.String(), + ActiveAt: &ruleAlert.ActiveAt, + Value: strconv.FormatFloat(ruleAlert.Value, 'e', -1, 64), + } + if !ruleAlert.KeepFiringSince.IsZero() { + apiAlerts[i].KeepFiringSince = &ruleAlert.KeepFiringSince } } diff --git a/web/ui/react-app/src/pages/alerts/AlertContents.tsx b/web/ui/react-app/src/pages/alerts/AlertContents.tsx index 2837399f7c..a619f69fc6 100644 --- a/web/ui/react-app/src/pages/alerts/AlertContents.tsx +++ b/web/ui/react-app/src/pages/alerts/AlertContents.tsx @@ -29,6 +29,7 @@ export interface Alert { value: string; annotations: Record; activeAt: string; + keepFiringSince: string; } interface RuleGroup { diff --git a/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx b/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx index 1951f0f202..676046efd5 100644 --- a/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx +++ b/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx @@ -96,9 +96,14 @@ const CollapsibleAlertPanel: FC = ({ rule, showAnnot
- + {alert.state} + {alert.keepFiringSince && ( + + Stabilizing + + )}
{alert.activeAt} From c0724f4e622ba8e15c34718c7020a2a3f256fc90 Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Thu, 19 Jan 2023 11:53:42 +0100 Subject: [PATCH 5/6] New test Signed-off-by: Julien Pivotto --- rules/alerting_test.go | 55 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/rules/alerting_test.go b/rules/alerting_test.go index 3f623169ee..036bfae1cf 100644 --- a/rules/alerting_test.go +++ b/rules/alerting_test.go @@ -840,3 +840,58 @@ func TestKeepFiringFor(t *testing.T) { require.NoError(t, err) require.Equal(t, 0, len(res)) } + +func TestPendingAndKeepFiringFor(t *testing.T) { + suite, err := promql.NewTest(t, ` + load 1m + http_requests{job="app-server", instance="0"} 75 10x10 + `) + require.NoError(t, err) + defer suite.Close() + + require.NoError(t, suite.Run()) + + expr, err := parser.ParseExpr(`http_requests > 50`) + require.NoError(t, err) + + rule := NewAlertingRule( + "HTTPRequestRateHigh", + expr, + time.Minute, + time.Minute, + labels.EmptyLabels(), + labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil, + ) + + result := promql.Sample{ + Metric: labels.FromStrings( + "__name__", "ALERTS", + "alertname", "HTTPRequestRateHigh", + "alertstate", "pending", + "instance", "0", + "job", "app-server", + ), + Point: promql.Point{V: 1}, + } + + baseTime := time.Unix(0, 0) + result.Point.T = timestamp.FromTime(baseTime) + res, err := rule.Eval(suite.Context(), baseTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0) + require.NoError(t, err) + + require.Len(t, res, 2) + for _, smpl := range res { + smplName := smpl.Metric.Get("__name__") + if smplName == "ALERTS" { + require.Equal(t, result, smpl) + } else { + // If not 'ALERTS', it has to be 'ALERTS_FOR_STATE'. + require.Equal(t, "ALERTS_FOR_STATE", smplName) + } + } + + evalTime := baseTime.Add(time.Minute) + res, err = rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0) + require.NoError(t, err) + require.Equal(t, 0, len(res)) +} From e811d1496301538aca02fc7d49a23923c9a938e6 Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Mon, 23 Jan 2023 13:59:43 +0100 Subject: [PATCH 6/6] Add comments Signed-off-by: Julien Pivotto --- rules/alerting.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/rules/alerting.go b/rules/alerting.go index ac89049e74..a7c94f7a7d 100644 --- a/rules/alerting.go +++ b/rules/alerting.go @@ -415,6 +415,11 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, // Check if any pending alerts should be removed or fire now. Write out alert timeseries. for fp, a := range r.active { if _, ok := resultFPs[fp]; !ok { + // There is no firing alerts for this fingerprint. The alert is no + // longer firing. + + // Use keepFiringFor value to determine if the alert should keep + // firing. var keepFiring bool if a.State == StateFiring && r.keepFiringFor > 0 { if a.KeepFiringSince.IsZero() { @@ -424,6 +429,7 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, keepFiring = true } } + // If the alert was previously firing, keep it around for a given // retention time so it is reported as resolved to the AlertManager. if a.State == StatePending || (!a.ResolvedAt.IsZero() && ts.Sub(a.ResolvedAt) > resolvedRetention) { @@ -437,6 +443,7 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, continue } } else { + // The alert is firing, reset keepFiringSince. a.KeepFiringSince = time.Time{} } numActivePending++
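A minimal example of how the new field is used in a rule file, following the schema documented in recording_rules.md above. The group name, alert name, expression, and durations are invented for illustration; only the keep_firing_for key itself comes from this series.

    groups:
      - name: example
        rules:
          - alert: HTTPRequestRateHigh
            expr: http_requests > 50
            for: 1m
            # Keep reporting the alert as firing for five more minutes after the
            # expression stops returning results, so that a brief dip does not
            # send a resolved notification through Alertmanager.
            keep_firing_for: 5m
            labels:
              severity: critical

With this configuration, a firing alert does not resolve on the first empty evaluation; it resolves only once the expression has returned no results for the whole keep_firing_for window (tracked via the new KeepFiringSince timestamp), and while it is in that window the UI marks it as "Stabilizing".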