diff --git a/docs/configuration/recording_rules.md b/docs/configuration/recording_rules.md
index d70ffa0cbb..eda0214b35 100644
--- a/docs/configuration/recording_rules.md
+++ b/docs/configuration/recording_rules.md
@@ -123,6 +123,10 @@ expr: <string>
 # Alerts which have not yet fired for long enough are considered pending.
 [ for: <duration> | default = 0s ]
 
+# How long an alert will continue firing after the condition that triggered it
+# has cleared.
+[ keep_firing_for: <duration> | default = 0s ]
+
 # Labels to add or overwrite for each alert.
 labels:
   [ <labelname>: <tmpl_string> ]
diff --git a/model/rulefmt/rulefmt.go b/model/rulefmt/rulefmt.go
index f1d5f39257..30b3face0d 100644
--- a/model/rulefmt/rulefmt.go
+++ b/model/rulefmt/rulefmt.go
@@ -143,22 +143,24 @@ type RuleGroup struct {
 
 // Rule describes an alerting or recording rule.
 type Rule struct {
-    Record      string            `yaml:"record,omitempty"`
-    Alert       string            `yaml:"alert,omitempty"`
-    Expr        string            `yaml:"expr"`
-    For         model.Duration    `yaml:"for,omitempty"`
-    Labels      map[string]string `yaml:"labels,omitempty"`
-    Annotations map[string]string `yaml:"annotations,omitempty"`
+    Record        string            `yaml:"record,omitempty"`
+    Alert         string            `yaml:"alert,omitempty"`
+    Expr          string            `yaml:"expr"`
+    For           model.Duration    `yaml:"for,omitempty"`
+    KeepFiringFor model.Duration    `yaml:"keep_firing_for,omitempty"`
+    Labels        map[string]string `yaml:"labels,omitempty"`
+    Annotations   map[string]string `yaml:"annotations,omitempty"`
 }
 
 // RuleNode adds yaml.v3 layer to support line and column outputs for invalid rules.
 type RuleNode struct {
-    Record      yaml.Node         `yaml:"record,omitempty"`
-    Alert       yaml.Node         `yaml:"alert,omitempty"`
-    Expr        yaml.Node         `yaml:"expr"`
-    For         model.Duration    `yaml:"for,omitempty"`
-    Labels      map[string]string `yaml:"labels,omitempty"`
-    Annotations map[string]string `yaml:"annotations,omitempty"`
+    Record        yaml.Node         `yaml:"record,omitempty"`
+    Alert         yaml.Node         `yaml:"alert,omitempty"`
+    Expr          yaml.Node         `yaml:"expr"`
+    For           model.Duration    `yaml:"for,omitempty"`
+    KeepFiringFor model.Duration    `yaml:"keep_firing_for,omitempty"`
+    Labels        map[string]string `yaml:"labels,omitempty"`
+    Annotations   map[string]string `yaml:"annotations,omitempty"`
 }
 
 // Validate the rule and return a list of encountered errors.
@@ -208,6 +210,12 @@ func (r *RuleNode) Validate() (nodes []WrappedError) {
             node: &r.Record,
         })
     }
+    if r.KeepFiringFor != 0 {
+        nodes = append(nodes, WrappedError{
+            err:  fmt.Errorf("invalid field 'keep_firing_for' in recording rule"),
+            node: &r.Record,
+        })
+    }
     if !model.IsValidMetricName(model.LabelValue(r.Record.Value)) {
         nodes = append(nodes, WrappedError{
             err: fmt.Errorf("invalid recording rule name: %s", r.Record.Value),
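For reference, this is how the new field sits in a rule file. The example below is illustrative only — the group, alert name, expression, and durations are invented — but the key name and placement follow the documentation hunk and the `yaml:"keep_firing_for,omitempty"` struct tag above:

```yaml
groups:
  - name: example                    # hypothetical group
    rules:
      - alert: HighRequestLatency    # hypothetical alert
        expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5
        # Stay pending for 10m before the alert starts firing ...
        for: 10m
        # ... then keep firing for 5m after the expression stops returning results.
        keep_firing_for: 5m
        labels:
          severity: page
```

Per the `Validate` change above, the field is only accepted on alerting rules; a recording rule (one that sets `record:` instead of `alert:`) with `keep_firing_for` is reported as an error.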
diff --git a/rules/alerting.go b/rules/alerting.go
index d456662669..9ff3e8fc32 100644
--- a/rules/alerting.go
+++ b/rules/alerting.go
@@ -83,11 +83,12 @@ type Alert struct {
     Value float64
     // The interval during which the condition of this alert held true.
     // ResolvedAt will be 0 to indicate a still active alert.
-    ActiveAt   time.Time
-    FiredAt    time.Time
-    ResolvedAt time.Time
-    LastSentAt time.Time
-    ValidUntil time.Time
+    ActiveAt        time.Time
+    FiredAt         time.Time
+    ResolvedAt      time.Time
+    LastSentAt      time.Time
+    ValidUntil      time.Time
+    KeepFiringSince time.Time
 }
 
 func (a *Alert) needsSending(ts time.Time, resendDelay time.Duration) bool {
@@ -112,6 +113,9 @@ type AlertingRule struct {
     // The duration for which a labelset needs to persist in the expression
     // output vector before an alert transitions from Pending to Firing state.
     holdDuration time.Duration
+    // The amount of time that the alert should remain firing after the
+    // resolution.
+    keepFiringFor time.Duration
     // Extra labels to attach to the resulting alert sample vectors.
     labels labels.Labels
     // Non-identifying key/value pairs.
@@ -142,7 +146,7 @@
 
 // NewAlertingRule constructs a new AlertingRule.
 func NewAlertingRule(
-    name string, vec parser.Expr, hold time.Duration,
+    name string, vec parser.Expr, hold, keepFiringFor time.Duration,
     labels, annotations, externalLabels labels.Labels, externalURL string,
     restored bool, logger log.Logger,
 ) *AlertingRule {
@@ -152,6 +156,7 @@ func NewAlertingRule(
         name:           name,
         vector:         vec,
         holdDuration:   hold,
+        keepFiringFor:  keepFiringFor,
         labels:         labels,
         annotations:    annotations,
         externalLabels: el,
@@ -201,6 +206,12 @@ func (r *AlertingRule) HoldDuration() time.Duration {
     return r.holdDuration
 }
 
+// KeepFiringFor returns the duration an alerting rule should keep firing for
+// after resolution.
+func (r *AlertingRule) KeepFiringFor() time.Duration {
+    return r.keepFiringFor
+}
+
 // Labels returns the labels of the alerting rule.
 func (r *AlertingRule) Labels() labels.Labels {
     return r.labels
@@ -404,16 +415,29 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
     // Check if any pending alerts should be removed or fire now. Write out alert timeseries.
     for fp, a := range r.active {
         if _, ok := resultFPs[fp]; !ok {
+            var keepFiring bool
+            if a.State == StateFiring && r.keepFiringFor > 0 {
+                if a.KeepFiringSince.IsZero() {
+                    a.KeepFiringSince = ts
+                }
+                if ts.Sub(a.KeepFiringSince) < r.keepFiringFor {
+                    keepFiring = true
+                }
+            }
             // If the alert was previously firing, keep it around for a given
             // retention time so it is reported as resolved to the AlertManager.
             if a.State == StatePending || (!a.ResolvedAt.IsZero() && ts.Sub(a.ResolvedAt) > resolvedRetention) {
                 delete(r.active, fp)
             }
-            if a.State != StateInactive {
+            if a.State != StateInactive && !keepFiring {
                 a.State = StateInactive
                 a.ResolvedAt = ts
             }
-            continue
+            if !keepFiring {
+                continue
+            }
+        } else {
+            a.KeepFiringSince = time.Time{}
         }
 
         numActivePending++
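To make the new branch in `Eval` easier to follow, here is a small self-contained sketch of the decision it encodes. This is not the patch's code: the real evaluator keys alerts by label fingerprint and also handles pending alerts, resolved-alert retention, and series output; the `alert` type and `step` function below are invented purely for illustration.

```go
package main

import (
	"fmt"
	"time"
)

type state int

const (
	inactive state = iota
	pending
	firing
)

func (s state) String() string {
	return [...]string{"inactive", "pending", "firing"}[s]
}

// alert is a stripped-down stand-in for rules.Alert.
type alert struct {
	state           state
	keepFiringSince time.Time // zero while the condition still holds
}

// step sketches what happens to an alert at evaluation time ts when the rule
// has a keep_firing_for duration configured.
func step(a *alert, conditionMet bool, ts time.Time, keepFiringFor time.Duration) {
	if conditionMet {
		// The expression returned results again: forget any running grace period.
		a.keepFiringSince = time.Time{}
		return
	}
	keepFiring := false
	if a.state == firing && keepFiringFor > 0 {
		if a.keepFiringSince.IsZero() {
			a.keepFiringSince = ts // grace period starts at the first "empty" evaluation
		}
		if ts.Sub(a.keepFiringSince) < keepFiringFor {
			keepFiring = true
		}
	}
	if !keepFiring {
		a.state = inactive // resolved once the grace period has elapsed
	}
}

func main() {
	a := &alert{state: firing}
	start := time.Now()
	// Condition has cleared; evaluate at 0, 1, 4 and 6 minutes with keep_firing_for=5m.
	for _, m := range []int{0, 1, 4, 6} {
		step(a, false, start.Add(time.Duration(m)*time.Minute), 5*time.Minute)
		fmt.Printf("t=%dm state=%s\n", m, a.state)
	}
	// Prints firing at 0m, 1m and 4m, then inactive at 6m.
}
```

The point of the extra `KeepFiringSince` timestamp is that the grace period is measured from the first evaluation at which the condition stopped holding, and it is reset whenever the condition holds again.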
diff --git a/rules/alerting_test.go b/rules/alerting_test.go
index 4f5f5e683a..d95610c273 100644
--- a/rules/alerting_test.go
+++ b/rules/alerting_test.go
@@ -66,7 +66,7 @@ func TestAlertingRuleState(t *testing.T) {
     }
 
     for i, test := range tests {
-        rule := NewAlertingRule(test.name, nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil)
+        rule := NewAlertingRule(test.name, nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil)
         rule.active = test.active
         got := rule.State()
         require.Equal(t, test.want, got, "test case %d unexpected AlertState, want:%d got:%d", i, test.want, got)
@@ -90,6 +90,7 @@ func TestAlertingRuleLabelsUpdate(t *testing.T) {
         "HTTPRequestRateLow",
         expr,
         time.Minute,
+        0,
         // Basing alerting rule labels off of a value that can change is a very bad idea.
         // If an alert is going back and forth between two label values it will never fire.
         // Instead, you should write two alerts with constant labels.
@@ -192,6 +193,7 @@ func TestAlertingRuleExternalLabelsInTemplate(t *testing.T) {
         "ExternalLabelDoesNotExist",
         expr,
         time.Minute,
+        0,
         labels.FromStrings("templated_label", "There are {{ len $externalLabels }} external Labels, of which foo is {{ $externalLabels.foo }}."),
         labels.EmptyLabels(),
         labels.EmptyLabels(),
@@ -202,6 +204,7 @@ func TestAlertingRuleExternalLabelsInTemplate(t *testing.T) {
         "ExternalLabelExists",
         expr,
         time.Minute,
+        0,
         labels.FromStrings("templated_label", "There are {{ len $externalLabels }} external Labels, of which foo is {{ $externalLabels.foo }}."),
         labels.EmptyLabels(),
         labels.FromStrings("foo", "bar", "dings", "bums"),
@@ -286,6 +289,7 @@ func TestAlertingRuleExternalURLInTemplate(t *testing.T) {
         "ExternalURLDoesNotExist",
         expr,
         time.Minute,
+        0,
         labels.FromStrings("templated_label", "The external URL is {{ $externalURL }}."),
         labels.EmptyLabels(),
         labels.EmptyLabels(),
@@ -296,6 +300,7 @@ func TestAlertingRuleExternalURLInTemplate(t *testing.T) {
         "ExternalURLExists",
         expr,
         time.Minute,
+        0,
         labels.FromStrings("templated_label", "The external URL is {{ $externalURL }}."),
         labels.EmptyLabels(),
         labels.EmptyLabels(),
@@ -380,6 +385,7 @@ func TestAlertingRuleEmptyLabelFromTemplate(t *testing.T) {
         "EmptyLabel",
         expr,
         time.Minute,
+        0,
         labels.FromStrings("empty_label", ""),
         labels.EmptyLabels(),
         labels.EmptyLabels(),
@@ -436,6 +442,7 @@ func TestAlertingRuleQueryInTemplate(t *testing.T) {
         "ruleWithQueryInTemplate",
         expr,
         time.Minute,
+        0,
         labels.FromStrings("label", "value"),
         labels.FromStrings("templated_label", `{{- with "sort(sum(http_requests) by (instance))" | query -}}
 {{- range $i,$v := . -}}
@@ -480,7 +487,7 @@ instance: {{ $v.Labels.instance }}, value: {{ printf "%.0f" $v.Value }};
 
 func BenchmarkAlertingRuleAtomicField(b *testing.B) {
     b.ReportAllocs()
-    rule := NewAlertingRule("bench", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil)
+    rule := NewAlertingRule("bench", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil)
     done := make(chan struct{})
     go func() {
         for i := 0; i < b.N; i++ {
@@ -518,6 +525,7 @@ func TestAlertingRuleDuplicate(t *testing.T) {
         "foo",
         expr,
         time.Minute,
+        0,
         labels.FromStrings("test", "test"),
         labels.EmptyLabels(),
         labels.EmptyLabels(),
@@ -564,6 +572,7 @@ func TestAlertingRuleLimit(t *testing.T) {
         "foo",
         expr,
         time.Minute,
+        0,
         labels.FromStrings("test", "test"),
         labels.EmptyLabels(),
         labels.EmptyLabels(),
@@ -636,6 +645,7 @@ func TestQueryForStateSeries(t *testing.T) {
         "TestRule",
         nil,
         time.Minute,
+        0,
         labels.FromStrings("severity", "critical"),
         labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
     )
@@ -669,6 +679,7 @@ func TestSendAlertsDontAffectActiveAlerts(t *testing.T) {
         "TestRule",
         nil,
         time.Minute,
+        0,
         labels.FromStrings("severity", "critical"),
         labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
     )
diff --git a/rules/manager.go b/rules/manager.go
index d1ad8afdc5..6f6ce2cfe4 100644
--- a/rules/manager.go
+++ b/rules/manager.go
@@ -1119,6 +1119,7 @@ func (m *Manager) LoadGroups(
                 r.Alert.Value,
                 expr,
                 time.Duration(r.For),
+                time.Duration(r.KeepFiringFor),
                 labels.FromMap(r.Labels),
                 labels.FromMap(r.Annotations),
                 externalLabels,
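The test updates above are mechanical — a `0` passed for the new parameter — but they show where the value threads through. As a rough illustration of the widened constructor (with this change applied), an external caller could build a rule like this; the expression, names, and durations are made up, and inside Prometheus this wiring is done by `Manager.LoadGroups` from the parsed rule file, as in the manager.go hunk above:

```go
package main

import (
	"fmt"
	"time"

	"github.com/go-kit/log"
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/promql/parser"
	"github.com/prometheus/prometheus/rules"
)

func main() {
	// Hypothetical expression; any vector-valued PromQL expression works here.
	expr, err := parser.ParseExpr(`up == 0`)
	if err != nil {
		panic(err)
	}
	rule := rules.NewAlertingRule(
		"InstanceDown",       // alert name (made up)
		expr,                 // vector expression
		10*time.Minute,       // hold duration ("for"): condition must persist this long
		5*time.Minute,        // keep_firing_for: grace period after the condition clears
		labels.FromStrings("severity", "page"), // labels
		labels.EmptyLabels(), // annotations
		labels.EmptyLabels(), // external labels
		"",                   // external URL
		true,                 // restored
		log.NewNopLogger(),
	)
	fmt.Println(rule.Name(), rule.HoldDuration(), rule.KeepFiringFor())
}
```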
diff --git a/rules/manager_test.go b/rules/manager_test.go
index 788aa0af38..6f0dd0ddaa 100644
--- a/rules/manager_test.go
+++ b/rules/manager_test.go
@@ -66,6 +66,7 @@ func TestAlertingRule(t *testing.T) {
         "HTTPRequestRateLow",
         expr,
         time.Minute,
+        0,
         labels.FromStrings("severity", "{{\"c\"}}ritical"),
         labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
     )
@@ -209,6 +210,7 @@ func TestForStateAddSamples(t *testing.T) {
         "HTTPRequestRateLow",
         expr,
         time.Minute,
+        0,
         labels.FromStrings("severity", "{{\"c\"}}ritical"),
         labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
     )
@@ -383,6 +385,7 @@ func TestForStateRestore(t *testing.T) {
         "HTTPRequestRateLow",
         expr,
         alertForDuration,
+        0,
         labels.FromStrings("severity", "critical"),
         labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
     )
@@ -449,6 +452,7 @@ func TestForStateRestore(t *testing.T) {
         "HTTPRequestRateLow",
         expr,
         alertForDuration,
+        0,
         labels.FromStrings("severity", "critical"),
         labels.EmptyLabels(), labels.EmptyLabels(), "", false, nil,
     )
@@ -615,13 +619,13 @@ func readSeriesSet(ss storage.SeriesSet) (map[string][]promql.Point, error) {
 func TestCopyState(t *testing.T) {
     oldGroup := &Group{
         rules: []Rule{
-            NewAlertingRule("alert", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
+            NewAlertingRule("alert", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
             NewRecordingRule("rule1", nil, labels.EmptyLabels()),
             NewRecordingRule("rule2", nil, labels.EmptyLabels()),
             NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v1")),
             NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v2")),
             NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v3")),
-            NewAlertingRule("alert2", nil, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
+            NewAlertingRule("alert2", nil, 0, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
         },
         seriesInPreviousEval: []map[string]labels.Labels{
             {},
@@ -640,10 +644,10 @@ func TestCopyState(t *testing.T) {
             NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v0")),
             NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v1")),
             NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v2")),
-            NewAlertingRule("alert", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
+            NewAlertingRule("alert", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
             NewRecordingRule("rule1", nil, labels.EmptyLabels()),
-            NewAlertingRule("alert2", nil, 0, labels.FromStrings("l2", "v0"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
-            NewAlertingRule("alert2", nil, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
+            NewAlertingRule("alert2", nil, 0, 0, labels.FromStrings("l2", "v0"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
+            NewAlertingRule("alert2", nil, 0, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
             NewRecordingRule("rule4", nil, labels.EmptyLabels()),
         },
         seriesInPreviousEval: make([]map[string]labels.Labels, 8),
@@ -875,7 +879,7 @@ func TestNotify(t *testing.T) {
 
     expr, err := parser.ParseExpr("a > 1")
     require.NoError(t, err)
-    rule := NewAlertingRule("aTooHigh", expr, 0, labels.Labels{}, labels.Labels{}, labels.EmptyLabels(), "", true, log.NewNopLogger())
+    rule := NewAlertingRule("aTooHigh", expr, 0, 0, labels.Labels{}, labels.Labels{}, labels.EmptyLabels(), "", true, log.NewNopLogger())
     group := NewGroup(GroupOptions{
         Name:     "alert",
         Interval: time.Second,
@@ -1147,7 +1151,7 @@ func TestGroupHasAlertingRules(t *testing.T) {
             group: &Group{
                 name: "HasAlertingRule",
                 rules: []Rule{
-                    NewAlertingRule("alert", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
+                    NewAlertingRule("alert", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
                     NewRecordingRule("record", nil, labels.EmptyLabels()),
                 },
             },
diff --git a/web/api/v1/api.go b/web/api/v1/api.go
index 894a8666a6..29532bceb3 100644
--- a/web/api/v1/api.go
+++ b/web/api/v1/api.go
@@ -1111,11 +1111,12 @@ type AlertDiscovery struct {
 
 // Alert has info for an alert.
 type Alert struct {
-    Labels      labels.Labels `json:"labels"`
-    Annotations labels.Labels `json:"annotations"`
-    State       string        `json:"state"`
-    ActiveAt    *time.Time    `json:"activeAt,omitempty"`
-    Value       string        `json:"value"`
+    Labels          labels.Labels `json:"labels"`
+    Annotations     labels.Labels `json:"annotations"`
+    State           string        `json:"state"`
+    ActiveAt        *time.Time    `json:"activeAt,omitempty"`
+    KeepFiringSince *time.Time    `json:"keep_firing_since,omitempty"`
+    Value           string        `json:"value"`
 }
 
 func (api *API) alerts(r *http.Request) apiFuncResult {
@@ -1138,11 +1139,12 @@ func rulesAlertsToAPIAlerts(rulesAlerts []*rules.Alert) []*Alert {
     apiAlerts := make([]*Alert, len(rulesAlerts))
     for i, ruleAlert := range rulesAlerts {
         apiAlerts[i] = &Alert{
-            Labels:      ruleAlert.Labels,
-            Annotations: ruleAlert.Annotations,
-            State:       ruleAlert.State.String(),
-            ActiveAt:    &ruleAlert.ActiveAt,
-            Value:       strconv.FormatFloat(ruleAlert.Value, 'e', -1, 64),
+            Labels:          ruleAlert.Labels,
+            Annotations:     ruleAlert.Annotations,
+            State:           ruleAlert.State.String(),
+            ActiveAt:        &ruleAlert.ActiveAt,
+            KeepFiringSince: &ruleAlert.KeepFiringSince,
+            Value:           strconv.FormatFloat(ruleAlert.Value, 'e', -1, 64),
         }
     }
 
@@ -1241,6 +1243,7 @@ type AlertingRule struct {
     Name          string        `json:"name"`
     Query         string        `json:"query"`
     Duration      float64       `json:"duration"`
+    KeepFiringFor float64       `json:"keepFiringFor"`
     Labels        labels.Labels `json:"labels"`
     Annotations   labels.Labels `json:"annotations"`
     Alerts        []*Alert      `json:"alerts"`
@@ -1303,6 +1306,7 @@ func (api *API) rules(r *http.Request) apiFuncResult {
             Name:          rule.Name(),
             Query:         rule.Query().String(),
             Duration:      rule.HoldDuration().Seconds(),
+            KeepFiringFor: rule.KeepFiringFor().Seconds(),
             Labels:        rule.Labels(),
             Annotations:   rule.Annotations(),
             Alerts:        rulesAlertsToAPIAlerts(rule.ActiveAlerts()),
diff --git a/web/api/v1/api_test.go b/web/api/v1/api_test.go
index 7e2dcbd8bb..919fad34b8 100644
--- a/web/api/v1/api_test.go
+++ b/web/api/v1/api_test.go
@@ -209,6 +209,7 @@ func (m rulesRetrieverMock) AlertingRules() []*rules.AlertingRule {
         "test_metric3",
         expr1,
         time.Second,
+        0,
         labels.Labels{},
         labels.Labels{},
         labels.Labels{},
@@ -220,6 +221,7 @@ func (m rulesRetrieverMock) AlertingRules() []*rules.AlertingRule {
         "test_metric4",
         expr2,
         time.Second,
+        0,
         labels.Labels{},
         labels.Labels{},
         labels.Labels{},
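Corresponding to the API changes above, an alerting-rule entry returned by `/api/v1/rules` gains a `keepFiringFor` field (in seconds, like `duration`), and each active alert can carry a `keep_firing_since` timestamp. The abridged response below is a hand-written sketch of the shape only — the values are invented and most other fields of the rule object are omitted:

```json
{
  "name": "HighRequestLatency",
  "query": "job:request_latency_seconds:mean5m{job=\"myjob\"} > 0.5",
  "duration": 600,
  "keepFiringFor": 300,
  "labels": { "severity": "page" },
  "annotations": {},
  "alerts": [
    {
      "labels": { "alertname": "HighRequestLatency", "severity": "page" },
      "annotations": {},
      "state": "firing",
      "activeAt": "2023-01-20T11:00:00Z",
      "keep_firing_since": "2023-01-20T11:30:00Z",
      "value": "7.5e-01"
    }
  ],
  "state": "firing",
  "type": "alerting"
}
```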
diff --git a/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx b/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx
index ef45c205f9..1951f0f202 100644
--- a/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx
+++ b/web/ui/react-app/src/pages/alerts/CollapsibleAlertPanel.tsx
@@ -43,6 +43,11 @@ const CollapsibleAlertPanel: FC = ({ rule, showAnnotations }) => {
                 for: {formatDuration(rule.duration * 1000)}
               </div>
             )}
+            {rule.keepFiringFor > 0 && (
+              <div>
+                keep_firing_for: {formatDuration(rule.keepFiringFor * 1000)}
+              </div>
+            )}
             {rule.labels && Object.keys(rule.labels).length > 0 && (
               <div>
                 labels:
diff --git a/web/ui/react-app/src/pages/rules/RulesContent.tsx b/web/ui/react-app/src/pages/rules/RulesContent.tsx
index e7adfee39a..ef4a7ad8f8 100644
--- a/web/ui/react-app/src/pages/rules/RulesContent.tsx
+++ b/web/ui/react-app/src/pages/rules/RulesContent.tsx
@@ -96,6 +96,11 @@ export const RulesContent: FC = ({ response }) => {
                 for: {formatDuration(r.duration * 1000)}
               </div>
             )}
+            {r.keepFiringFor > 0 && (
+              <div>
+                keep_firing_for: {formatDuration(r.keepFiringFor * 1000)}
+              </div>
+            )}
             {r.labels && Object.keys(r.labels).length > 0 && (
               <div>
                 labels:
diff --git a/web/ui/react-app/src/types/types.ts b/web/ui/react-app/src/types/types.ts
index 21f52b5fa2..a30439c77e 100644
--- a/web/ui/react-app/src/types/types.ts
+++ b/web/ui/react-app/src/types/types.ts
@@ -26,6 +26,7 @@ export type Rule = {
   alerts: Alert[];
   annotations: Record<string, string>;
   duration: number;
+  keepFiringFor: number;
   evaluationTime: string;
   health: string;
   labels: Record<string, string>;