Add 'keep_firing_for' field to alerting rules

This commit adds a new 'keep_firing_for' field to Prometheus alerting
rules. The 'keep_firing_for' field specifies the minimum amount of time
that an alert should remain firing, even if the expression does not
return any results.
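
For illustration, a rule using the new field could look like the
following sketch (the rule name, expression, and durations are
placeholders, not part of this change):

    groups:
      - name: example
        rules:
          - alert: HighErrorRate
            expr: job:request_error_rate:ratio > 0.05
            for: 10m
            # Keep reporting this alert as firing for 15 minutes after
            # the expression stops returning results, to avoid flapping
            # between firing and resolved.
            keep_firing_for: 15m
            labels:
              severity: critical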

This feature was discussed at a previous dev summit, where it was
determined that such a feature would be useful: it gives the expression
time to stabilize and prevents confusing resolved messages from being
propagated through Alertmanager.

This approach is simpler than combining two PromQL queries, as has
sometimes been discussed, and it should be easy to implement.

This commit does not include tests for the 'keep_firing_for' field.
This is intentional, as the purpose of this commit is to gather
comments on the proposed design before implementing tests. Once the
design has been finalized, a follow-up commit will be submitted with
tests.

See https://github.com/prometheus/prometheus/issues/11570

Signed-off-by: Julien Pivotto <roidelapluie@o11y.eu>
Author: Julien Pivotto
Date:   2023-01-09 12:21:38 +01:00
Parent: 72f20d949a
Commit: ce55e5074d
11 changed files with 108 additions and 39 deletions

@@ -123,6 +123,10 @@ expr: <string>
# Alerts which have not yet fired for long enough are considered pending.
[ for: <duration> | default = 0s ]
# How long an alert will continue firing after the condition that triggered it
# has cleared.
[ keep_firing_for: <duration> | default = 0s ]
# Labels to add or overwrite for each alert.
labels:
[ <labelname>: <tmpl_string> ]

@@ -147,6 +147,7 @@ type Rule struct {
Alert string `yaml:"alert,omitempty"`
Expr string `yaml:"expr"`
For model.Duration `yaml:"for,omitempty"`
KeepFiringFor model.Duration `yaml:"keep_firing_for,omitempty"`
Labels map[string]string `yaml:"labels,omitempty"`
Annotations map[string]string `yaml:"annotations,omitempty"`
}
@@ -157,6 +158,7 @@ type RuleNode struct {
Alert yaml.Node `yaml:"alert,omitempty"`
Expr yaml.Node `yaml:"expr"`
For model.Duration `yaml:"for,omitempty"`
KeepFiringFor model.Duration `yaml:"keep_firing_for,omitempty"`
Labels map[string]string `yaml:"labels,omitempty"`
Annotations map[string]string `yaml:"annotations,omitempty"`
}
@@ -208,6 +210,12 @@ func (r *RuleNode) Validate() (nodes []WrappedError) {
node: &r.Record,
})
}
if r.KeepFiringFor != 0 {
nodes = append(nodes, WrappedError{
err: fmt.Errorf("invalid field 'keep_firing_for' in recording rule"),
node: &r.Record,
})
}
if !model.IsValidMetricName(model.LabelValue(r.Record.Value)) {
nodes = append(nodes, WrappedError{
err: fmt.Errorf("invalid recording rule name: %s", r.Record.Value),

@@ -88,6 +88,7 @@ type Alert struct {
ResolvedAt time.Time
LastSentAt time.Time
ValidUntil time.Time
KeepFiringSince time.Time
}
func (a *Alert) needsSending(ts time.Time, resendDelay time.Duration) bool {
@@ -112,6 +113,9 @@ type AlertingRule struct {
// The duration for which a labelset needs to persist in the expression
// output vector before an alert transitions from Pending to Firing state.
holdDuration time.Duration
// The amount of time that the alert should remain firing after the
// resolution.
keepFiringFor time.Duration
// Extra labels to attach to the resulting alert sample vectors.
labels labels.Labels
// Non-identifying key/value pairs.
@@ -142,7 +146,7 @@ type AlertingRule struct {
// NewAlertingRule constructs a new AlertingRule.
func NewAlertingRule(
name string, vec parser.Expr, hold time.Duration,
name string, vec parser.Expr, hold, keepFiringFor time.Duration,
labels, annotations, externalLabels labels.Labels, externalURL string,
restored bool, logger log.Logger,
) *AlertingRule {
@@ -152,6 +156,7 @@ func NewAlertingRule(
name: name,
vector: vec,
holdDuration: hold,
keepFiringFor: keepFiringFor,
labels: labels,
annotations: annotations,
externalLabels: el,
@@ -201,6 +206,12 @@ func (r *AlertingRule) HoldDuration() time.Duration {
return r.holdDuration
}
// KeepFiringFor returns the duration an alerting rule should keep firing for
// after resolution.
func (r *AlertingRule) KeepFiringFor() time.Duration {
return r.keepFiringFor
}
// Labels returns the labels of the alerting rule.
func (r *AlertingRule) Labels() labels.Labels {
return r.labels
@@ -404,17 +415,30 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
// Check if any pending alerts should be removed or fire now. Write out alert timeseries.
for fp, a := range r.active {
if _, ok := resultFPs[fp]; !ok {
var keepFiring bool
if a.State == StateFiring && r.keepFiringFor > 0 {
if a.KeepFiringSince.IsZero() {
a.KeepFiringSince = ts
}
if ts.Sub(a.KeepFiringSince) < r.keepFiringFor {
keepFiring = true
}
}
// If the alert was previously firing, keep it around for a given
// retention time so it is reported as resolved to the AlertManager.
if a.State == StatePending || (!a.ResolvedAt.IsZero() && ts.Sub(a.ResolvedAt) > resolvedRetention) {
delete(r.active, fp)
}
if a.State != StateInactive {
if a.State != StateInactive && !keepFiring {
a.State = StateInactive
a.ResolvedAt = ts
}
if !keepFiring {
continue
}
} else {
a.KeepFiringSince = time.Time{}
}
numActivePending++
if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration {
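
The intended behaviour of this logic, sketched as an illustrative
timeline (assuming 'for: 10m' and 'keep_firing_for: 15m' as in the
example rule above; not part of the diff):

    t=0     expression starts returning results -> alert becomes pending
    t=10m   results still present               -> alert starts firing
    t=40m   expression stops returning results  -> alert keeps firing,
                                                   KeepFiringSince is set
    t=55m   keep_firing_for has elapsed         -> alert becomes inactive,
                                                   ResolvedAt is set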

@@ -66,7 +66,7 @@ func TestAlertingRuleState(t *testing.T) {
}
for i, test := range tests {
rule := NewAlertingRule(test.name, nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil)
rule := NewAlertingRule(test.name, nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil)
rule.active = test.active
got := rule.State()
require.Equal(t, test.want, got, "test case %d unexpected AlertState, want:%d got:%d", i, test.want, got)
@@ -90,6 +90,7 @@ func TestAlertingRuleLabelsUpdate(t *testing.T) {
"HTTPRequestRateLow",
expr,
time.Minute,
0,
// Basing alerting rule labels off of a value that can change is a very bad idea.
// If an alert is going back and forth between two label values it will never fire.
// Instead, you should write two alerts with constant labels.
@@ -192,6 +193,7 @@ func TestAlertingRuleExternalLabelsInTemplate(t *testing.T) {
"ExternalLabelDoesNotExist",
expr,
time.Minute,
0,
labels.FromStrings("templated_label", "There are {{ len $externalLabels }} external Labels, of which foo is {{ $externalLabels.foo }}."),
labels.EmptyLabels(),
labels.EmptyLabels(),
@@ -202,6 +204,7 @@ func TestAlertingRuleExternalLabelsInTemplate(t *testing.T) {
"ExternalLabelExists",
expr,
time.Minute,
0,
labels.FromStrings("templated_label", "There are {{ len $externalLabels }} external Labels, of which foo is {{ $externalLabels.foo }}."),
labels.EmptyLabels(),
labels.FromStrings("foo", "bar", "dings", "bums"),
@@ -286,6 +289,7 @@ func TestAlertingRuleExternalURLInTemplate(t *testing.T) {
"ExternalURLDoesNotExist",
expr,
time.Minute,
0,
labels.FromStrings("templated_label", "The external URL is {{ $externalURL }}."),
labels.EmptyLabels(),
labels.EmptyLabels(),
@@ -296,6 +300,7 @@ func TestAlertingRuleExternalURLInTemplate(t *testing.T) {
"ExternalURLExists",
expr,
time.Minute,
0,
labels.FromStrings("templated_label", "The external URL is {{ $externalURL }}."),
labels.EmptyLabels(),
labels.EmptyLabels(),
@@ -380,6 +385,7 @@ func TestAlertingRuleEmptyLabelFromTemplate(t *testing.T) {
"EmptyLabel",
expr,
time.Minute,
0,
labels.FromStrings("empty_label", ""),
labels.EmptyLabels(),
labels.EmptyLabels(),
@@ -436,6 +442,7 @@ func TestAlertingRuleQueryInTemplate(t *testing.T) {
"ruleWithQueryInTemplate",
expr,
time.Minute,
0,
labels.FromStrings("label", "value"),
labels.FromStrings("templated_label", `{{- with "sort(sum(http_requests) by (instance))" | query -}}
{{- range $i,$v := . -}}
@@ -480,7 +487,7 @@ instance: {{ $v.Labels.instance }}, value: {{ printf "%.0f" $v.Value }};
func BenchmarkAlertingRuleAtomicField(b *testing.B) {
b.ReportAllocs()
rule := NewAlertingRule("bench", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil)
rule := NewAlertingRule("bench", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil)
done := make(chan struct{})
go func() {
for i := 0; i < b.N; i++ {
@@ -518,6 +525,7 @@ func TestAlertingRuleDuplicate(t *testing.T) {
"foo",
expr,
time.Minute,
0,
labels.FromStrings("test", "test"),
labels.EmptyLabels(),
labels.EmptyLabels(),
@@ -564,6 +572,7 @@ func TestAlertingRuleLimit(t *testing.T) {
"foo",
expr,
time.Minute,
0,
labels.FromStrings("test", "test"),
labels.EmptyLabels(),
labels.EmptyLabels(),
@@ -636,6 +645,7 @@ func TestQueryForStateSeries(t *testing.T) {
"TestRule",
nil,
time.Minute,
0,
labels.FromStrings("severity", "critical"),
labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
)
@@ -669,6 +679,7 @@ func TestSendAlertsDontAffectActiveAlerts(t *testing.T) {
"TestRule",
nil,
time.Minute,
0,
labels.FromStrings("severity", "critical"),
labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
)

@@ -1119,6 +1119,7 @@ func (m *Manager) LoadGroups(
r.Alert.Value,
expr,
time.Duration(r.For),
time.Duration(r.KeepFiringFor),
labels.FromMap(r.Labels),
labels.FromMap(r.Annotations),
externalLabels,

@@ -66,6 +66,7 @@ func TestAlertingRule(t *testing.T) {
"HTTPRequestRateLow",
expr,
time.Minute,
0,
labels.FromStrings("severity", "{{\"c\"}}ritical"),
labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
)
@@ -209,6 +210,7 @@ func TestForStateAddSamples(t *testing.T) {
"HTTPRequestRateLow",
expr,
time.Minute,
0,
labels.FromStrings("severity", "{{\"c\"}}ritical"),
labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
)
@@ -383,6 +385,7 @@ func TestForStateRestore(t *testing.T) {
"HTTPRequestRateLow",
expr,
alertForDuration,
0,
labels.FromStrings("severity", "critical"),
labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
)
@@ -449,6 +452,7 @@ func TestForStateRestore(t *testing.T) {
"HTTPRequestRateLow",
expr,
alertForDuration,
0,
labels.FromStrings("severity", "critical"),
labels.EmptyLabels(), labels.EmptyLabels(), "", false, nil,
)
@@ -615,13 +619,13 @@ func readSeriesSet(ss storage.SeriesSet) (map[string][]promql.Point, error) {
func TestCopyState(t *testing.T) {
oldGroup := &Group{
rules: []Rule{
NewAlertingRule("alert", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
NewAlertingRule("alert", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
NewRecordingRule("rule1", nil, labels.EmptyLabels()),
NewRecordingRule("rule2", nil, labels.EmptyLabels()),
NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v1")),
NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v2")),
NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v3")),
NewAlertingRule("alert2", nil, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
NewAlertingRule("alert2", nil, 0, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
},
seriesInPreviousEval: []map[string]labels.Labels{
{},
@@ -640,10 +644,10 @@ func TestCopyState(t *testing.T) {
NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v0")),
NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v1")),
NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v2")),
NewAlertingRule("alert", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
NewAlertingRule("alert", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
NewRecordingRule("rule1", nil, labels.EmptyLabels()),
NewAlertingRule("alert2", nil, 0, labels.FromStrings("l2", "v0"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
NewAlertingRule("alert2", nil, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
NewAlertingRule("alert2", nil, 0, 0, labels.FromStrings("l2", "v0"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
NewAlertingRule("alert2", nil, 0, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
NewRecordingRule("rule4", nil, labels.EmptyLabels()),
},
seriesInPreviousEval: make([]map[string]labels.Labels, 8),
@@ -875,7 +879,7 @@ func TestNotify(t *testing.T) {
expr, err := parser.ParseExpr("a > 1")
require.NoError(t, err)
rule := NewAlertingRule("aTooHigh", expr, 0, labels.Labels{}, labels.Labels{}, labels.EmptyLabels(), "", true, log.NewNopLogger())
rule := NewAlertingRule("aTooHigh", expr, 0, 0, labels.Labels{}, labels.Labels{}, labels.EmptyLabels(), "", true, log.NewNopLogger())
group := NewGroup(GroupOptions{
Name: "alert",
Interval: time.Second,
@@ -1147,7 +1151,7 @@ func TestGroupHasAlertingRules(t *testing.T) {
group: &Group{
name: "HasAlertingRule",
rules: []Rule{
NewAlertingRule("alert", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
NewAlertingRule("alert", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
NewRecordingRule("record", nil, labels.EmptyLabels()),
},
},

@@ -1115,6 +1115,7 @@ type Alert struct {
Annotations labels.Labels `json:"annotations"`
State string `json:"state"`
ActiveAt *time.Time `json:"activeAt,omitempty"`
KeepFiringSince *time.Time `json:"keep_firing_since,omitempty"`
Value string `json:"value"`
}
@@ -1142,6 +1143,7 @@ func rulesAlertsToAPIAlerts(rulesAlerts []*rules.Alert) []*Alert {
Annotations: ruleAlert.Annotations,
State: ruleAlert.State.String(),
ActiveAt: &ruleAlert.ActiveAt,
KeepFiringSince: &ruleAlert.KeepFiringSince,
Value: strconv.FormatFloat(ruleAlert.Value, 'e', -1, 64),
}
}
@@ -1241,6 +1243,7 @@ type AlertingRule struct {
Name string `json:"name"`
Query string `json:"query"`
Duration float64 `json:"duration"`
KeepFiringFor float64 `json:"keepFiringFor"`
Labels labels.Labels `json:"labels"`
Annotations labels.Labels `json:"annotations"`
Alerts []*Alert `json:"alerts"`
@@ -1303,6 +1306,7 @@ func (api *API) rules(r *http.Request) apiFuncResult {
Name: rule.Name(),
Query: rule.Query().String(),
Duration: rule.HoldDuration().Seconds(),
KeepFiringFor: rule.KeepFiringFor().Seconds(),
Labels: rule.Labels(),
Annotations: rule.Annotations(),
Alerts: rulesAlertsToAPIAlerts(rule.ActiveAlerts()),
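
With this change, an alerting-rule entry in the /api/v1/rules response
gains a 'keepFiringFor' field, and active alerts gain a
'keep_firing_since' timestamp. A hypothetical, abbreviated fragment
(values are illustrative only):

    {
      "name": "HighErrorRate",
      "query": "job:request_error_rate:ratio > 0.05",
      "duration": 600,
      "keepFiringFor": 900,
      "labels": { "severity": "critical" },
      "annotations": {},
      "alerts": [
        {
          "state": "firing",
          "activeAt": "2023-01-09T10:00:00Z",
          "keep_firing_since": "2023-01-09T10:40:00Z",
          "value": "1e+00"
        }
      ]
    }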

@@ -209,6 +209,7 @@ func (m rulesRetrieverMock) AlertingRules() []*rules.AlertingRule {
"test_metric3",
expr1,
time.Second,
0,
labels.Labels{},
labels.Labels{},
labels.Labels{},
@@ -220,6 +221,7 @@ func (m rulesRetrieverMock) AlertingRules() []*rules.AlertingRule {
"test_metric4",
expr2,
time.Second,
0,
labels.Labels{},
labels.Labels{},
labels.Labels{},

@@ -43,6 +43,11 @@ const CollapsibleAlertPanel: FC<CollapsibleAlertPanelProps> = ({ rule, showAnnot
<div>for: {formatDuration(rule.duration * 1000)}</div>
</div>
)}
{rule.keepFiringFor > 0 && (
<div>
<div>keep_firing_for: {formatDuration(rule.keepFiringFor * 1000)}</div>
</div>
)}
{rule.labels && Object.keys(rule.labels).length > 0 && (
<div>
<div>labels:</div>

@@ -96,6 +96,11 @@ export const RulesContent: FC<RulesContentProps> = ({ response }) => {
<strong>for:</strong> {formatDuration(r.duration * 1000)}
</div>
)}
{r.keepFiringFor > 0 && (
<div>
<strong>keep_firing_for:</strong> {formatDuration(r.keepFiringFor * 1000)}
</div>
)}
{r.labels && Object.keys(r.labels).length > 0 && (
<div>
<strong>labels:</strong>

@@ -26,6 +26,7 @@ export type Rule = {
alerts: Alert[];
annotations: Record<string, string>;
duration: number;
keepFiringFor: number;
evaluationTime: string;
health: string;
labels: Record<string, string>;