mirror of
https://github.com/prometheus/prometheus.git
synced 2025-01-11 05:47:27 -08:00
Add 'keep_firing_for' field to alerting rules
This commit adds a new 'keep_firing_for' field to Prometheus alerting rules. The 'keep_firing_for' field specifies the minimum amount of time that an alert should remain firing, even if the expression does not return any results. This feature was discussed at a previous dev summit, and it was determined that a feature like this would be useful in order to allow the expression time to stabilize and prevent confusing resolved messages from being propagated through Alertmanager. This approach is simpler than having two PromQL queries, as was sometimes discussed, and it should be easy to implement. This commit does not include tests for the 'keep_firing_for' field. This is intentional, as the purpose of this commit is to gather comments on the proposed design of the 'keep_firing_for' field before implementing tests. Once the design of the 'keep_firing_for' field has been finalized, a follow-up commit will be submitted with tests. See https://github.com/prometheus/prometheus/issues/11570 Signed-off-by: Julien Pivotto <roidelapluie@o11y.eu>
This commit is contained in:
parent
72f20d949a
commit
ce55e5074d
|
@ -123,6 +123,10 @@ expr: <string>
|
|||
# Alerts which have not yet fired for long enough are considered pending.
|
||||
[ for: <duration> | default = 0s ]
|
||||
|
||||
# How long an alert will continue firing after the condition that triggered it
|
||||
# has cleared.
|
||||
[ keep_firing_for: <duration> | default = 0s ]
|
||||
|
||||
# Labels to add or overwrite for each alert.
|
||||
labels:
|
||||
[ <labelname>: <tmpl_string> ]
|
||||
|
|
|
@ -147,6 +147,7 @@ type Rule struct {
|
|||
Alert string `yaml:"alert,omitempty"`
|
||||
Expr string `yaml:"expr"`
|
||||
For model.Duration `yaml:"for,omitempty"`
|
||||
KeepFiringFor model.Duration `yaml:"keep_firing_for,omitempty"`
|
||||
Labels map[string]string `yaml:"labels,omitempty"`
|
||||
Annotations map[string]string `yaml:"annotations,omitempty"`
|
||||
}
|
||||
|
@ -157,6 +158,7 @@ type RuleNode struct {
|
|||
Alert yaml.Node `yaml:"alert,omitempty"`
|
||||
Expr yaml.Node `yaml:"expr"`
|
||||
For model.Duration `yaml:"for,omitempty"`
|
||||
KeepFiringFor model.Duration `yaml:"keep_firing_for,omitempty"`
|
||||
Labels map[string]string `yaml:"labels,omitempty"`
|
||||
Annotations map[string]string `yaml:"annotations,omitempty"`
|
||||
}
|
||||
|
@ -208,6 +210,12 @@ func (r *RuleNode) Validate() (nodes []WrappedError) {
|
|||
node: &r.Record,
|
||||
})
|
||||
}
|
||||
if r.KeepFiringFor != 0 {
|
||||
nodes = append(nodes, WrappedError{
|
||||
err: fmt.Errorf("invalid field 'keep_firing_for' in recording rule"),
|
||||
node: &r.Record,
|
||||
})
|
||||
}
|
||||
if !model.IsValidMetricName(model.LabelValue(r.Record.Value)) {
|
||||
nodes = append(nodes, WrappedError{
|
||||
err: fmt.Errorf("invalid recording rule name: %s", r.Record.Value),
|
||||
|
|
|
@ -88,6 +88,7 @@ type Alert struct {
|
|||
ResolvedAt time.Time
|
||||
LastSentAt time.Time
|
||||
ValidUntil time.Time
|
||||
KeepFiringSince time.Time
|
||||
}
|
||||
|
||||
func (a *Alert) needsSending(ts time.Time, resendDelay time.Duration) bool {
|
||||
|
@ -112,6 +113,9 @@ type AlertingRule struct {
|
|||
// The duration for which a labelset needs to persist in the expression
|
||||
// output vector before an alert transitions from Pending to Firing state.
|
||||
holdDuration time.Duration
|
||||
// The amount of time that the alert should remain firing after the
|
||||
// resolution.
|
||||
keepFiringFor time.Duration
|
||||
// Extra labels to attach to the resulting alert sample vectors.
|
||||
labels labels.Labels
|
||||
// Non-identifying key/value pairs.
|
||||
|
@ -142,7 +146,7 @@ type AlertingRule struct {
|
|||
|
||||
// NewAlertingRule constructs a new AlertingRule.
|
||||
func NewAlertingRule(
|
||||
name string, vec parser.Expr, hold time.Duration,
|
||||
name string, vec parser.Expr, hold, keepFiringFor time.Duration,
|
||||
labels, annotations, externalLabels labels.Labels, externalURL string,
|
||||
restored bool, logger log.Logger,
|
||||
) *AlertingRule {
|
||||
|
@ -152,6 +156,7 @@ func NewAlertingRule(
|
|||
name: name,
|
||||
vector: vec,
|
||||
holdDuration: hold,
|
||||
keepFiringFor: keepFiringFor,
|
||||
labels: labels,
|
||||
annotations: annotations,
|
||||
externalLabels: el,
|
||||
|
@ -201,6 +206,12 @@ func (r *AlertingRule) HoldDuration() time.Duration {
|
|||
return r.holdDuration
|
||||
}
|
||||
|
||||
// KeepFiringFor returns the duration an alerting rule should keep firing for
|
||||
// after resolution.
|
||||
func (r *AlertingRule) KeepFiringFor() time.Duration {
|
||||
return r.keepFiringFor
|
||||
}
|
||||
|
||||
// Labels returns the labels of the alerting rule.
|
||||
func (r *AlertingRule) Labels() labels.Labels {
|
||||
return r.labels
|
||||
|
@ -404,17 +415,30 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
|
|||
// Check if any pending alerts should be removed or fire now. Write out alert timeseries.
|
||||
for fp, a := range r.active {
|
||||
if _, ok := resultFPs[fp]; !ok {
|
||||
var keepFiring bool
|
||||
if a.State == StateFiring && r.keepFiringFor > 0 {
|
||||
if a.KeepFiringSince.IsZero() {
|
||||
a.KeepFiringSince = ts
|
||||
}
|
||||
if ts.Sub(a.KeepFiringSince) < r.keepFiringFor {
|
||||
keepFiring = true
|
||||
}
|
||||
}
|
||||
// If the alert was previously firing, keep it around for a given
|
||||
// retention time so it is reported as resolved to the AlertManager.
|
||||
if a.State == StatePending || (!a.ResolvedAt.IsZero() && ts.Sub(a.ResolvedAt) > resolvedRetention) {
|
||||
delete(r.active, fp)
|
||||
}
|
||||
if a.State != StateInactive {
|
||||
if a.State != StateInactive && !keepFiring {
|
||||
a.State = StateInactive
|
||||
a.ResolvedAt = ts
|
||||
}
|
||||
if !keepFiring {
|
||||
continue
|
||||
}
|
||||
} else {
|
||||
a.KeepFiringSince = time.Time{}
|
||||
}
|
||||
numActivePending++
|
||||
|
||||
if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration {
|
||||
|
|
|
@ -66,7 +66,7 @@ func TestAlertingRuleState(t *testing.T) {
|
|||
}
|
||||
|
||||
for i, test := range tests {
|
||||
rule := NewAlertingRule(test.name, nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil)
|
||||
rule := NewAlertingRule(test.name, nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil)
|
||||
rule.active = test.active
|
||||
got := rule.State()
|
||||
require.Equal(t, test.want, got, "test case %d unexpected AlertState, want:%d got:%d", i, test.want, got)
|
||||
|
@ -90,6 +90,7 @@ func TestAlertingRuleLabelsUpdate(t *testing.T) {
|
|||
"HTTPRequestRateLow",
|
||||
expr,
|
||||
time.Minute,
|
||||
0,
|
||||
// Basing alerting rule labels off of a value that can change is a very bad idea.
|
||||
// If an alert is going back and forth between two label values it will never fire.
|
||||
// Instead, you should write two alerts with constant labels.
|
||||
|
@ -192,6 +193,7 @@ func TestAlertingRuleExternalLabelsInTemplate(t *testing.T) {
|
|||
"ExternalLabelDoesNotExist",
|
||||
expr,
|
||||
time.Minute,
|
||||
0,
|
||||
labels.FromStrings("templated_label", "There are {{ len $externalLabels }} external Labels, of which foo is {{ $externalLabels.foo }}."),
|
||||
labels.EmptyLabels(),
|
||||
labels.EmptyLabels(),
|
||||
|
@ -202,6 +204,7 @@ func TestAlertingRuleExternalLabelsInTemplate(t *testing.T) {
|
|||
"ExternalLabelExists",
|
||||
expr,
|
||||
time.Minute,
|
||||
0,
|
||||
labels.FromStrings("templated_label", "There are {{ len $externalLabels }} external Labels, of which foo is {{ $externalLabels.foo }}."),
|
||||
labels.EmptyLabels(),
|
||||
labels.FromStrings("foo", "bar", "dings", "bums"),
|
||||
|
@ -286,6 +289,7 @@ func TestAlertingRuleExternalURLInTemplate(t *testing.T) {
|
|||
"ExternalURLDoesNotExist",
|
||||
expr,
|
||||
time.Minute,
|
||||
0,
|
||||
labels.FromStrings("templated_label", "The external URL is {{ $externalURL }}."),
|
||||
labels.EmptyLabels(),
|
||||
labels.EmptyLabels(),
|
||||
|
@ -296,6 +300,7 @@ func TestAlertingRuleExternalURLInTemplate(t *testing.T) {
|
|||
"ExternalURLExists",
|
||||
expr,
|
||||
time.Minute,
|
||||
0,
|
||||
labels.FromStrings("templated_label", "The external URL is {{ $externalURL }}."),
|
||||
labels.EmptyLabels(),
|
||||
labels.EmptyLabels(),
|
||||
|
@ -380,6 +385,7 @@ func TestAlertingRuleEmptyLabelFromTemplate(t *testing.T) {
|
|||
"EmptyLabel",
|
||||
expr,
|
||||
time.Minute,
|
||||
0,
|
||||
labels.FromStrings("empty_label", ""),
|
||||
labels.EmptyLabels(),
|
||||
labels.EmptyLabels(),
|
||||
|
@ -436,6 +442,7 @@ func TestAlertingRuleQueryInTemplate(t *testing.T) {
|
|||
"ruleWithQueryInTemplate",
|
||||
expr,
|
||||
time.Minute,
|
||||
0,
|
||||
labels.FromStrings("label", "value"),
|
||||
labels.FromStrings("templated_label", `{{- with "sort(sum(http_requests) by (instance))" | query -}}
|
||||
{{- range $i,$v := . -}}
|
||||
|
@ -480,7 +487,7 @@ instance: {{ $v.Labels.instance }}, value: {{ printf "%.0f" $v.Value }};
|
|||
|
||||
func BenchmarkAlertingRuleAtomicField(b *testing.B) {
|
||||
b.ReportAllocs()
|
||||
rule := NewAlertingRule("bench", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil)
|
||||
rule := NewAlertingRule("bench", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil)
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
for i := 0; i < b.N; i++ {
|
||||
|
@ -518,6 +525,7 @@ func TestAlertingRuleDuplicate(t *testing.T) {
|
|||
"foo",
|
||||
expr,
|
||||
time.Minute,
|
||||
0,
|
||||
labels.FromStrings("test", "test"),
|
||||
labels.EmptyLabels(),
|
||||
labels.EmptyLabels(),
|
||||
|
@ -564,6 +572,7 @@ func TestAlertingRuleLimit(t *testing.T) {
|
|||
"foo",
|
||||
expr,
|
||||
time.Minute,
|
||||
0,
|
||||
labels.FromStrings("test", "test"),
|
||||
labels.EmptyLabels(),
|
||||
labels.EmptyLabels(),
|
||||
|
@ -636,6 +645,7 @@ func TestQueryForStateSeries(t *testing.T) {
|
|||
"TestRule",
|
||||
nil,
|
||||
time.Minute,
|
||||
0,
|
||||
labels.FromStrings("severity", "critical"),
|
||||
labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
|
||||
)
|
||||
|
@ -669,6 +679,7 @@ func TestSendAlertsDontAffectActiveAlerts(t *testing.T) {
|
|||
"TestRule",
|
||||
nil,
|
||||
time.Minute,
|
||||
0,
|
||||
labels.FromStrings("severity", "critical"),
|
||||
labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
|
||||
)
|
||||
|
|
|
@ -1119,6 +1119,7 @@ func (m *Manager) LoadGroups(
|
|||
r.Alert.Value,
|
||||
expr,
|
||||
time.Duration(r.For),
|
||||
time.Duration(r.KeepFiringFor),
|
||||
labels.FromMap(r.Labels),
|
||||
labels.FromMap(r.Annotations),
|
||||
externalLabels,
|
||||
|
|
|
@ -66,6 +66,7 @@ func TestAlertingRule(t *testing.T) {
|
|||
"HTTPRequestRateLow",
|
||||
expr,
|
||||
time.Minute,
|
||||
0,
|
||||
labels.FromStrings("severity", "{{\"c\"}}ritical"),
|
||||
labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
|
||||
)
|
||||
|
@ -209,6 +210,7 @@ func TestForStateAddSamples(t *testing.T) {
|
|||
"HTTPRequestRateLow",
|
||||
expr,
|
||||
time.Minute,
|
||||
0,
|
||||
labels.FromStrings("severity", "{{\"c\"}}ritical"),
|
||||
labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
|
||||
)
|
||||
|
@ -383,6 +385,7 @@ func TestForStateRestore(t *testing.T) {
|
|||
"HTTPRequestRateLow",
|
||||
expr,
|
||||
alertForDuration,
|
||||
0,
|
||||
labels.FromStrings("severity", "critical"),
|
||||
labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil,
|
||||
)
|
||||
|
@ -449,6 +452,7 @@ func TestForStateRestore(t *testing.T) {
|
|||
"HTTPRequestRateLow",
|
||||
expr,
|
||||
alertForDuration,
|
||||
0,
|
||||
labels.FromStrings("severity", "critical"),
|
||||
labels.EmptyLabels(), labels.EmptyLabels(), "", false, nil,
|
||||
)
|
||||
|
@ -615,13 +619,13 @@ func readSeriesSet(ss storage.SeriesSet) (map[string][]promql.Point, error) {
|
|||
func TestCopyState(t *testing.T) {
|
||||
oldGroup := &Group{
|
||||
rules: []Rule{
|
||||
NewAlertingRule("alert", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
|
||||
NewAlertingRule("alert", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
|
||||
NewRecordingRule("rule1", nil, labels.EmptyLabels()),
|
||||
NewRecordingRule("rule2", nil, labels.EmptyLabels()),
|
||||
NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v1")),
|
||||
NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v2")),
|
||||
NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v3")),
|
||||
NewAlertingRule("alert2", nil, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
|
||||
NewAlertingRule("alert2", nil, 0, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
|
||||
},
|
||||
seriesInPreviousEval: []map[string]labels.Labels{
|
||||
{},
|
||||
|
@ -640,10 +644,10 @@ func TestCopyState(t *testing.T) {
|
|||
NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v0")),
|
||||
NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v1")),
|
||||
NewRecordingRule("rule3", nil, labels.FromStrings("l1", "v2")),
|
||||
NewAlertingRule("alert", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
|
||||
NewAlertingRule("alert", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
|
||||
NewRecordingRule("rule1", nil, labels.EmptyLabels()),
|
||||
NewAlertingRule("alert2", nil, 0, labels.FromStrings("l2", "v0"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
|
||||
NewAlertingRule("alert2", nil, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
|
||||
NewAlertingRule("alert2", nil, 0, 0, labels.FromStrings("l2", "v0"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
|
||||
NewAlertingRule("alert2", nil, 0, 0, labels.FromStrings("l2", "v1"), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
|
||||
NewRecordingRule("rule4", nil, labels.EmptyLabels()),
|
||||
},
|
||||
seriesInPreviousEval: make([]map[string]labels.Labels, 8),
|
||||
|
@ -875,7 +879,7 @@ func TestNotify(t *testing.T) {
|
|||
|
||||
expr, err := parser.ParseExpr("a > 1")
|
||||
require.NoError(t, err)
|
||||
rule := NewAlertingRule("aTooHigh", expr, 0, labels.Labels{}, labels.Labels{}, labels.EmptyLabels(), "", true, log.NewNopLogger())
|
||||
rule := NewAlertingRule("aTooHigh", expr, 0, 0, labels.Labels{}, labels.Labels{}, labels.EmptyLabels(), "", true, log.NewNopLogger())
|
||||
group := NewGroup(GroupOptions{
|
||||
Name: "alert",
|
||||
Interval: time.Second,
|
||||
|
@ -1147,7 +1151,7 @@ func TestGroupHasAlertingRules(t *testing.T) {
|
|||
group: &Group{
|
||||
name: "HasAlertingRule",
|
||||
rules: []Rule{
|
||||
NewAlertingRule("alert", nil, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
|
||||
NewAlertingRule("alert", nil, 0, 0, labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil),
|
||||
NewRecordingRule("record", nil, labels.EmptyLabels()),
|
||||
},
|
||||
},
|
||||
|
|
|
@ -1115,6 +1115,7 @@ type Alert struct {
|
|||
Annotations labels.Labels `json:"annotations"`
|
||||
State string `json:"state"`
|
||||
ActiveAt *time.Time `json:"activeAt,omitempty"`
|
||||
KeepFiringSince *time.Time `json:"keep_firing_since,omitempty"`
|
||||
Value string `json:"value"`
|
||||
}
|
||||
|
||||
|
@ -1142,6 +1143,7 @@ func rulesAlertsToAPIAlerts(rulesAlerts []*rules.Alert) []*Alert {
|
|||
Annotations: ruleAlert.Annotations,
|
||||
State: ruleAlert.State.String(),
|
||||
ActiveAt: &ruleAlert.ActiveAt,
|
||||
KeepFiringSince: &ruleAlert.KeepFiringSince,
|
||||
Value: strconv.FormatFloat(ruleAlert.Value, 'e', -1, 64),
|
||||
}
|
||||
}
|
||||
|
@ -1241,6 +1243,7 @@ type AlertingRule struct {
|
|||
Name string `json:"name"`
|
||||
Query string `json:"query"`
|
||||
Duration float64 `json:"duration"`
|
||||
KeepFiringFor float64 `json:"keepFiringFor"`
|
||||
Labels labels.Labels `json:"labels"`
|
||||
Annotations labels.Labels `json:"annotations"`
|
||||
Alerts []*Alert `json:"alerts"`
|
||||
|
@ -1303,6 +1306,7 @@ func (api *API) rules(r *http.Request) apiFuncResult {
|
|||
Name: rule.Name(),
|
||||
Query: rule.Query().String(),
|
||||
Duration: rule.HoldDuration().Seconds(),
|
||||
KeepFiringFor: rule.KeepFiringFor().Seconds(),
|
||||
Labels: rule.Labels(),
|
||||
Annotations: rule.Annotations(),
|
||||
Alerts: rulesAlertsToAPIAlerts(rule.ActiveAlerts()),
|
||||
|
|
|
@ -209,6 +209,7 @@ func (m rulesRetrieverMock) AlertingRules() []*rules.AlertingRule {
|
|||
"test_metric3",
|
||||
expr1,
|
||||
time.Second,
|
||||
0,
|
||||
labels.Labels{},
|
||||
labels.Labels{},
|
||||
labels.Labels{},
|
||||
|
@ -220,6 +221,7 @@ func (m rulesRetrieverMock) AlertingRules() []*rules.AlertingRule {
|
|||
"test_metric4",
|
||||
expr2,
|
||||
time.Second,
|
||||
0,
|
||||
labels.Labels{},
|
||||
labels.Labels{},
|
||||
labels.Labels{},
|
||||
|
|
|
@ -43,6 +43,11 @@ const CollapsibleAlertPanel: FC<CollapsibleAlertPanelProps> = ({ rule, showAnnot
|
|||
<div>for: {formatDuration(rule.duration * 1000)}</div>
|
||||
</div>
|
||||
)}
|
||||
{rule.keepFiringFor > 0 && (
|
||||
<div>
|
||||
<div>keep_firing_for: {formatDuration(rule.keepFiringFor * 1000)}</div>
|
||||
</div>
|
||||
)}
|
||||
{rule.labels && Object.keys(rule.labels).length > 0 && (
|
||||
<div>
|
||||
<div>labels:</div>
|
||||
|
|
|
@ -96,6 +96,11 @@ export const RulesContent: FC<RulesContentProps> = ({ response }) => {
|
|||
<strong>for:</strong> {formatDuration(r.duration * 1000)}
|
||||
</div>
|
||||
)}
|
||||
{r.keepFiringFor > 0 && (
|
||||
<div>
|
||||
<strong>keep_firing_for:</strong> {formatDuration(r.keepFiringFor * 1000)}
|
||||
</div>
|
||||
)}
|
||||
{r.labels && Object.keys(r.labels).length > 0 && (
|
||||
<div>
|
||||
<strong>labels:</strong>
|
||||
|
|
|
@ -26,6 +26,7 @@ export type Rule = {
|
|||
alerts: Alert[];
|
||||
annotations: Record<string, string>;
|
||||
duration: number;
|
||||
keepFiringFor: number;
|
||||
evaluationTime: string;
|
||||
health: string;
|
||||
labels: Record<string, string>;
|
||||
|
|
Loading…
Reference in a new issue