From d81bbe154d8768df66004eeef8546e5b733edde8 Mon Sep 17 00:00:00 2001 From: Levi Harrison Date: Thu, 21 Oct 2021 17:14:17 -0400 Subject: [PATCH] Rule alerts/series limit updates (#9541) * Add docs and do not limit inactive alerts. Signed-off-by: Levi Harrison --- docs/configuration/recording_rules.md | 12 +++- rules/alerting.go | 7 ++- rules/alerting_test.go | 70 ++++++++++++------------ rules/recording.go | 6 +- rules/recording_test.go | 79 +++++++++++++++++---------- 5 files changed, 102 insertions(+), 72 deletions(-) diff --git a/docs/configuration/recording_rules.md b/docs/configuration/recording_rules.md index 8b12632dc..a14aab09d 100644 --- a/docs/configuration/recording_rules.md +++ b/docs/configuration/recording_rules.md @@ -78,8 +78,8 @@ name: # How often rules in the group are evaluated. [ interval: | default = global.evaluation_interval ] -# Limit the number of alerts and series individual rules can produce. -# 0 is no limit. +# Limit the number of alerts an alerting rule and series a recording +# rule can produce. 0 is no limit. [ limit: | default = 0 ] rules: @@ -128,3 +128,11 @@ annotations: [ : ] ``` +# Limiting alerts and series + +A limit for alerts produced by alerting rules and series produced recording rules +can be configured per-group. When the limit is exceeded, _all_ series produced +by the rule are discarded, and if it's an alerting rule, _all_ alerts for +the rule, active, pending, or inactive, are cleared as well. The event will be +recorded as an error in the evaluation, and as such no stale markers are +written. diff --git a/rules/alerting.go b/rules/alerting.go index 5e7b8975c..28abcf2a5 100644 --- a/rules/alerting.go +++ b/rules/alerting.go @@ -389,6 +389,7 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, r.active[h] = a } + var numActivePending int // Check if any pending alerts should be removed or fire now. Write out alert timeseries. for fp, a := range r.active { if _, ok := resultFPs[fp]; !ok { @@ -403,6 +404,7 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, } continue } + numActivePending++ if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration { a.State = StateFiring @@ -415,10 +417,9 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, } } - numActive := len(r.active) - if limit != 0 && numActive > limit { + if limit > 0 && numActivePending > limit { r.active = map[uint64]*Alert{} - return nil, errors.Errorf("exceeded limit of %d with %d alerts", limit, numActive) + return nil, errors.Errorf("exceeded limit of %d with %d alerts", limit, numActivePending) } return vec, nil diff --git a/rules/alerting_test.go b/rules/alerting_test.go index 94d505445..42882f4f6 100644 --- a/rules/alerting_test.go +++ b/rules/alerting_test.go @@ -466,23 +466,17 @@ func TestAlertingRuleDuplicate(t *testing.T) { } func TestAlertingRuleLimit(t *testing.T) { - storage := teststorage.New(t) - defer storage.Close() + suite, err := promql.NewTest(t, ` + load 1m + metric{label="1"} 1 + metric{label="2"} 1 + `) + require.NoError(t, err) + defer suite.Close() - opts := promql.EngineOpts{ - Logger: nil, - Reg: nil, - MaxSamples: 10, - Timeout: 10 * time.Second, - } + require.NoError(t, suite.Run()) - engine := promql.NewEngine(opts) - ctx, cancelCtx := context.WithCancel(context.Background()) - defer cancelCtx() - - now := time.Now() - - suite := []struct { + tests := []struct { limit int err string }{ @@ -490,31 +484,37 @@ func TestAlertingRuleLimit(t *testing.T) { limit: 0, }, { - limit: 1, + limit: -1, }, { - limit: -1, - err: "exceeded limit of -1 with 1 alerts", + limit: 2, + }, + { + limit: 1, + err: "exceeded limit of 1 with 2 alerts", }, } - for _, test := range suite { - expr, _ := parser.ParseExpr(`1`) - rule := NewAlertingRule( - "foo", - expr, - time.Minute, - labels.FromStrings("test", "test"), - nil, - nil, - "", - true, log.NewNopLogger(), - ) - _, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, test.limit) - if test.err == "" { - require.NoError(t, err) - } else { - require.Equal(t, test.err, err.Error()) + expr, _ := parser.ParseExpr(`metric > 0`) + rule := NewAlertingRule( + "foo", + expr, + time.Minute, + labels.FromStrings("test", "test"), + nil, + nil, + "", + true, log.NewNopLogger(), + ) + + evalTime := time.Unix(0, 0) + + for _, test := range tests { + _, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, test.limit) + if err != nil { + require.EqualError(t, err, test.err) + } else if test.err != "" { + t.Errorf("Expected errror %s, got none", test.err) } } } diff --git a/rules/recording.go b/rules/recording.go index 08a7e37ca..0081690b8 100644 --- a/rules/recording.go +++ b/rules/recording.go @@ -99,9 +99,9 @@ func (rule *RecordingRule) Eval(ctx context.Context, ts time.Time, query QueryFu return nil, fmt.Errorf("vector contains metrics with the same labelset after applying rule labels") } - numSamples := len(vector) - if limit != 0 && numSamples > limit { - return nil, fmt.Errorf("exceeded limit %d with %d samples", limit, numSamples) + numSeries := len(vector) + if limit > 0 && numSeries > limit { + return nil, fmt.Errorf("exceeded limit of %d with %d series", limit, numSeries) } rule.SetHealth(HealthGood) diff --git a/rules/recording_test.go b/rules/recording_test.go index 211b6cda6..d6b0b6c2a 100644 --- a/rules/recording_test.go +++ b/rules/recording_test.go @@ -49,7 +49,6 @@ func TestRuleEval(t *testing.T) { name string expr parser.Expr labels labels.Labels - limit int result promql.Vector err string }{ @@ -71,38 +70,11 @@ func TestRuleEval(t *testing.T) { Point: promql.Point{V: 1, T: timestamp.FromTime(now)}, }}, }, - { - name: "underlimit", - expr: &parser.NumberLiteral{Val: 1}, - labels: labels.FromStrings("foo", "bar"), - limit: 2, - result: promql.Vector{promql.Sample{ - Metric: labels.FromStrings("__name__", "underlimit", "foo", "bar"), - Point: promql.Point{V: 1, T: timestamp.FromTime(now)}, - }}, - }, - { - name: "atlimit", - expr: &parser.NumberLiteral{Val: 1}, - labels: labels.FromStrings("foo", "bar"), - limit: 1, - result: promql.Vector{promql.Sample{ - Metric: labels.FromStrings("__name__", "atlimit", "foo", "bar"), - Point: promql.Point{V: 1, T: timestamp.FromTime(now)}, - }}, - }, - { - name: "overlimit", - expr: &parser.NumberLiteral{Val: 1}, - labels: labels.FromStrings("foo", "bar"), - limit: -1, - err: "exceeded limit -1 with 1 samples", - }, } for _, test := range suite { rule := NewRecordingRule(test.name, test.expr, test.labels) - result, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, test.limit) + result, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, 0) if test.err == "" { require.NoError(t, err) } else { @@ -151,3 +123,52 @@ func TestRuleEvalDuplicate(t *testing.T) { require.Error(t, err) require.EqualError(t, err, "vector contains metrics with the same labelset after applying rule labels") } + +func TestRecordingRuleLimit(t *testing.T) { + suite, err := promql.NewTest(t, ` + load 1m + metric{label="1"} 1 + metric{label="2"} 1 + `) + require.NoError(t, err) + defer suite.Close() + + require.NoError(t, suite.Run()) + + tests := []struct { + limit int + err string + }{ + { + limit: 0, + }, + { + limit: -1, + }, + { + limit: 2, + }, + { + limit: 1, + err: "exceeded limit of 1 with 2 series", + }, + } + + expr, _ := parser.ParseExpr(`metric > 0`) + rule := NewRecordingRule( + "foo", + expr, + labels.FromStrings("test", "test"), + ) + + evalTime := time.Unix(0, 0) + + for _, test := range tests { + _, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, test.limit) + if err != nil { + require.EqualError(t, err, test.err) + } else if test.err != "" { + t.Errorf("Expected error %s, got none", test.err) + } + } +}