From d81bbe154d8768df66004eeef8546e5b733edde8 Mon Sep 17 00:00:00 2001
From: Levi Harrison <git@leviharrison.dev>
Date: Thu, 21 Oct 2021 17:14:17 -0400
Subject: [PATCH] Rule alerts/series limit updates (#9541)

* Add docs and do not limit inactive alerts.

Signed-off-by: Levi Harrison <git@leviharrison.dev>
---
 docs/configuration/recording_rules.md | 12 +++-
 rules/alerting.go                     |  7 ++-
 rules/alerting_test.go                | 70 ++++++++++++------------
 rules/recording.go                    |  6 +-
 rules/recording_test.go               | 79 +++++++++++++++++----------
 5 files changed, 102 insertions(+), 72 deletions(-)
diff --git a/docs/configuration/recording_rules.md b/docs/configuration/recording_rules.md
index 8b12632dc..a14aab09d 100644
--- a/docs/configuration/recording_rules.md
+++ b/docs/configuration/recording_rules.md
@@ -78,8 +78,8 @@ name: <string>
 # How often rules in the group are evaluated.
 [ interval: <duration> | default = global.evaluation_interval ]
 
-# Limit the number of alerts and series individual rules can produce.
-# 0 is no limit.
+# Limit the number of alerts an alerting rule and series a recording
+# rule can produce. 0 is no limit.
 [ limit: <int> | default = 0 ]
 
 rules:
@@ -128,3 +128,11 @@ annotations:
   [ <labelname>: <tmpl_string> ]
 ```
 
+# Limiting alerts and series
+
+A limit for alerts produced by alerting rules and series produced recording rules
+can be configured per-group. When the limit is exceeded, _all_ series produced
+by the rule are discarded, and if it's an alerting rule, _all_ alerts for
+the rule, active, pending, or inactive, are cleared as well. The event will be
+recorded as an error in the evaluation, and as such no stale markers are
+written.
diff --git a/rules/alerting.go b/rules/alerting.go
index 5e7b8975c..28abcf2a5 100644
--- a/rules/alerting.go
+++ b/rules/alerting.go
@@ -389,6 +389,7 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
 		r.active[h] = a
 	}
 
+	var numActivePending int
 	// Check if any pending alerts should be removed or fire now. Write out alert timeseries.
 	for fp, a := range r.active {
 		if _, ok := resultFPs[fp]; !ok {
@@ -403,6 +404,7 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
 			}
 			continue
 		}
+		numActivePending++
 
 		if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration {
 			a.State = StateFiring
@@ -415,10 +417,9 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
 		}
 	}
 
-	numActive := len(r.active)
-	if limit != 0 && numActive > limit {
+	if limit > 0 && numActivePending > limit {
 		r.active = map[uint64]*Alert{}
-		return nil, errors.Errorf("exceeded limit of %d with %d alerts", limit, numActive)
+		return nil, errors.Errorf("exceeded limit of %d with %d alerts", limit, numActivePending)
 	}
 
 	return vec, nil
diff --git a/rules/alerting_test.go b/rules/alerting_test.go
index 94d505445..42882f4f6 100644
--- a/rules/alerting_test.go
+++ b/rules/alerting_test.go
@@ -466,23 +466,17 @@ func TestAlertingRuleDuplicate(t *testing.T) {
 }
 
 func TestAlertingRuleLimit(t *testing.T) {
-	storage := teststorage.New(t)
-	defer storage.Close()
+	suite, err := promql.NewTest(t, `
+		load 1m
+			metric{label="1"} 1
+			metric{label="2"} 1
+	`)
+	require.NoError(t, err)
+	defer suite.Close()
 
-	opts := promql.EngineOpts{
-		Logger:     nil,
-		Reg:        nil,
-		MaxSamples: 10,
-		Timeout:    10 * time.Second,
-	}
+	require.NoError(t, suite.Run())
 
-	engine := promql.NewEngine(opts)
-	ctx, cancelCtx := context.WithCancel(context.Background())
-	defer cancelCtx()
-
-	now := time.Now()
-
-	suite := []struct {
+	tests := []struct {
 		limit int
 		err   string
 	}{
@@ -490,31 +484,37 @@ func TestAlertingRuleLimit(t *testing.T) {
 			limit: 0,
 		},
 		{
-			limit: 1,
+			limit: -1,
 		},
 		{
-			limit: -1,
-			err:   "exceeded limit of -1 with 1 alerts",
+			limit: 2,
+		},
+		{
+			limit: 1,
+			err:   "exceeded limit of 1 with 2 alerts",
 		},
 	}
 
-	for _, test := range suite {
-		expr, _ := parser.ParseExpr(`1`)
-		rule := NewAlertingRule(
-			"foo",
-			expr,
-			time.Minute,
-			labels.FromStrings("test", "test"),
-			nil,
-			nil,
-			"",
-			true, log.NewNopLogger(),
-		)
-		_, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, test.limit)
-		if test.err == "" {
-			require.NoError(t, err)
-		} else {
-			require.Equal(t, test.err, err.Error())
+	expr, _ := parser.ParseExpr(`metric > 0`)
+	rule := NewAlertingRule(
+		"foo",
+		expr,
+		time.Minute,
+		labels.FromStrings("test", "test"),
+		nil,
+		nil,
+		"",
+		true, log.NewNopLogger(),
+	)
+
+	evalTime := time.Unix(0, 0)
+
+	for _, test := range tests {
+		_, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, test.limit)
+		if err != nil {
+			require.EqualError(t, err, test.err)
+		} else if test.err != "" {
+			t.Errorf("Expected errror %s, got none", test.err)
 		}
 	}
 }
diff --git a/rules/recording.go b/rules/recording.go
index 08a7e37ca..0081690b8 100644
--- a/rules/recording.go
+++ b/rules/recording.go
@@ -99,9 +99,9 @@ func (rule *RecordingRule) Eval(ctx context.Context, ts time.Time, query QueryFu
 		return nil, fmt.Errorf("vector contains metrics with the same labelset after applying rule labels")
 	}
 
-	numSamples := len(vector)
-	if limit != 0 && numSamples > limit {
-		return nil, fmt.Errorf("exceeded limit %d with %d samples", limit, numSamples)
+	numSeries := len(vector)
+	if limit > 0 && numSeries > limit {
+		return nil, fmt.Errorf("exceeded limit of %d with %d series", limit, numSeries)
 	}
 
 	rule.SetHealth(HealthGood)
diff --git a/rules/recording_test.go b/rules/recording_test.go
index 211b6cda6..d6b0b6c2a 100644
--- a/rules/recording_test.go
+++ b/rules/recording_test.go
@@ -49,7 +49,6 @@ func TestRuleEval(t *testing.T) {
 		name   string
 		expr   parser.Expr
 		labels labels.Labels
-		limit  int
 		result promql.Vector
 		err    string
 	}{
@@ -71,38 +70,11 @@ func TestRuleEval(t *testing.T) {
 				Point:  promql.Point{V: 1, T: timestamp.FromTime(now)},
 			}},
 		},
-		{
-			name:   "underlimit",
-			expr:   &parser.NumberLiteral{Val: 1},
-			labels: labels.FromStrings("foo", "bar"),
-			limit:  2,
-			result: promql.Vector{promql.Sample{
-				Metric: labels.FromStrings("__name__", "underlimit", "foo", "bar"),
-				Point:  promql.Point{V: 1, T: timestamp.FromTime(now)},
-			}},
-		},
-		{
-			name:   "atlimit",
-			expr:   &parser.NumberLiteral{Val: 1},
-			labels: labels.FromStrings("foo", "bar"),
-			limit:  1,
-			result: promql.Vector{promql.Sample{
-				Metric: labels.FromStrings("__name__", "atlimit", "foo", "bar"),
-				Point:  promql.Point{V: 1, T: timestamp.FromTime(now)},
-			}},
-		},
-		{
-			name:   "overlimit",
-			expr:   &parser.NumberLiteral{Val: 1},
-			labels: labels.FromStrings("foo", "bar"),
-			limit:  -1,
-			err:    "exceeded limit -1 with 1 samples",
-		},
 	}
 
 	for _, test := range suite {
 		rule := NewRecordingRule(test.name, test.expr, test.labels)
-		result, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, test.limit)
+		result, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, 0)
 		if test.err == "" {
 			require.NoError(t, err)
 		} else {
@@ -151,3 +123,52 @@ func TestRuleEvalDuplicate(t *testing.T) {
 	require.Error(t, err)
 	require.EqualError(t, err, "vector contains metrics with the same labelset after applying rule labels")
 }
+
+func TestRecordingRuleLimit(t *testing.T) {
+	suite, err := promql.NewTest(t, `
+		load 1m
+			metric{label="1"} 1
+			metric{label="2"} 1
+	`)
+	require.NoError(t, err)
+	defer suite.Close()
+
+	require.NoError(t, suite.Run())
+
+	tests := []struct {
+		limit int
+		err   string
+	}{
+		{
+			limit: 0,
+		},
+		{
+			limit: -1,
+		},
+		{
+			limit: 2,
+		},
+		{
+			limit: 1,
+			err:   "exceeded limit of 1 with 2 series",
+		},
+	}
+
+	expr, _ := parser.ParseExpr(`metric > 0`)
+	rule := NewRecordingRule(
+		"foo",
+		expr,
+		labels.FromStrings("test", "test"),
+	)
+
+	evalTime := time.Unix(0, 0)
+
+	for _, test := range tests {
+		_, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, test.limit)
+		if err != nil {
+			require.EqualError(t, err, test.err)
+		} else if test.err != "" {
+			t.Errorf("Expected error %s, got none", test.err)
+		}
+	}
+}