Rule alerts/series limit updates (#9541)

* Add docs and do not limit inactive alerts.

Signed-off-by: Levi Harrison <git@leviharrison.dev>
This commit is contained in:
Levi Harrison 2021-10-21 17:14:17 -04:00 committed by GitHub
parent 8c3eca84db
commit d81bbe154d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 102 additions and 72 deletions

View file

@ -78,8 +78,8 @@ name: <string>
# How often rules in the group are evaluated.
[ interval: <duration> | default = global.evaluation_interval ]
# Limit the number of alerts and series individual rules can produce.
# 0 is no limit.
# Limit the number of alerts an alerting rule and series a recording
# rule can produce. 0 is no limit.
[ limit: <int> | default = 0 ]
rules:
@ -128,3 +128,11 @@ annotations:
[ <labelname>: <tmpl_string> ]
```
# Limiting alerts and series
A limit for alerts produced by alerting rules and series produced by recording rules
can be configured per-group. When the limit is exceeded, _all_ series produced
by the rule are discarded, and if it's an alerting rule, _all_ alerts for
the rule, active, pending, or inactive, are cleared as well. The event will be
recorded as an error in the evaluation, and as such no stale markers are
written.

View file

@ -389,6 +389,7 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
r.active[h] = a
}
var numActivePending int
// Check if any pending alerts should be removed or fire now. Write out alert timeseries.
for fp, a := range r.active {
if _, ok := resultFPs[fp]; !ok {
@ -403,6 +404,7 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
}
continue
}
numActivePending++
if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration {
a.State = StateFiring
@ -415,10 +417,9 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
}
}
numActive := len(r.active)
if limit != 0 && numActive > limit {
if limit > 0 && numActivePending > limit {
r.active = map[uint64]*Alert{}
return nil, errors.Errorf("exceeded limit of %d with %d alerts", limit, numActive)
return nil, errors.Errorf("exceeded limit of %d with %d alerts", limit, numActivePending)
}
return vec, nil

View file

@ -466,23 +466,17 @@ func TestAlertingRuleDuplicate(t *testing.T) {
}
func TestAlertingRuleLimit(t *testing.T) {
storage := teststorage.New(t)
defer storage.Close()
suite, err := promql.NewTest(t, `
load 1m
metric{label="1"} 1
metric{label="2"} 1
`)
require.NoError(t, err)
defer suite.Close()
opts := promql.EngineOpts{
Logger: nil,
Reg: nil,
MaxSamples: 10,
Timeout: 10 * time.Second,
}
require.NoError(t, suite.Run())
engine := promql.NewEngine(opts)
ctx, cancelCtx := context.WithCancel(context.Background())
defer cancelCtx()
now := time.Now()
suite := []struct {
tests := []struct {
limit int
err string
}{
@ -490,16 +484,18 @@ func TestAlertingRuleLimit(t *testing.T) {
limit: 0,
},
{
limit: 1,
limit: -1,
},
{
limit: -1,
err: "exceeded limit of -1 with 1 alerts",
limit: 2,
},
{
limit: 1,
err: "exceeded limit of 1 with 2 alerts",
},
}
for _, test := range suite {
expr, _ := parser.ParseExpr(`1`)
expr, _ := parser.ParseExpr(`metric > 0`)
rule := NewAlertingRule(
"foo",
expr,
@ -510,11 +506,15 @@ func TestAlertingRuleLimit(t *testing.T) {
"",
true, log.NewNopLogger(),
)
_, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, test.limit)
if test.err == "" {
require.NoError(t, err)
} else {
require.Equal(t, test.err, err.Error())
evalTime := time.Unix(0, 0)
for _, test := range tests {
_, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, test.limit)
if err != nil {
require.EqualError(t, err, test.err)
} else if test.err != "" {
t.Errorf("Expected errror %s, got none", test.err)
}
}
}

View file

@ -99,9 +99,9 @@ func (rule *RecordingRule) Eval(ctx context.Context, ts time.Time, query QueryFu
return nil, fmt.Errorf("vector contains metrics with the same labelset after applying rule labels")
}
numSamples := len(vector)
if limit != 0 && numSamples > limit {
return nil, fmt.Errorf("exceeded limit %d with %d samples", limit, numSamples)
numSeries := len(vector)
if limit > 0 && numSeries > limit {
return nil, fmt.Errorf("exceeded limit of %d with %d series", limit, numSeries)
}
rule.SetHealth(HealthGood)

View file

@ -49,7 +49,6 @@ func TestRuleEval(t *testing.T) {
name string
expr parser.Expr
labels labels.Labels
limit int
result promql.Vector
err string
}{
@ -71,38 +70,11 @@ func TestRuleEval(t *testing.T) {
Point: promql.Point{V: 1, T: timestamp.FromTime(now)},
}},
},
{
name: "underlimit",
expr: &parser.NumberLiteral{Val: 1},
labels: labels.FromStrings("foo", "bar"),
limit: 2,
result: promql.Vector{promql.Sample{
Metric: labels.FromStrings("__name__", "underlimit", "foo", "bar"),
Point: promql.Point{V: 1, T: timestamp.FromTime(now)},
}},
},
{
name: "atlimit",
expr: &parser.NumberLiteral{Val: 1},
labels: labels.FromStrings("foo", "bar"),
limit: 1,
result: promql.Vector{promql.Sample{
Metric: labels.FromStrings("__name__", "atlimit", "foo", "bar"),
Point: promql.Point{V: 1, T: timestamp.FromTime(now)},
}},
},
{
name: "overlimit",
expr: &parser.NumberLiteral{Val: 1},
labels: labels.FromStrings("foo", "bar"),
limit: -1,
err: "exceeded limit -1 with 1 samples",
},
}
for _, test := range suite {
rule := NewRecordingRule(test.name, test.expr, test.labels)
result, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, test.limit)
result, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, 0)
if test.err == "" {
require.NoError(t, err)
} else {
@ -151,3 +123,52 @@ func TestRuleEvalDuplicate(t *testing.T) {
require.Error(t, err)
require.EqualError(t, err, "vector contains metrics with the same labelset after applying rule labels")
}
// TestRecordingRuleLimit checks how RecordingRule.Eval applies the series
// limit it is given.
//
// The fixture loads two series, so the rule expression `metric > 0` yields
// two series per evaluation. The table below expects:
//   - limit 0 and limit -1: no error,
//   - limit 2 (equal to the number of series): no error,
//   - limit 1: the error "exceeded limit of 1 with 2 series".
func TestRecordingRuleLimit(t *testing.T) {
	suite, err := promql.NewTest(t, `
load 1m
metric{label="1"} 1
metric{label="2"} 1
`)
	require.NoError(t, err)
	defer suite.Close()

	require.NoError(t, suite.Run())

	tests := []struct {
		limit int
		err   string // Expected error message; empty means no error is expected.
	}{
		{
			limit: 0,
		},
		{
			limit: -1,
		},
		{
			limit: 2,
		},
		{
			limit: 1,
			err:   "exceeded limit of 1 with 2 series",
		},
	}

	// Fail right away on a bad expression instead of silently dropping the
	// parse error and evaluating a rule built from a nil expression.
	expr, err := parser.ParseExpr(`metric > 0`)
	require.NoError(t, err)

	rule := NewRecordingRule(
		"foo",
		expr,
		labels.FromStrings("test", "test"),
	)

	evalTime := time.Unix(0, 0)

	for _, test := range tests {
		_, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, test.limit)
		if test.err == "" {
			require.NoError(t, err)
			continue
		}
		require.EqualError(t, err, test.err)
	}
}