mirror of
https://github.com/prometheus/prometheus.git
synced 2025-01-11 22:07:27 -08:00
Rule alerts/series limit updates (#9541)
* Add docs and do not limit inactive alerts. Signed-off-by: Levi Harrison <git@leviharrison.dev>
This commit is contained in:
parent
8c3eca84db
commit
d81bbe154d
|
@ -78,8 +78,8 @@ name: <string>
|
||||||
# How often rules in the group are evaluated.
|
# How often rules in the group are evaluated.
|
||||||
[ interval: <duration> | default = global.evaluation_interval ]
|
[ interval: <duration> | default = global.evaluation_interval ]
|
||||||
|
|
||||||
# Limit the number of alerts and series individual rules can produce.
|
# Limit the number of alerts an alerting rule and series a recording
|
||||||
# 0 is no limit.
|
# rule can produce. 0 is no limit.
|
||||||
[ limit: <int> | default = 0 ]
|
[ limit: <int> | default = 0 ]
|
||||||
|
|
||||||
rules:
|
rules:
|
||||||
|
@ -128,3 +128,11 @@ annotations:
|
||||||
[ <labelname>: <tmpl_string> ]
|
[ <labelname>: <tmpl_string> ]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
# Limiting alerts and series
|
||||||
|
|
||||||
|
A limit for alerts produced by alerting rules and series produced by recording rules
|
||||||
|
can be configured per-group. When the limit is exceeded, _all_ series produced
|
||||||
|
by the rule are discarded, and if it's an alerting rule, _all_ alerts for
|
||||||
|
the rule, active, pending, or inactive, are cleared as well. The event will be
|
||||||
|
recorded as an error in the evaluation, and as such no stale markers are
|
||||||
|
written.
|
||||||
|
|
|
@ -389,6 +389,7 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
|
||||||
r.active[h] = a
|
r.active[h] = a
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var numActivePending int
|
||||||
// Check if any pending alerts should be removed or fire now. Write out alert timeseries.
|
// Check if any pending alerts should be removed or fire now. Write out alert timeseries.
|
||||||
for fp, a := range r.active {
|
for fp, a := range r.active {
|
||||||
if _, ok := resultFPs[fp]; !ok {
|
if _, ok := resultFPs[fp]; !ok {
|
||||||
|
@ -403,6 +404,7 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
|
||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
numActivePending++
|
||||||
|
|
||||||
if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration {
|
if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration {
|
||||||
a.State = StateFiring
|
a.State = StateFiring
|
||||||
|
@ -415,10 +417,9 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
numActive := len(r.active)
|
if limit > 0 && numActivePending > limit {
|
||||||
if limit != 0 && numActive > limit {
|
|
||||||
r.active = map[uint64]*Alert{}
|
r.active = map[uint64]*Alert{}
|
||||||
return nil, errors.Errorf("exceeded limit of %d with %d alerts", limit, numActive)
|
return nil, errors.Errorf("exceeded limit of %d with %d alerts", limit, numActivePending)
|
||||||
}
|
}
|
||||||
|
|
||||||
return vec, nil
|
return vec, nil
|
||||||
|
|
|
@ -466,23 +466,17 @@ func TestAlertingRuleDuplicate(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestAlertingRuleLimit(t *testing.T) {
|
func TestAlertingRuleLimit(t *testing.T) {
|
||||||
storage := teststorage.New(t)
|
suite, err := promql.NewTest(t, `
|
||||||
defer storage.Close()
|
load 1m
|
||||||
|
metric{label="1"} 1
|
||||||
|
metric{label="2"} 1
|
||||||
|
`)
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer suite.Close()
|
||||||
|
|
||||||
opts := promql.EngineOpts{
|
require.NoError(t, suite.Run())
|
||||||
Logger: nil,
|
|
||||||
Reg: nil,
|
|
||||||
MaxSamples: 10,
|
|
||||||
Timeout: 10 * time.Second,
|
|
||||||
}
|
|
||||||
|
|
||||||
engine := promql.NewEngine(opts)
|
tests := []struct {
|
||||||
ctx, cancelCtx := context.WithCancel(context.Background())
|
|
||||||
defer cancelCtx()
|
|
||||||
|
|
||||||
now := time.Now()
|
|
||||||
|
|
||||||
suite := []struct {
|
|
||||||
limit int
|
limit int
|
||||||
err string
|
err string
|
||||||
}{
|
}{
|
||||||
|
@ -490,16 +484,18 @@ func TestAlertingRuleLimit(t *testing.T) {
|
||||||
limit: 0,
|
limit: 0,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
limit: 1,
|
limit: -1,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
limit: -1,
|
limit: 2,
|
||||||
err: "exceeded limit of -1 with 1 alerts",
|
},
|
||||||
|
{
|
||||||
|
limit: 1,
|
||||||
|
err: "exceeded limit of 1 with 2 alerts",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, test := range suite {
|
expr, _ := parser.ParseExpr(`metric > 0`)
|
||||||
expr, _ := parser.ParseExpr(`1`)
|
|
||||||
rule := NewAlertingRule(
|
rule := NewAlertingRule(
|
||||||
"foo",
|
"foo",
|
||||||
expr,
|
expr,
|
||||||
|
@ -510,11 +506,15 @@ func TestAlertingRuleLimit(t *testing.T) {
|
||||||
"",
|
"",
|
||||||
true, log.NewNopLogger(),
|
true, log.NewNopLogger(),
|
||||||
)
|
)
|
||||||
_, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, test.limit)
|
|
||||||
if test.err == "" {
|
evalTime := time.Unix(0, 0)
|
||||||
require.NoError(t, err)
|
|
||||||
} else {
|
for _, test := range tests {
|
||||||
require.Equal(t, test.err, err.Error())
|
_, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, test.limit)
|
||||||
|
if err != nil {
|
||||||
|
require.EqualError(t, err, test.err)
|
||||||
|
} else if test.err != "" {
|
||||||
|
t.Errorf("Expected errror %s, got none", test.err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -99,9 +99,9 @@ func (rule *RecordingRule) Eval(ctx context.Context, ts time.Time, query QueryFu
|
||||||
return nil, fmt.Errorf("vector contains metrics with the same labelset after applying rule labels")
|
return nil, fmt.Errorf("vector contains metrics with the same labelset after applying rule labels")
|
||||||
}
|
}
|
||||||
|
|
||||||
numSamples := len(vector)
|
numSeries := len(vector)
|
||||||
if limit != 0 && numSamples > limit {
|
if limit > 0 && numSeries > limit {
|
||||||
return nil, fmt.Errorf("exceeded limit %d with %d samples", limit, numSamples)
|
return nil, fmt.Errorf("exceeded limit of %d with %d series", limit, numSeries)
|
||||||
}
|
}
|
||||||
|
|
||||||
rule.SetHealth(HealthGood)
|
rule.SetHealth(HealthGood)
|
||||||
|
|
|
@ -49,7 +49,6 @@ func TestRuleEval(t *testing.T) {
|
||||||
name string
|
name string
|
||||||
expr parser.Expr
|
expr parser.Expr
|
||||||
labels labels.Labels
|
labels labels.Labels
|
||||||
limit int
|
|
||||||
result promql.Vector
|
result promql.Vector
|
||||||
err string
|
err string
|
||||||
}{
|
}{
|
||||||
|
@ -71,38 +70,11 @@ func TestRuleEval(t *testing.T) {
|
||||||
Point: promql.Point{V: 1, T: timestamp.FromTime(now)},
|
Point: promql.Point{V: 1, T: timestamp.FromTime(now)},
|
||||||
}},
|
}},
|
||||||
},
|
},
|
||||||
{
|
|
||||||
name: "underlimit",
|
|
||||||
expr: &parser.NumberLiteral{Val: 1},
|
|
||||||
labels: labels.FromStrings("foo", "bar"),
|
|
||||||
limit: 2,
|
|
||||||
result: promql.Vector{promql.Sample{
|
|
||||||
Metric: labels.FromStrings("__name__", "underlimit", "foo", "bar"),
|
|
||||||
Point: promql.Point{V: 1, T: timestamp.FromTime(now)},
|
|
||||||
}},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "atlimit",
|
|
||||||
expr: &parser.NumberLiteral{Val: 1},
|
|
||||||
labels: labels.FromStrings("foo", "bar"),
|
|
||||||
limit: 1,
|
|
||||||
result: promql.Vector{promql.Sample{
|
|
||||||
Metric: labels.FromStrings("__name__", "atlimit", "foo", "bar"),
|
|
||||||
Point: promql.Point{V: 1, T: timestamp.FromTime(now)},
|
|
||||||
}},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "overlimit",
|
|
||||||
expr: &parser.NumberLiteral{Val: 1},
|
|
||||||
labels: labels.FromStrings("foo", "bar"),
|
|
||||||
limit: -1,
|
|
||||||
err: "exceeded limit -1 with 1 samples",
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, test := range suite {
|
for _, test := range suite {
|
||||||
rule := NewRecordingRule(test.name, test.expr, test.labels)
|
rule := NewRecordingRule(test.name, test.expr, test.labels)
|
||||||
result, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, test.limit)
|
result, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, 0)
|
||||||
if test.err == "" {
|
if test.err == "" {
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
} else {
|
} else {
|
||||||
|
@ -151,3 +123,52 @@ func TestRuleEvalDuplicate(t *testing.T) {
|
||||||
require.Error(t, err)
|
require.Error(t, err)
|
||||||
require.EqualError(t, err, "vector contains metrics with the same labelset after applying rule labels")
|
require.EqualError(t, err, "vector contains metrics with the same labelset after applying rule labels")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRecordingRuleLimit(t *testing.T) {
|
||||||
|
suite, err := promql.NewTest(t, `
|
||||||
|
load 1m
|
||||||
|
metric{label="1"} 1
|
||||||
|
metric{label="2"} 1
|
||||||
|
`)
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer suite.Close()
|
||||||
|
|
||||||
|
require.NoError(t, suite.Run())
|
||||||
|
|
||||||
|
tests := []struct {
|
||||||
|
limit int
|
||||||
|
err string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
limit: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
limit: -1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
limit: 2,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
limit: 1,
|
||||||
|
err: "exceeded limit of 1 with 2 series",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
expr, _ := parser.ParseExpr(`metric > 0`)
|
||||||
|
rule := NewRecordingRule(
|
||||||
|
"foo",
|
||||||
|
expr,
|
||||||
|
labels.FromStrings("test", "test"),
|
||||||
|
)
|
||||||
|
|
||||||
|
evalTime := time.Unix(0, 0)
|
||||||
|
|
||||||
|
for _, test := range tests {
|
||||||
|
_, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, test.limit)
|
||||||
|
if err != nil {
|
||||||
|
require.EqualError(t, err, test.err)
|
||||||
|
} else if test.err != "" {
|
||||||
|
t.Errorf("Expected error %s, got none", test.err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue