Rule Manager: Only query once per alert rule when restoring alert state

Prometheus restores alert state between restarts and updates. For each rule, it looks at the alerts that are meant to be active and then queries the `ALERTS_FOR_STATE` series for _each_ alert within the rules.

If the alert rule has 120 instances (or series), it'll execute the same query 120 times, each time with slightly different labels.

This PR changes the approach so that we only query once per alert rule and then match the corresponding alert that we're about to restore against the series-set. While the approach might use a bit more memory at start-up (if at all), the restore process is only run once per restart, so I'd consider this a big win.

This builds on top of #13974

Signed-off-by: gotjosh <josue.abreu@gmail.com>
This commit is contained in:
gotjosh 2024-04-23 19:40:10 +01:00
parent 4ac78063ee
commit 4daaa59c08
No known key found for this signature in database
GPG key ID: A6E1DDE38FF3C74E
2 changed files with 36 additions and 29 deletions

View file

@ -246,13 +246,16 @@ func (r *AlertingRule) sample(alert *Alert, ts time.Time) promql.Sample {
return s
}
// forStateSample returns the sample for ALERTS_FOR_STATE.
// forStateSample returns a promql.Sample with the rule labels, `ALERTS_FOR_STATE` as the metric name and the rule name as the `alertname` label.
// Optionally, if an alert is provided it'll copy the labels of the alert into the sample labels.
func (r *AlertingRule) forStateSample(alert *Alert, ts time.Time, v float64) promql.Sample {
lb := labels.NewBuilder(r.labels)
alert.Labels.Range(func(l labels.Label) {
lb.Set(l.Name, l.Value)
})
if alert != nil {
alert.Labels.Range(func(l labels.Label) {
lb.Set(l.Name, l.Value)
})
}
lb.Set(labels.MetricName, alertForStateMetricName)
lb.Set(labels.AlertName, r.name)
@ -265,9 +268,11 @@ func (r *AlertingRule) forStateSample(alert *Alert, ts time.Time, v float64) pro
return s
}
// QueryforStateSeries returns the series for ALERTS_FOR_STATE.
func (r *AlertingRule) QueryforStateSeries(ctx context.Context, alert *Alert, q storage.Querier) (storage.Series, error) {
smpl := r.forStateSample(alert, time.Now(), 0)
// QueryforStateSeries returns the series for ALERTS_FOR_STATE of the alert rule.
func (r *AlertingRule) QueryforStateSeries(ctx context.Context, q storage.Querier) (storage.SeriesSet, error) {
// We use a sample to ease the building of matchers.
// Don't provide an alert as we want matchers that match all series for the alert rule.
smpl := r.forStateSample(nil, time.Now(), 0)
var matchers []*labels.Matcher
smpl.Metric.Range(func(l labels.Label) {
mt, err := labels.NewMatcher(labels.MatchEqual, l.Name, l.Value)
@ -278,18 +283,7 @@ func (r *AlertingRule) QueryforStateSeries(ctx context.Context, alert *Alert, q
})
sset := q.Select(ctx, false, nil, matchers...)
var s storage.Series
for sset.Next() {
// Query assures that smpl.Metric is included in sset.At().Labels(),
// hence just checking the length would act like equality.
// (This is faster than calling labels.Compare again as we already have some info).
if sset.At().Labels().Len() == len(matchers) {
s = sset.At()
break
}
}
return s, sset.Err()
return sset, sset.Err()
}
// SetEvaluationDuration updates evaluationDuration to the duration it took to evaluate the rule on its last evaluation.

View file

@ -664,19 +664,32 @@ func (g *Group) RestoreForState(ts time.Time) {
continue
}
sset, err := alertRule.QueryforStateSeries(g.opts.Context, q)
if err != nil {
level.Error(g.logger).Log(
"msg", "Failed to restore 'for' state",
labels.AlertName, alertRule.Name(),
"stage", "Select",
"err", err,
)
continue
}
// No results for this alert rule.
if err == nil {
level.Debug(g.logger).Log("msg", "Failed to find a series to restore the 'for' state", labels.AlertName, alertRule.Name())
continue
}
alertRule.ForEachActiveAlert(func(a *Alert) {
var s storage.Series
s, err := alertRule.QueryforStateSeries(g.opts.Context, a, q)
if err != nil {
// Querier Warnings are ignored. We do not care unless we have an error.
level.Error(g.logger).Log(
"msg", "Failed to restore 'for' state",
labels.AlertName, alertRule.Name(),
"stage", "Select",
"err", err,
)
return
// Find the series for the given alert from the set.
for sset.Next() {
if sset.At().Labels().Hash() == a.Labels.Hash() {
s = sset.At()
break
}
}
if s == nil {