mirror of
https://github.com/prometheus/prometheus.git
synced 2025-01-26 21:22:33 -08:00
Rule Manager: Only query once per alert rule when restoring alert state
Prometheus restores alert state between restarts and updates. For each rule, it looks at the alerts that are meant to be active and then queries the `ALERTS_FOR_STATE` series for _each_ alert within the rules. If the alert rule has 120 instances (or series) it'll execute the same query with slightly different labels. This PR changes the approach so that we only query once per alert rule and then match the corresponding alert that we're about to restore against the series-set. While the approach might use a bit more memory at start-up (if even?) the restore proccess is only ran once per restart so I'd consider this a big win. This builds on top of #13974 Signed-off-by: gotjosh <josue.abreu@gmail.com>
This commit is contained in:
parent
4ac78063ee
commit
4daaa59c08
|
@ -246,13 +246,16 @@ func (r *AlertingRule) sample(alert *Alert, ts time.Time) promql.Sample {
|
|||
return s
|
||||
}
|
||||
|
||||
// forStateSample returns the sample for ALERTS_FOR_STATE.
|
||||
// forStateSample returns a promql.Sample with the rule labels, `ALERTS_FOR_STATE` as the metric name and the rule name as the `alertname` label.
|
||||
// Optionally, if an alert is provided it'll copy the labels of the alert into the sample labels.
|
||||
func (r *AlertingRule) forStateSample(alert *Alert, ts time.Time, v float64) promql.Sample {
|
||||
lb := labels.NewBuilder(r.labels)
|
||||
|
||||
alert.Labels.Range(func(l labels.Label) {
|
||||
lb.Set(l.Name, l.Value)
|
||||
})
|
||||
if alert != nil {
|
||||
alert.Labels.Range(func(l labels.Label) {
|
||||
lb.Set(l.Name, l.Value)
|
||||
})
|
||||
}
|
||||
|
||||
lb.Set(labels.MetricName, alertForStateMetricName)
|
||||
lb.Set(labels.AlertName, r.name)
|
||||
|
@ -265,9 +268,11 @@ func (r *AlertingRule) forStateSample(alert *Alert, ts time.Time, v float64) pro
|
|||
return s
|
||||
}
|
||||
|
||||
// QueryforStateSeries returns the series for ALERTS_FOR_STATE.
|
||||
func (r *AlertingRule) QueryforStateSeries(ctx context.Context, alert *Alert, q storage.Querier) (storage.Series, error) {
|
||||
smpl := r.forStateSample(alert, time.Now(), 0)
|
||||
// QueryforStateSeries returns the series for ALERTS_FOR_STATE of the alert rule.
|
||||
func (r *AlertingRule) QueryforStateSeries(ctx context.Context, q storage.Querier) (storage.SeriesSet, error) {
|
||||
// We use a sample to ease the building of matchers.
|
||||
// Don't provide an alert as we want matchers that match all series for the alert rule.
|
||||
smpl := r.forStateSample(nil, time.Now(), 0)
|
||||
var matchers []*labels.Matcher
|
||||
smpl.Metric.Range(func(l labels.Label) {
|
||||
mt, err := labels.NewMatcher(labels.MatchEqual, l.Name, l.Value)
|
||||
|
@ -278,18 +283,7 @@ func (r *AlertingRule) QueryforStateSeries(ctx context.Context, alert *Alert, q
|
|||
})
|
||||
sset := q.Select(ctx, false, nil, matchers...)
|
||||
|
||||
var s storage.Series
|
||||
for sset.Next() {
|
||||
// Query assures that smpl.Metric is included in sset.At().Labels(),
|
||||
// hence just checking the length would act like equality.
|
||||
// (This is faster than calling labels.Compare again as we already have some info).
|
||||
if sset.At().Labels().Len() == len(matchers) {
|
||||
s = sset.At()
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return s, sset.Err()
|
||||
return sset, sset.Err()
|
||||
}
|
||||
|
||||
// SetEvaluationDuration updates evaluationDuration to the duration it took to evaluate the rule on its last evaluation.
|
||||
|
|
|
@ -664,19 +664,32 @@ func (g *Group) RestoreForState(ts time.Time) {
|
|||
continue
|
||||
}
|
||||
|
||||
sset, err := alertRule.QueryforStateSeries(g.opts.Context, q)
|
||||
if err != nil {
|
||||
level.Error(g.logger).Log(
|
||||
"msg", "Failed to restore 'for' state",
|
||||
labels.AlertName, alertRule.Name(),
|
||||
"stage", "Select",
|
||||
"err", err,
|
||||
)
|
||||
continue
|
||||
}
|
||||
|
||||
// No results for this alert rule.
|
||||
if err == nil {
|
||||
level.Debug(g.logger).Log("msg", "Failed to find a series to restore the 'for' state", labels.AlertName, alertRule.Name())
|
||||
continue
|
||||
}
|
||||
|
||||
alertRule.ForEachActiveAlert(func(a *Alert) {
|
||||
var s storage.Series
|
||||
|
||||
s, err := alertRule.QueryforStateSeries(g.opts.Context, a, q)
|
||||
if err != nil {
|
||||
// Querier Warnings are ignored. We do not care unless we have an error.
|
||||
level.Error(g.logger).Log(
|
||||
"msg", "Failed to restore 'for' state",
|
||||
labels.AlertName, alertRule.Name(),
|
||||
"stage", "Select",
|
||||
"err", err,
|
||||
)
|
||||
return
|
||||
// Find the series for the given alert from the set.
|
||||
for sset.Next() {
|
||||
if sset.At().Labels().Hash() == a.Labels.Hash() {
|
||||
s = sset.At()
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if s == nil {
|
||||
|
|
Loading…
Reference in a new issue