From e7219e3d366bc912381577e7eb5ec256fc13f275 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 23 Apr 2024 09:54:21 +0100 Subject: [PATCH] Rule Manager: Add `rule_group_last_restore_duration_seconds` to measure restore time per rule group When a rule group changes or prometheus is restarted we need to ensure we restore the active alerts that were firing for a corresponding rule, for that Prometheus uses the `ALERTS_FOR_STATE` series to query the previous state and restore it. If a given rule has high cardinality (think 100s of 1000s for series) this proccess can take a bit of time - this is the first of a series of PRs to improve this problem and I'd like to start with exposing the time it takes to restore a rule group as a gauge. Signed-off-by: gotjosh --- rules/group.go | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/rules/group.go b/rules/group.go index c268d2df7..aafab5544 100644 --- a/rules/group.go +++ b/rules/group.go @@ -230,7 +230,9 @@ func (g *Group) run(ctx context.Context) { g.evalIterationFunc(ctx, g, evalTimestamp) } - g.RestoreForState(time.Now()) + now := time.Now() + g.RestoreForState(now) + g.metrics.GroupLastRestoreDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(time.Since(now).Seconds()) g.shouldRestore = false } @@ -779,17 +781,18 @@ const namespace = "prometheus" // Metrics for rule evaluation. type Metrics struct { - EvalDuration prometheus.Summary - IterationDuration prometheus.Summary - IterationsMissed *prometheus.CounterVec - IterationsScheduled *prometheus.CounterVec - EvalTotal *prometheus.CounterVec - EvalFailures *prometheus.CounterVec - GroupInterval *prometheus.GaugeVec - GroupLastEvalTime *prometheus.GaugeVec - GroupLastDuration *prometheus.GaugeVec - GroupRules *prometheus.GaugeVec - GroupSamples *prometheus.GaugeVec + EvalDuration prometheus.Summary + IterationDuration prometheus.Summary + IterationsMissed *prometheus.CounterVec + IterationsScheduled *prometheus.CounterVec + EvalTotal *prometheus.CounterVec + EvalFailures *prometheus.CounterVec + GroupInterval *prometheus.GaugeVec + GroupLastEvalTime *prometheus.GaugeVec + GroupLastDuration *prometheus.GaugeVec + GroupLastRestoreDuration *prometheus.GaugeVec + GroupRules *prometheus.GaugeVec + GroupSamples *prometheus.GaugeVec } // NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer, @@ -865,6 +868,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { }, []string{"rule_group"}, ), + GroupLastRestoreDuration: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "rule_group_last_restore_duration_seconds", + Help: "The duration of the last rule group restoration.", + }, + []string{"rule_group"}, + ), GroupRules: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: namespace, @@ -894,6 +905,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { m.GroupInterval, m.GroupLastEvalTime, m.GroupLastDuration, + m.GroupLastRestoreDuration, m.GroupRules, m.GroupSamples, )