From e7219e3d366bc912381577e7eb5ec256fc13f275 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 23 Apr 2024 09:54:21 +0100 Subject: [PATCH 1/4] Rule Manager: Add `rule_group_last_restore_duration_seconds` to measure restore time per rule group When a rule group changes or Prometheus is restarted we need to ensure we restore the active alerts that were firing for a corresponding rule; for that, Prometheus uses the `ALERTS_FOR_STATE` series to query the previous state and restore it. If a given rule has high cardinality (think 100s of 1000s of series) this process can take a bit of time - this is the first of a series of PRs to improve this problem and I'd like to start with exposing the time it takes to restore a rule group as a gauge. Signed-off-by: gotjosh --- rules/group.go | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/rules/group.go b/rules/group.go index c268d2df7d..aafab55445 100644 --- a/rules/group.go +++ b/rules/group.go @@ -230,7 +230,9 @@ func (g *Group) run(ctx context.Context) { g.evalIterationFunc(ctx, g, evalTimestamp) } - g.RestoreForState(time.Now()) + now := time.Now() + g.RestoreForState(now) + g.metrics.GroupLastRestoreDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(time.Since(now).Seconds()) g.shouldRestore = false } @@ -779,17 +781,18 @@ const namespace = "prometheus" // Metrics for rule evaluation. 
type Metrics struct { - EvalDuration prometheus.Summary - IterationDuration prometheus.Summary - IterationsMissed *prometheus.CounterVec - IterationsScheduled *prometheus.CounterVec - EvalTotal *prometheus.CounterVec - EvalFailures *prometheus.CounterVec - GroupInterval *prometheus.GaugeVec - GroupLastEvalTime *prometheus.GaugeVec - GroupLastDuration *prometheus.GaugeVec - GroupRules *prometheus.GaugeVec - GroupSamples *prometheus.GaugeVec + EvalDuration prometheus.Summary + IterationDuration prometheus.Summary + IterationsMissed *prometheus.CounterVec + IterationsScheduled *prometheus.CounterVec + EvalTotal *prometheus.CounterVec + EvalFailures *prometheus.CounterVec + GroupInterval *prometheus.GaugeVec + GroupLastEvalTime *prometheus.GaugeVec + GroupLastDuration *prometheus.GaugeVec + GroupLastRestoreDuration *prometheus.GaugeVec + GroupRules *prometheus.GaugeVec + GroupSamples *prometheus.GaugeVec } // NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer, @@ -865,6 +868,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { }, []string{"rule_group"}, ), + GroupLastRestoreDuration: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "rule_group_last_restore_duration_seconds", + Help: "The duration of the last rule group restoration.", + }, + []string{"rule_group"}, + ), GroupRules: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: namespace, @@ -894,6 +905,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { m.GroupInterval, m.GroupLastEvalTime, m.GroupLastDuration, + m.GroupLastRestoreDuration, m.GroupRules, m.GroupSamples, ) From 381a77ac1e1ef5e616ad45630dc6700a1916ba1d Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 24 Apr 2024 14:21:11 +0100 Subject: [PATCH 2/4] Change variable name to `restoreStartTime` from `now` and introduce a log line to record total time Signed-off-by: gotjosh --- rules/group.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 
deletions(-) diff --git a/rules/group.go b/rules/group.go index aafab55445..27be4b1f40 100644 --- a/rules/group.go +++ b/rules/group.go @@ -230,9 +230,11 @@ func (g *Group) run(ctx context.Context) { g.evalIterationFunc(ctx, g, evalTimestamp) } - now := time.Now() - g.RestoreForState(now) - g.metrics.GroupLastRestoreDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(time.Since(now).Seconds()) + restoreStartTime := time.Now() + g.RestoreForState(restoreStartTime) + totalRestoreTimeSeconds := time.Since(restoreStartTime).Seconds() + g.metrics.GroupLastRestoreDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(totalRestoreTimeSeconds) + level.Debug(g.logger).Log("msg", "'for' state restoration completed", "duration_seconds", totalRestoreTimeSeconds) g.shouldRestore = false } From d672eda97949aabc57d151705f4cdbc847256626 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 24 Apr 2024 14:31:18 +0100 Subject: [PATCH 3/4] Add a changelog entry Signed-off-by: gotjosh --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0afd8d7026..23d2c89da8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## unreleased * [CHANGE] TSDB: Fix the predicate checking for blocks which are beyond the retention period to include the ones right at the retention boundary. #9633 +* [ENHANCEMENT] Rules: Add `rule_group_last_restore_duration_seconds` to measure the time it takes to restore a rule group. 
#13974 ## 2.51.2 / 2024-04-09 From 5beb2fe0051fb0ea04e32ab0e0b8bdac86d3ae75 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 24 Apr 2024 15:24:35 +0100 Subject: [PATCH 4/4] Improve the metric description Signed-off-by: gotjosh --- rules/group.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules/group.go b/rules/group.go index 27be4b1f40..987136a003 100644 --- a/rules/group.go +++ b/rules/group.go @@ -874,7 +874,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { prometheus.GaugeOpts{ Namespace: namespace, Name: "rule_group_last_restore_duration_seconds", - Help: "The duration of the last rule group restoration.", + Help: "The duration of the last alert rules alerts restoration using the `ALERTS_FOR_STATE` series.", }, []string{"rule_group"}, ),