From e7219e3d366bc912381577e7eb5ec256fc13f275 Mon Sep 17 00:00:00 2001
From: gotjosh <josue.abreu@gmail.com>
Date: Tue, 23 Apr 2024 09:54:21 +0100
Subject: [PATCH 1/4] Rule Manager: Add
 `rule_group_last_restore_duration_seconds` to measure restore time per rule
 group

When a rule group changes or Prometheus is restarted, we need to restore the active alerts that were firing for each corresponding rule. To do that, Prometheus queries the `ALERTS_FOR_STATE` series to recover the previous state. If a given rule has high cardinality (think hundreds of thousands of series), this process can take a bit of time. This is the first of a series of PRs to improve this problem, and I'd like to start by exposing the time it takes to restore a rule group as a gauge.

Signed-off-by: gotjosh <josue.abreu@gmail.com>
---
 rules/group.go | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/rules/group.go b/rules/group.go
index c268d2df7d..aafab55445 100644
--- a/rules/group.go
+++ b/rules/group.go
@@ -230,7 +230,9 @@ func (g *Group) run(ctx context.Context) {
 			g.evalIterationFunc(ctx, g, evalTimestamp)
 		}
 
-		g.RestoreForState(time.Now())
+		now := time.Now()
+		g.RestoreForState(now)
+		g.metrics.GroupLastRestoreDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(time.Since(now).Seconds())
 		g.shouldRestore = false
 	}
 
@@ -779,17 +781,18 @@ const namespace = "prometheus"
 
 // Metrics for rule evaluation.
 type Metrics struct {
-	EvalDuration        prometheus.Summary
-	IterationDuration   prometheus.Summary
-	IterationsMissed    *prometheus.CounterVec
-	IterationsScheduled *prometheus.CounterVec
-	EvalTotal           *prometheus.CounterVec
-	EvalFailures        *prometheus.CounterVec
-	GroupInterval       *prometheus.GaugeVec
-	GroupLastEvalTime   *prometheus.GaugeVec
-	GroupLastDuration   *prometheus.GaugeVec
-	GroupRules          *prometheus.GaugeVec
-	GroupSamples        *prometheus.GaugeVec
+	EvalDuration             prometheus.Summary
+	IterationDuration        prometheus.Summary
+	IterationsMissed         *prometheus.CounterVec
+	IterationsScheduled      *prometheus.CounterVec
+	EvalTotal                *prometheus.CounterVec
+	EvalFailures             *prometheus.CounterVec
+	GroupInterval            *prometheus.GaugeVec
+	GroupLastEvalTime        *prometheus.GaugeVec
+	GroupLastDuration        *prometheus.GaugeVec
+	GroupLastRestoreDuration *prometheus.GaugeVec
+	GroupRules               *prometheus.GaugeVec
+	GroupSamples             *prometheus.GaugeVec
 }
 
 // NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer,
@@ -865,6 +868,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
 			},
 			[]string{"rule_group"},
 		),
+		GroupLastRestoreDuration: prometheus.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: namespace,
+				Name:      "rule_group_last_restore_duration_seconds",
+				Help:      "The duration of the last rule group restoration.",
+			},
+			[]string{"rule_group"},
+		),
 		GroupRules: prometheus.NewGaugeVec(
 			prometheus.GaugeOpts{
 				Namespace: namespace,
@@ -894,6 +905,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
 			m.GroupInterval,
 			m.GroupLastEvalTime,
 			m.GroupLastDuration,
+			m.GroupLastRestoreDuration,
 			m.GroupRules,
 			m.GroupSamples,
 		)

From 381a77ac1e1ef5e616ad45630dc6700a1916ba1d Mon Sep 17 00:00:00 2001
From: gotjosh <josue.abreu@gmail.com>
Date: Wed, 24 Apr 2024 14:21:11 +0100
Subject: [PATCH 2/4] Change variable name to `restoreStartTime` from `now` and
 introduce a log line to record total time

Signed-off-by: gotjosh <josue.abreu@gmail.com>
---
 rules/group.go | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/rules/group.go b/rules/group.go
index aafab55445..27be4b1f40 100644
--- a/rules/group.go
+++ b/rules/group.go
@@ -230,9 +230,11 @@ func (g *Group) run(ctx context.Context) {
 			g.evalIterationFunc(ctx, g, evalTimestamp)
 		}
 
-		now := time.Now()
-		g.RestoreForState(now)
-		g.metrics.GroupLastRestoreDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(time.Since(now).Seconds())
+		restoreStartTime := time.Now()
+		g.RestoreForState(restoreStartTime)
+		totalRestoreTimeSeconds := time.Since(restoreStartTime).Seconds()
+		g.metrics.GroupLastRestoreDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(totalRestoreTimeSeconds)
+		level.Debug(g.logger).Log("msg", "'for' state restoration completed", "duration_seconds", totalRestoreTimeSeconds)
 		g.shouldRestore = false
 	}
 

From d672eda97949aabc57d151705f4cdbc847256626 Mon Sep 17 00:00:00 2001
From: gotjosh <josue.abreu@gmail.com>
Date: Wed, 24 Apr 2024 14:31:18 +0100
Subject: [PATCH 3/4] Add a changelog entry

Signed-off-by: gotjosh <josue.abreu@gmail.com>
---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0afd8d7026..23d2c89da8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## unreleased
 
 * [CHANGE] TSDB: Fix the predicate checking for blocks which are beyond the retention period to include the ones right at the retention boundary. #9633
+* [ENHANCEMENT] Rules: Add `rule_group_last_restore_duration_seconds` to measure the time it takes to restore a rule group. #13974
 
 ## 2.51.2 / 2024-04-09
 

From 5beb2fe0051fb0ea04e32ab0e0b8bdac86d3ae75 Mon Sep 17 00:00:00 2001
From: gotjosh <josue.abreu@gmail.com>
Date: Wed, 24 Apr 2024 15:24:35 +0100
Subject: [PATCH 4/4] Improve the metric description

Signed-off-by: gotjosh <josue.abreu@gmail.com>
---
 rules/group.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rules/group.go b/rules/group.go
index 27be4b1f40..987136a003 100644
--- a/rules/group.go
+++ b/rules/group.go
@@ -874,7 +874,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
 			prometheus.GaugeOpts{
 				Namespace: namespace,
 				Name:      "rule_group_last_restore_duration_seconds",
-				Help:      "The duration of the last rule group restoration.",
+				Help:      "The duration of the last alert rules alerts restoration using the `ALERTS_FOR_STATE` series.",
 			},
 			[]string{"rule_group"},
 		),