From 4daaa59c081fc63bcdf3d73e77babeb6f5494a53 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 23 Apr 2024 19:40:10 +0100 Subject: [PATCH 01/13] Rule Manager: Only query once per alert rule when restoring alert state Prometheus restores alert state between restarts and updates. For each rule, it looks at the alerts that are meant to be active and then queries the `ALERTS_FOR_STATE` series for _each_ alert within the rule. If the alert rule has 120 instances (or series), it'll execute the same query with slightly different labels. This PR changes the approach so that we only query once per alert rule and then match the corresponding alert that we're about to restore against the series-set. While the approach might use a bit more memory at start-up (if at all), the restore process is only run once per restart, so I'd consider this a big win. This builds on top of #13974 Signed-off-by: gotjosh --- rules/alerting.go | 32 +++++++++++++------------------- rules/group.go | 33 +++++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/rules/alerting.go b/rules/alerting.go index 50c67fa2d..1bcf0a034 100644 --- a/rules/alerting.go +++ b/rules/alerting.go @@ -246,13 +246,16 @@ func (r *AlertingRule) sample(alert *Alert, ts time.Time) promql.Sample { return s } -// forStateSample returns the sample for ALERTS_FOR_STATE. +// forStateSample returns a promql.Sample with the rule labels, `ALERTS_FOR_STATE` as the metric name and the rule name as the `alertname` label. +// Optionally, if an alert is provided it'll copy the labels of the alert into the sample labels. func (r *AlertingRule) forStateSample(alert *Alert, ts time.Time, v float64) promql.Sample { lb := labels.NewBuilder(r.labels) - alert.Labels.Range(func(l labels.Label) { - lb.Set(l.Name, l.Value) - }) + if alert != nil { + alert.Labels.Range(func(l labels.Label) { + lb.Set(l.Name, l.Value) + }) + } lb.Set(labels.MetricName, alertForStateMetricName) lb.Set(labels.AlertName, r.name) @@ -265,9 +268,11 @@ func (r *AlertingRule) forStateSample(alert *Alert, ts time.Time, v float64) pro return s } -// QueryforStateSeries returns the series for ALERTS_FOR_STATE. -func (r *AlertingRule) QueryforStateSeries(ctx context.Context, alert *Alert, q storage.Querier) (storage.Series, error) { - smpl := r.forStateSample(alert, time.Now(), 0) +// QueryforStateSeries returns the series for ALERTS_FOR_STATE of the alert rule. +func (r *AlertingRule) QueryforStateSeries(ctx context.Context, q storage.Querier) (storage.SeriesSet, error) { + // We use a sample to ease the building of matchers. + // Don't provide an alert as we want matchers that match all series for the alert rule. + smpl := r.forStateSample(nil, time.Now(), 0) var matchers []*labels.Matcher smpl.Metric.Range(func(l labels.Label) { mt, err := labels.NewMatcher(labels.MatchEqual, l.Name, l.Value) @@ -278,18 +283,7 @@ func (r *AlertingRule) QueryforStateSeries(ctx context.Context, alert *Alert, q }) sset := q.Select(ctx, false, nil, matchers...) - var s storage.Series - for sset.Next() { - // Query assures that smpl.Metric is included in sset.At().Labels(), - // hence just checking the length would act like equality. - // (This is faster than calling labels.Compare again as we already have some info). - if sset.At().Labels().Len() == len(matchers) { - s = sset.At() - break - } - } - - return s, sset.Err() + return sset, sset.Err() } // SetEvaluationDuration updates evaluationDuration to the duration it took to evaluate the rule on its last evaluation.
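Note: the group.go side of this patch (next diff, refined further in later commits of this series) pairs each active alert with one series from that single query by indexing the returned series set by label set. A rough standalone sketch of that matching step, assuming the real Prometheus `storage` and `labels` packages; the package and helper names below are illustrative only and not part of the patch:

package restore // illustrative package name, not part of the patch

import (
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/storage"
)

// indexForStateSeries keys every ALERTS_FOR_STATE series returned by the single
// Select call by its label set with __name__ dropped, so an active alert's
// labels can be used directly as the lookup key.
func indexForStateSeries(sset storage.SeriesSet, sizeHint int) (map[string]storage.Series, error) {
	byLabels := make(map[string]storage.Series, sizeHint)
	for sset.Next() {
		s := sset.At()
		byLabels[s.Labels().DropMetricName().String()] = s
	}
	return byLabels, sset.Err()
}

// lookupAlertSeries returns the restored series for one active alert, if the
// query brought one back for its label set.
func lookupAlertSeries(byLabels map[string]storage.Series, alertLabels labels.Labels) (storage.Series, bool) {
	s, ok := byLabels[alertLabels.String()]
	return s, ok
}

This mirrors the map built in RestoreForState once the later commits in this series switch the lookup key from a label hash to the label string and pre-size the map with ActiveAlertsCount().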
diff --git a/rules/group.go b/rules/group.go index 987136a00..81f7b1df2 100644 --- a/rules/group.go +++ b/rules/group.go @@ -664,19 +664,32 @@ func (g *Group) RestoreForState(ts time.Time) { continue } + sset, err := alertRule.QueryforStateSeries(g.opts.Context, q) + if err != nil { + level.Error(g.logger).Log( + "msg", "Failed to restore 'for' state", + labels.AlertName, alertRule.Name(), + "stage", "Select", + "err", err, + ) + continue + } + + // No results for this alert rule. + if err == nil { + level.Debug(g.logger).Log("msg", "Failed to find a series to restore the 'for' state", labels.AlertName, alertRule.Name()) + continue + } + alertRule.ForEachActiveAlert(func(a *Alert) { var s storage.Series - s, err := alertRule.QueryforStateSeries(g.opts.Context, a, q) - if err != nil { - // Querier Warnings are ignored. We do not care unless we have an error. - level.Error(g.logger).Log( - "msg", "Failed to restore 'for' state", - labels.AlertName, alertRule.Name(), - "stage", "Select", - "err", err, - ) - return + // Find the series for the given alert from the set. + for sset.Next() { + if sset.At().Labels().Hash() == a.Labels.Hash() { + s = sset.At() + break + } } if s == nil { From e6dcbd2e26b73c6e4e4e0b8bc89c3d33f7de05c3 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 23 Apr 2024 19:49:07 +0100 Subject: [PATCH 02/13] bug: nil check against the series set not errors Signed-off-by: gotjosh --- rules/alerting.go | 2 +- rules/group.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rules/alerting.go b/rules/alerting.go index 1bcf0a034..2cadd3ac5 100644 --- a/rules/alerting.go +++ b/rules/alerting.go @@ -281,8 +281,8 @@ func (r *AlertingRule) QueryforStateSeries(ctx context.Context, q storage.Querie } matchers = append(matchers, mt) }) - sset := q.Select(ctx, false, nil, matchers...) + sset := q.Select(ctx, false, nil, matchers...) return sset, sset.Err() } diff --git a/rules/group.go b/rules/group.go index 81f7b1df2..7afeaf96e 100644 --- a/rules/group.go +++ b/rules/group.go @@ -676,7 +676,7 @@ func (g *Group) RestoreForState(ts time.Time) { } // No results for this alert rule. - if err == nil { + if sset == nil { level.Debug(g.logger).Log("msg", "Failed to find a series to restore the 'for' state", labels.AlertName, alertRule.Name()) continue } From 276201598c18b0ddb62defb9badebadc000c4e4a Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 24 Apr 2024 12:27:35 +0100 Subject: [PATCH 03/13] Fix tests and a bug with the series lookup logic. 
Signed-off-by: gotjosh --- rules/alerting_test.go | 26 ++++--- rules/group.go | 17 ++--- rules/manager_test.go | 162 +++++++++++++++++++++-------------------- 3 files changed, 105 insertions(+), 100 deletions(-) diff --git a/rules/alerting_test.go b/rules/alerting_test.go index ddfe345ef..366297b4a 100644 --- a/rules/alerting_test.go +++ b/rules/alerting_test.go @@ -710,19 +710,21 @@ func TestQueryForStateSeries(t *testing.T) { labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil, ) - alert := &Alert{ - State: 0, - Labels: labels.EmptyLabels(), - Annotations: labels.EmptyLabels(), - Value: 0, - ActiveAt: time.Time{}, - FiredAt: time.Time{}, - ResolvedAt: time.Time{}, - LastSentAt: time.Time{}, - ValidUntil: time.Time{}, - } + sample := rule.forStateSample(nil, time.Time{}, 0) + var matchersCount int + sample.Metric.Range(func(l labels.Label) { + matchersCount++ + }) - series, err := rule.QueryforStateSeries(context.Background(), alert, querier) + seriesSet, err := rule.QueryforStateSeries(context.Background(), querier) + + var series storage.Series + for seriesSet.Next() { + if seriesSet.At().Labels().Len() == matchersCount { + series = seriesSet.At() + break + } + } require.Equal(t, tst.expectedSeries, series) require.Equal(t, tst.expectedError, err) diff --git a/rules/group.go b/rules/group.go index 7afeaf96e..411ca68e2 100644 --- a/rules/group.go +++ b/rules/group.go @@ -681,21 +681,18 @@ func (g *Group) RestoreForState(ts time.Time) { continue } + result := map[uint64]storage.Series{} + for sset.Next() { + result[sset.At().Labels().DropMetricName().Hash()] = sset.At() + } + alertRule.ForEachActiveAlert(func(a *Alert) { var s storage.Series - // Find the series for the given alert from the set. - for sset.Next() { - if sset.At().Labels().Hash() == a.Labels.Hash() { - s = sset.At() - break - } - } - - if s == nil { + s, ok := result[a.Labels.Hash()] + if !ok { return } - // Series found for the 'for' state. var t int64 var v float64 diff --git a/rules/manager_test.go b/rules/manager_test.go index 50ab6b861..a3bd335d1 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -407,6 +407,7 @@ func TestForStateRestore(t *testing.T) { // Prometheus goes down here. We create new rules and groups. type testInput struct { + name string restoreDuration time.Duration alerts []*Alert @@ -414,105 +415,110 @@ func TestForStateRestore(t *testing.T) { noRestore bool gracePeriod bool downDuration time.Duration + before func() } tests := []testInput{ { - // Normal restore (alerts were not firing). + name: "normal restore (alerts were not firing)", restoreDuration: 15 * time.Minute, alerts: rule.ActiveAlerts(), downDuration: 10 * time.Minute, }, { - // Testing Outage Tolerance. + name: "outage tolerance", restoreDuration: 40 * time.Minute, noRestore: true, num: 2, }, { - // No active alerts. 
+ name: "no active alerts", restoreDuration: 50 * time.Minute, alerts: []*Alert{}, }, + { + name: "test the grace period", + restoreDuration: 25 * time.Minute, + alerts: []*Alert{}, + gracePeriod: true, + before: func() { + for _, duration := range []time.Duration{10 * time.Minute, 15 * time.Minute, 20 * time.Minute} { + evalTime := baseTime.Add(duration) + group.Eval(context.TODO(), evalTime) + } + }, + num: 2, + }, } - testFunc := func(tst testInput) { - newRule := NewAlertingRule( - "HTTPRequestRateLow", - expr, - alertForDuration, - 0, - labels.FromStrings("severity", "critical"), - labels.EmptyLabels(), labels.EmptyLabels(), "", false, nil, - ) - newGroup := NewGroup(GroupOptions{ - Name: "default", - Interval: time.Second, - Rules: []Rule{newRule}, - ShouldRestore: true, - Opts: opts, + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.before != nil { + tt.before() + } + + newRule := NewAlertingRule( + "HTTPRequestRateLow", + expr, + alertForDuration, + 0, + labels.FromStrings("severity", "critical"), + labels.EmptyLabels(), labels.EmptyLabels(), "", false, nil, + ) + newGroup := NewGroup(GroupOptions{ + Name: "default", + Interval: time.Second, + Rules: []Rule{newRule}, + ShouldRestore: true, + Opts: opts, + }) + + newGroups := make(map[string]*Group) + newGroups["default;"] = newGroup + + restoreTime := baseTime.Add(tt.restoreDuration) + // First eval before restoration. + newGroup.Eval(context.TODO(), restoreTime) + // Restore happens here. + newGroup.RestoreForState(restoreTime) + + got := newRule.ActiveAlerts() + for _, aa := range got { + require.Zero(t, aa.Labels.Get(model.MetricNameLabel), "%s label set on active alert: %s", model.MetricNameLabel, aa.Labels) + } + sort.Slice(got, func(i, j int) bool { + return labels.Compare(got[i].Labels, got[j].Labels) < 0 + }) + + // Checking if we have restored it correctly. + switch { + case tt.noRestore: + require.Len(t, got, tt.num) + for _, e := range got { + require.Equal(t, e.ActiveAt, restoreTime) + } + case tt.gracePeriod: + + require.Len(t, got, tt.num) + for _, e := range got { + require.Equal(t, opts.ForGracePeriod, e.ActiveAt.Add(alertForDuration).Sub(restoreTime)) + } + default: + exp := tt.alerts + require.Equal(t, len(exp), len(got)) + sortAlerts(exp) + sortAlerts(got) + for i, e := range exp { + require.Equal(t, e.Labels, got[i].Labels) + + // Difference in time should be within 1e6 ns, i.e. 1ms + // (due to conversion between ns & ms, float64 & int64). + activeAtDiff := float64(e.ActiveAt.Unix() + int64(tt.downDuration/time.Second) - got[i].ActiveAt.Unix()) + require.Equal(t, 0.0, math.Abs(activeAtDiff), "'for' state restored time is wrong") + } + } }) - - newGroups := make(map[string]*Group) - newGroups["default;"] = newGroup - - restoreTime := baseTime.Add(tst.restoreDuration) - // First eval before restoration. - newGroup.Eval(context.TODO(), restoreTime) - // Restore happens here. - newGroup.RestoreForState(restoreTime) - - got := newRule.ActiveAlerts() - for _, aa := range got { - require.Zero(t, aa.Labels.Get(model.MetricNameLabel), "%s label set on active alert: %s", model.MetricNameLabel, aa.Labels) - } - sort.Slice(got, func(i, j int) bool { - return labels.Compare(got[i].Labels, got[j].Labels) < 0 - }) - - // Checking if we have restored it correctly. 
- switch { - case tst.noRestore: - require.Len(t, got, tst.num) - for _, e := range got { - require.Equal(t, e.ActiveAt, restoreTime) - } - case tst.gracePeriod: - require.Len(t, got, tst.num) - for _, e := range got { - require.Equal(t, opts.ForGracePeriod, e.ActiveAt.Add(alertForDuration).Sub(restoreTime)) - } - default: - exp := tst.alerts - require.Equal(t, len(exp), len(got)) - sortAlerts(exp) - sortAlerts(got) - for i, e := range exp { - require.Equal(t, e.Labels, got[i].Labels) - - // Difference in time should be within 1e6 ns, i.e. 1ms - // (due to conversion between ns & ms, float64 & int64). - activeAtDiff := float64(e.ActiveAt.Unix() + int64(tst.downDuration/time.Second) - got[i].ActiveAt.Unix()) - require.Equal(t, 0.0, math.Abs(activeAtDiff), "'for' state restored time is wrong") - } - } } - - for _, tst := range tests { - testFunc(tst) - } - - // Testing the grace period. - for _, duration := range []time.Duration{10 * time.Minute, 15 * time.Minute, 20 * time.Minute} { - evalTime := baseTime.Add(duration) - group.Eval(context.TODO(), evalTime) - } - testFunc(testInput{ - restoreDuration: 25 * time.Minute, - alerts: []*Alert{}, - gracePeriod: true, - num: 2, - }) } func TestStaleness(t *testing.T) { From fa75985c1c4695b077cefdbb9e6fd9ec73ee8c0e Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 24 Apr 2024 14:04:16 +0100 Subject: [PATCH 04/13] Use the string representation of the labels instead of the hash Signed-off-by: gotjosh --- rules/group.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rules/group.go b/rules/group.go index 411ca68e2..4acce6e60 100644 --- a/rules/group.go +++ b/rules/group.go @@ -681,15 +681,15 @@ func (g *Group) RestoreForState(ts time.Time) { continue } - result := map[uint64]storage.Series{} + result := map[string]storage.Series{} for sset.Next() { - result[sset.At().Labels().DropMetricName().Hash()] = sset.At() + result[sset.At().Labels().DropMetricName().String()] = sset.At() } alertRule.ForEachActiveAlert(func(a *Alert) { var s storage.Series - s, ok := result[a.Labels.Hash()] + s, ok := result[a.Labels.String()] if !ok { return } From 6cfc58430829895dbf10e749da9bddd8ae8c25f4 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 24 Apr 2024 19:02:47 +0100 Subject: [PATCH 05/13] - Add a changelog entry - Improve variable name of the map produced by the series set Signed-off-by: gotjosh --- CHANGELOG.md | 1 + rules/group.go | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 23d2c89da..58293351c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## unreleased * [CHANGE] TSDB: Fix the predicate checking for blocks which are beyond the retention period to include the ones right at the retention boundary. #9633 +* [CHANGE] Rules: Execute 1 query instead of N (where N is the number of alerts within alert rule) when restoring alerts. #13980 * [ENHANCEMENT] Rules: Add `rule_group_last_restore_duration_seconds` to measure the time it takes to restore a rule group. 
#13974 ## 2.51.2 / 2024-04-09 diff --git a/rules/group.go b/rules/group.go index 4acce6e60..c4331a465 100644 --- a/rules/group.go +++ b/rules/group.go @@ -681,15 +681,15 @@ func (g *Group) RestoreForState(ts time.Time) { continue } - result := map[string]storage.Series{} + seriesByLabels := map[string]storage.Series{} for sset.Next() { - result[sset.At().Labels().DropMetricName().String()] = sset.At() + seriesByLabels[sset.At().Labels().DropMetricName().String()] = sset.At() } alertRule.ForEachActiveAlert(func(a *Alert) { var s storage.Series - s, ok := result[a.Labels.String()] + s, ok := seriesByLabels[a.Labels.String()] if !ok { return } From 2de2fee0354eaab35e8dbca8388dd6d3d2158220 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 24 Apr 2024 19:10:34 +0100 Subject: [PATCH 06/13] Allocate the result map for the series set beforehand with a size hint. Signed-off-by: gotjosh --- rules/alerting.go | 7 +++++++ rules/alerting_test.go | 19 +++++++++++++++++++ rules/group.go | 3 ++- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/rules/alerting.go b/rules/alerting.go index 2cadd3ac5..e357cc615 100644 --- a/rules/alerting.go +++ b/rules/alerting.go @@ -542,6 +542,13 @@ func (r *AlertingRule) ForEachActiveAlert(f func(*Alert)) { } } +func (r *AlertingRule) ActiveAlertsCount() int { + r.activeMtx.Lock() + defer r.activeMtx.Unlock() + + return len(r.active) +} + func (r *AlertingRule) sendAlerts(ctx context.Context, ts time.Time, resendDelay, interval time.Duration, notifyFunc NotifyFunc) { alerts := []*Alert{} r.ForEachActiveAlert(func(alert *Alert) { diff --git a/rules/alerting_test.go b/rules/alerting_test.go index 366297b4a..78f6d6715 100644 --- a/rules/alerting_test.go +++ b/rules/alerting_test.go @@ -1027,3 +1027,22 @@ func TestAlertingRule_SetNoDependencyRules(t *testing.T) { rule.SetNoDependencyRules(true) require.True(t, rule.NoDependencyRules()) } + +func TestAlertingRule_ActiveAlertsCount(t *testing.T) { + rule := NewAlertingRule( + "TestRule", + nil, + time.Minute, + 0, + labels.FromStrings("severity", "critical"), + labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil, + ) + + // Set an active alert. + lbls := labels.FromStrings("a1", "1") + h := lbls.Hash() + al := &Alert{State: StateFiring, Labels: lbls, ActiveAt: time.Now()} + rule.active[h] = al + + require.Equal(t, 1, t rule.ActiveAlertsCount()) +} diff --git a/rules/group.go b/rules/group.go index c4331a465..28a0ff6e1 100644 --- a/rules/group.go +++ b/rules/group.go @@ -681,7 +681,8 @@ func (g *Group) RestoreForState(ts time.Time) { continue } - seriesByLabels := map[string]storage.Series{} + // While not technically the same number of series we expect, it's as good of an approximation as any. 
+ seriesByLabels := make(map[string]storage.Series, alertRule.ActiveAlertsCount()) for sset.Next() { seriesByLabels[sset.At().Labels().DropMetricName().String()] = sset.At() } From cc2207148edf9c348837b2631e4adf6d85a8819b Mon Sep 17 00:00:00 2001 From: gotjosh Date: Wed, 24 Apr 2024 19:20:57 +0100 Subject: [PATCH 07/13] fix typo Signed-off-by: gotjosh --- rules/alerting_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules/alerting_test.go b/rules/alerting_test.go index 78f6d6715..a3f97b2a4 100644 --- a/rules/alerting_test.go +++ b/rules/alerting_test.go @@ -1044,5 +1044,5 @@ func TestAlertingRule_ActiveAlertsCount(t *testing.T) { al := &Alert{State: StateFiring, Labels: lbls, ActiveAt: time.Now()} rule.active[h] = al - require.Equal(t, 1, t rule.ActiveAlertsCount()) + require.Equal(t, 1, rule.ActiveAlertsCount()) } From 151f6e0ed6e9e60c0c667da294c099bc3ece8047 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 30 Apr 2024 12:17:56 +0100 Subject: [PATCH 08/13] Add an assertion on the count of alerts before adding an active alert Signed-off-by: gotjosh --- rules/alerting_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rules/alerting_test.go b/rules/alerting_test.go index a3f97b2a4..28b4f2cdc 100644 --- a/rules/alerting_test.go +++ b/rules/alerting_test.go @@ -1038,6 +1038,8 @@ func TestAlertingRule_ActiveAlertsCount(t *testing.T) { labels.EmptyLabels(), labels.EmptyLabels(), "", true, nil, ) + require.Equal(t, 0, rule.ActiveAlertsCount()) + // Set an active alert. lbls := labels.FromStrings("a1", "1") h := lbls.Hash() From ccfafae36d4a0d49a4b099fafbe56eb66ffcb6bb Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 30 Apr 2024 12:19:18 +0100 Subject: [PATCH 09/13] Rename QueryforStateSeries to QueryForStateSeries Signed-off-by: gotjosh --- rules/alerting.go | 4 ++-- rules/alerting_test.go | 2 +- rules/group.go | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/rules/alerting.go b/rules/alerting.go index e357cc615..fb0fb35b1 100644 --- a/rules/alerting.go +++ b/rules/alerting.go @@ -268,8 +268,8 @@ func (r *AlertingRule) forStateSample(alert *Alert, ts time.Time, v float64) pro return s } -// QueryforStateSeries returns the series for ALERTS_FOR_STATE of the alert rule. -func (r *AlertingRule) QueryforStateSeries(ctx context.Context, q storage.Querier) (storage.SeriesSet, error) { +// QueryForStateSeries returns the series for ALERTS_FOR_STATE of the alert rule. +func (r *AlertingRule) QueryForStateSeries(ctx context.Context, q storage.Querier) (storage.SeriesSet, error) { // We use a sample to ease the building of matchers. // Don't provide an alert as we want matchers that match all series for the alert rule. 
smpl := r.forStateSample(nil, time.Now(), 0) diff --git a/rules/alerting_test.go b/rules/alerting_test.go index 28b4f2cdc..165f38b86 100644 --- a/rules/alerting_test.go +++ b/rules/alerting_test.go @@ -716,7 +716,7 @@ func TestQueryForStateSeries(t *testing.T) { matchersCount++ }) - seriesSet, err := rule.QueryforStateSeries(context.Background(), querier) + seriesSet, err := rule.QueryForStateSeries(context.Background(), querier) var series storage.Series for seriesSet.Next() { diff --git a/rules/group.go b/rules/group.go index 28a0ff6e1..e8a7d82d2 100644 --- a/rules/group.go +++ b/rules/group.go @@ -664,7 +664,7 @@ func (g *Group) RestoreForState(ts time.Time) { continue } - sset, err := alertRule.QueryforStateSeries(g.opts.Context, q) + sset, err := alertRule.QueryForStateSeries(g.opts.Context, q) if err != nil { level.Error(g.logger).Log( "msg", "Failed to restore 'for' state", From 63b09944b8502a64cb17330161c8520d24874aa4 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 30 Apr 2024 12:25:48 +0100 Subject: [PATCH 10/13] Use labels.Len() instead of manually counting the labels Signed-off-by: gotjosh --- rules/alerting_test.go | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/rules/alerting_test.go b/rules/alerting_test.go index 165f38b86..5fae3edd1 100644 --- a/rules/alerting_test.go +++ b/rules/alerting_test.go @@ -711,16 +711,12 @@ func TestQueryForStateSeries(t *testing.T) { ) sample := rule.forStateSample(nil, time.Time{}, 0) - var matchersCount int - sample.Metric.Range(func(l labels.Label) { - matchersCount++ - }) seriesSet, err := rule.QueryForStateSeries(context.Background(), querier) var series storage.Series for seriesSet.Next() { - if seriesSet.At().Labels().Len() == matchersCount { + if seriesSet.At().Labels().Len() == sample.Metric.Len() { series = seriesSet.At() break } From f63dbc3db2abee643c7e182690be5712c42f13ca Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 30 Apr 2024 12:39:07 +0100 Subject: [PATCH 11/13] Remove duplicated sorting and assignment of expected alerts. Signed-off-by: gotjosh --- rules/manager_test.go | 8 -------- 1 file changed, 8 deletions(-) diff --git a/rules/manager_test.go b/rules/manager_test.go index a3bd335d1..d74a7dfe1 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -397,14 +397,6 @@ func TestForStateRestore(t *testing.T) { group.Eval(context.TODO(), evalTime) } - exp := rule.ActiveAlerts() - for _, aa := range exp { - require.Zero(t, aa.Labels.Get(model.MetricNameLabel), "%s label set on active alert: %s", model.MetricNameLabel, aa.Labels) - } - sort.Slice(exp, func(i, j int) bool { - return labels.Compare(exp[i].Labels, exp[j].Labels) < 0 - }) - // Prometheus goes down here. We create new rules and groups. 
type testInput struct { name string From 05ca082b079bb12dc7ad0f8c3940774df803401b Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 30 Apr 2024 12:43:09 +0100 Subject: [PATCH 12/13] Rename `alerts` to `expectedAlerts` in the test case input Signed-off-by: gotjosh --- rules/manager_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rules/manager_test.go b/rules/manager_test.go index d74a7dfe1..07159145f 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -401,7 +401,7 @@ func TestForStateRestore(t *testing.T) { type testInput struct { name string restoreDuration time.Duration - alerts []*Alert + expectedAlerts []*Alert num int noRestore bool @@ -414,7 +414,7 @@ func TestForStateRestore(t *testing.T) { { name: "normal restore (alerts were not firing)", restoreDuration: 15 * time.Minute, - alerts: rule.ActiveAlerts(), + expectedAlerts: rule.ActiveAlerts(), downDuration: 10 * time.Minute, }, { @@ -426,12 +426,12 @@ func TestForStateRestore(t *testing.T) { { name: "no active alerts", restoreDuration: 50 * time.Minute, - alerts: []*Alert{}, + expectedAlerts: []*Alert{}, }, { name: "test the grace period", restoreDuration: 25 * time.Minute, - alerts: []*Alert{}, + expectedAlerts: []*Alert{}, gracePeriod: true, before: func() { for _, duration := range []time.Duration{10 * time.Minute, 15 * time.Minute, 20 * time.Minute} { @@ -496,7 +496,7 @@ func TestForStateRestore(t *testing.T) { require.Equal(t, opts.ForGracePeriod, e.ActiveAt.Add(alertForDuration).Sub(restoreTime)) } default: - exp := tt.alerts + exp := tt.expectedAlerts require.Equal(t, len(exp), len(got)) sortAlerts(exp) sortAlerts(got) From 379dec9d368edbf083b5a7da9e8aa02e94bb079b Mon Sep 17 00:00:00 2001 From: gotjosh Date: Tue, 30 Apr 2024 13:09:30 +0100 Subject: [PATCH 13/13] querier.Select cannot return a nil series set. Signed-off-by: gotjosh --- rules/group.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/rules/group.go b/rules/group.go index e8a7d82d2..3a97c9564 100644 --- a/rules/group.go +++ b/rules/group.go @@ -675,18 +675,18 @@ func (g *Group) RestoreForState(ts time.Time) { continue } - // No results for this alert rule. - if sset == nil { - level.Debug(g.logger).Log("msg", "Failed to find a series to restore the 'for' state", labels.AlertName, alertRule.Name()) - continue - } - // While not technically the same number of series we expect, it's as good of an approximation as any. seriesByLabels := make(map[string]storage.Series, alertRule.ActiveAlertsCount()) for sset.Next() { seriesByLabels[sset.At().Labels().DropMetricName().String()] = sset.At() } + // No results for this alert rule. + if len(seriesByLabels) == 0 { + level.Debug(g.logger).Log("msg", "Failed to find a series to restore the 'for' state", labels.AlertName, alertRule.Name()) + continue + } + alertRule.ForEachActiveAlert(func(a *Alert) { var s storage.Series