From a768a3b95e65efd0338cb2db598191ecd117f362 Mon Sep 17 00:00:00 2001 From: Julien Duchesne Date: Wed, 8 Jan 2025 11:32:48 -0500 Subject: [PATCH] Rule Concurrency: Test safe abort of rule evaluations (#15797) This test was added in the Grafana fork a while ago: https://github.com/grafana/mimir-prometheus/pull/714 and has been helpful to make sure we can safely terminate rule evaluations early The new rule evaluation logic (done here: https://github.com/prometheus/prometheus/pull/15681) does not have the bug, but the test was useful to verify that Signed-off-by: Julien Duchesne --- rules/manager_test.go | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/rules/manager_test.go b/rules/manager_test.go index defa93a68c..5c3fcc96a8 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -2326,6 +2326,41 @@ func TestUpdateWhenStopped(t *testing.T) { require.NoError(t, err) } +func TestGroup_Eval_RaceConditionOnStoppingGroupEvaluationWhileRulesAreEvaluatedConcurrently(t *testing.T) { + storage := teststorage.New(t) + t.Cleanup(func() { storage.Close() }) + + var ( + inflightQueries atomic.Int32 + maxInflight atomic.Int32 + maxConcurrency int64 = 10 + ) + + files := []string{"fixtures/rules_multiple_groups.yaml"} + files2 := []string{"fixtures/rules.yaml"} + + ruleManager := NewManager(optsFactory(storage, &maxInflight, &inflightQueries, maxConcurrency)) + go func() { + ruleManager.Run() + }() + <-ruleManager.block + + // Update the group a decent number of times to simulate start and stopping in the middle of an evaluation. + for i := 0; i < 10; i++ { + err := ruleManager.Update(time.Second, files, labels.EmptyLabels(), "", nil) + require.NoError(t, err) + + // Wait half of the query execution duration and then change the rule groups loaded by the manager + // so that the previous rule group will be interrupted while the query is executing. + time.Sleep(artificialDelay / 2) + + err = ruleManager.Update(time.Second, files2, labels.EmptyLabels(), "", nil) + require.NoError(t, err) + } + + ruleManager.Stop() +} + const artificialDelay = 250 * time.Millisecond func optsFactory(storage storage.Storage, maxInflight, inflightQueries *atomic.Int32, maxConcurrent int64) *ManagerOptions {