From 2a8ae586f4f17e8616b02b40909abc3dde3733b3 Mon Sep 17 00:00:00 2001 From: Dimitar Dimitrov Date: Mon, 20 Jan 2025 21:26:58 +0100 Subject: [PATCH] ruler: stop all rule groups asynchronously on shutdown (#15804) * ruler: stop all rule groups asynchronously on shutdown During shutdown of the rules manager some rule groups have already stopped and are missing evaluations while we're waiting for other groups to finish their evaluation. When there are many groups (in the thousands), the whole shutdown process can take up to 10 minutes, during which we miss evaluations. Signed-off-by: Dimitar Dimitrov * Use wrappers in stop(); rename awaitStopped() Signed-off-by: Dimitar Dimitrov * Add comment Signed-off-by: Dimitar Dimitrov --------- Signed-off-by: Dimitar Dimitrov --- rules/group.go | 10 +++++++++- rules/manager.go | 8 +++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/rules/group.go b/rules/group.go index 4398d9211d..9ad9aab093 100644 --- a/rules/group.go +++ b/rules/group.go @@ -302,11 +302,19 @@ func (g *Group) run(ctx context.Context) { } } -func (g *Group) stop() { +func (g *Group) stopAsync() { close(g.done) +} + +func (g *Group) waitStopped() { <-g.terminated } +func (g *Group) stop() { + g.stopAsync() + g.waitStopped() +} + func (g *Group) hash() uint64 { l := labels.New( labels.Label{Name: "name", Value: g.name}, diff --git a/rules/manager.go b/rules/manager.go index 50b2a7e99d..b1d3e8e3d6 100644 --- a/rules/manager.go +++ b/rules/manager.go @@ -188,8 +188,14 @@ func (m *Manager) Stop() { m.logger.Info("Stopping rule manager...") + // Stop all groups asynchronously, then wait for them to finish. + // This is faster than stopping and waiting for each group in sequence. for _, eg := range m.groups { - eg.stop() + eg.stopAsync() + } + + for _, eg := range m.groups { + eg.waitStopped() } // Shut down the groups waiting multiple evaluation intervals to write