From a7b449320d4ad88dd6eca38c84ce1c48d11257bd Mon Sep 17 00:00:00 2001 From: Chris Marchbanks Date: Sat, 18 Apr 2020 06:32:18 -0600 Subject: [PATCH] Fix updating rule manager never finishing (#7138) Rather than sending a value to the done channel on a group to indicate whether or not to add stale markers to a closing rule group use an explicit boolean. This allows more functions than just run() to read from the done channel and fixes an issue where Eval() could consume the channel during an update, causing run() to never return. Signed-off-by: Chris Marchbanks --- rules/manager.go | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/rules/manager.go b/rules/manager.go index 383d38327..8065b2e39 100644 --- a/rules/manager.go +++ b/rules/manager.go @@ -232,7 +232,8 @@ type Group struct { shouldRestore bool - done chan bool + markStale bool + done chan struct{} terminated chan struct{} managerDone chan struct{} @@ -270,7 +271,7 @@ func NewGroup(o GroupOptions) *Group { shouldRestore: o.ShouldRestore, opts: o.Opts, seriesInPreviousEval: make([]map[string]labels.Labels, len(o.Rules)), - done: make(chan bool), + done: make(chan struct{}), managerDone: o.done, terminated: make(chan struct{}), logger: log.With(o.Opts.Logger, "group", o.Name), @@ -326,8 +327,8 @@ func (g *Group) run(ctx context.Context) { tick := time.NewTicker(g.interval) defer tick.Stop() - makeStale := func(s bool) { - if !s { + defer func() { + if !g.markStale { return } go func(now time.Time) { @@ -347,7 +348,7 @@ func (g *Group) run(ctx context.Context) { g.cleanupStaleSeries(now) } }(time.Now()) - } + }() iter() if g.shouldRestore { @@ -356,8 +357,7 @@ func (g *Group) run(ctx context.Context) { // we might not have enough data scraped, and recording rules would not // have updated the latest values, on which some alerts might depend. select { - case stale := <-g.done: - makeStale(stale) + case <-g.done: return case <-tick.C: missed := (time.Since(evalTimestamp) / g.interval) - 1 @@ -375,13 +375,11 @@ func (g *Group) run(ctx context.Context) { for { select { - case stale := <-g.done: - makeStale(stale) + case <-g.done: return default: select { - case stale := <-g.done: - makeStale(stale) + case <-g.done: return case <-tick.C: missed := (time.Since(evalTimestamp) / g.interval) - 1 @@ -396,11 +394,6 @@ func (g *Group) run(ctx context.Context) { } } -func (g *Group) stopAndMakeStale() { - g.done <- true - <-g.terminated -} - func (g *Group) stop() { close(g.done) <-g.terminated @@ -943,7 +936,8 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels wg.Add(len(m.groups)) for n, oldg := range m.groups { go func(n string, g *Group) { - g.stopAndMakeStale() + g.markStale = true + g.stop() if m := g.metrics; m != nil { m.groupInterval.DeleteLabelValues(n) m.groupLastEvalTime.DeleteLabelValues(n)