Fix updating rule manager never finishing (#7138)

Rather than sending a value to the done channel on a group to indicate
whether or not to add stale markers to a closing rule group use an
explicit boolean. This allows more functions than just run() to read
from the done channel and fixes an issue where Eval() could consume the
channel during an update, causing run() to never return.

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>
This commit is contained in:
Chris Marchbanks 2020-04-18 06:32:18 -06:00 committed by GitHub
parent ca23cd064e
commit a7b449320d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -232,7 +232,8 @@ type Group struct {
shouldRestore bool shouldRestore bool
done chan bool markStale bool
done chan struct{}
terminated chan struct{} terminated chan struct{}
managerDone chan struct{} managerDone chan struct{}
@ -270,7 +271,7 @@ func NewGroup(o GroupOptions) *Group {
shouldRestore: o.ShouldRestore, shouldRestore: o.ShouldRestore,
opts: o.Opts, opts: o.Opts,
seriesInPreviousEval: make([]map[string]labels.Labels, len(o.Rules)), seriesInPreviousEval: make([]map[string]labels.Labels, len(o.Rules)),
done: make(chan bool), done: make(chan struct{}),
managerDone: o.done, managerDone: o.done,
terminated: make(chan struct{}), terminated: make(chan struct{}),
logger: log.With(o.Opts.Logger, "group", o.Name), logger: log.With(o.Opts.Logger, "group", o.Name),
@ -326,8 +327,8 @@ func (g *Group) run(ctx context.Context) {
tick := time.NewTicker(g.interval) tick := time.NewTicker(g.interval)
defer tick.Stop() defer tick.Stop()
makeStale := func(s bool) { defer func() {
if !s { if !g.markStale {
return return
} }
go func(now time.Time) { go func(now time.Time) {
@ -347,7 +348,7 @@ func (g *Group) run(ctx context.Context) {
g.cleanupStaleSeries(now) g.cleanupStaleSeries(now)
} }
}(time.Now()) }(time.Now())
} }()
iter() iter()
if g.shouldRestore { if g.shouldRestore {
@ -356,8 +357,7 @@ func (g *Group) run(ctx context.Context) {
// we might not have enough data scraped, and recording rules would not // we might not have enough data scraped, and recording rules would not
// have updated the latest values, on which some alerts might depend. // have updated the latest values, on which some alerts might depend.
select { select {
case stale := <-g.done: case <-g.done:
makeStale(stale)
return return
case <-tick.C: case <-tick.C:
missed := (time.Since(evalTimestamp) / g.interval) - 1 missed := (time.Since(evalTimestamp) / g.interval) - 1
@ -375,13 +375,11 @@ func (g *Group) run(ctx context.Context) {
for { for {
select { select {
case stale := <-g.done: case <-g.done:
makeStale(stale)
return return
default: default:
select { select {
case stale := <-g.done: case <-g.done:
makeStale(stale)
return return
case <-tick.C: case <-tick.C:
missed := (time.Since(evalTimestamp) / g.interval) - 1 missed := (time.Since(evalTimestamp) / g.interval) - 1
@ -396,11 +394,6 @@ func (g *Group) run(ctx context.Context) {
} }
} }
func (g *Group) stopAndMakeStale() {
g.done <- true
<-g.terminated
}
func (g *Group) stop() { func (g *Group) stop() {
close(g.done) close(g.done)
<-g.terminated <-g.terminated
@ -943,7 +936,8 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels
wg.Add(len(m.groups)) wg.Add(len(m.groups))
for n, oldg := range m.groups { for n, oldg := range m.groups {
go func(n string, g *Group) { go func(n string, g *Group) {
g.stopAndMakeStale() g.markStale = true
g.stop()
if m := g.metrics; m != nil { if m := g.metrics; m != nil {
m.groupInterval.DeleteLabelValues(n) m.groupInterval.DeleteLabelValues(n)
m.groupLastEvalTime.DeleteLabelValues(n) m.groupLastEvalTime.DeleteLabelValues(n)