rules: Make deleted rule series as stale after a reload (#6745)

* rules: Make deleted rule series as stale after a reload

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
This commit is contained in:
Julien Pivotto 2020-02-12 16:22:18 +01:00 committed by GitHub
parent c35438f2dd
commit 135cc30063
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 305 additions and 56 deletions

View file

@ -0,0 +1,5 @@
groups:
- name: test_2 copy
rules:
- record: test_2
expr: vector(2)

View file

@ -231,37 +231,48 @@ type Group struct {
shouldRestore bool shouldRestore bool
done chan struct{} done chan bool
terminated chan struct{} terminated chan struct{}
managerDone chan struct{}
logger log.Logger logger log.Logger
metrics *Metrics metrics *Metrics
} }
// NewGroup makes a new Group with the given name, options, and rules. type GroupOptions struct {
func NewGroup(name, file string, interval time.Duration, rules []Rule, shouldRestore bool, opts *ManagerOptions) *Group { Name, File string
metrics := opts.Metrics Interval time.Duration
if metrics == nil { Rules []Rule
metrics = NewGroupMetrics(opts.Registerer) ShouldRestore bool
Opts *ManagerOptions
done chan struct{}
} }
metrics.groupLastEvalTime.WithLabelValues(groupKey(file, name)) // NewGroup makes a new Group with the given name, options, and rules.
metrics.groupLastDuration.WithLabelValues(groupKey(file, name)) func NewGroup(o GroupOptions) *Group {
metrics.groupRules.WithLabelValues(groupKey(file, name)).Set(float64(len(rules))) metrics := o.Opts.Metrics
metrics.groupInterval.WithLabelValues(groupKey(file, name)).Set(interval.Seconds()) if metrics == nil {
metrics = NewGroupMetrics(o.Opts.Registerer)
}
metrics.groupLastEvalTime.WithLabelValues(groupKey(o.File, o.Name))
metrics.groupLastDuration.WithLabelValues(groupKey(o.File, o.Name))
metrics.groupRules.WithLabelValues(groupKey(o.File, o.Name)).Set(float64(len(o.Rules)))
metrics.groupInterval.WithLabelValues(groupKey(o.File, o.Name)).Set(o.Interval.Seconds())
return &Group{ return &Group{
name: name, name: o.Name,
file: file, file: o.File,
interval: interval, interval: o.Interval,
rules: rules, rules: o.Rules,
shouldRestore: shouldRestore, shouldRestore: o.ShouldRestore,
opts: opts, opts: o.Opts,
seriesInPreviousEval: make([]map[string]labels.Labels, len(rules)), seriesInPreviousEval: make([]map[string]labels.Labels, len(o.Rules)),
done: make(chan struct{}), done: make(chan bool),
managerDone: o.done,
terminated: make(chan struct{}), terminated: make(chan struct{}),
logger: log.With(opts.Logger, "group", name), logger: log.With(o.Opts.Logger, "group", o.Name),
metrics: metrics, metrics: metrics,
} }
} }
@ -314,6 +325,29 @@ func (g *Group) run(ctx context.Context) {
tick := time.NewTicker(g.interval) tick := time.NewTicker(g.interval)
defer tick.Stop() defer tick.Stop()
makeStale := func(s bool) {
if !s {
return
}
go func(now time.Time) {
for _, rule := range g.seriesInPreviousEval {
for _, r := range rule {
g.staleSeries = append(g.staleSeries, r)
}
}
// That can be garbage collected at this point.
g.seriesInPreviousEval = nil
// Wait for 2 intervals to give the opportunity to renamed rules
// to insert new series in the tsdb. At this point if there is a
// renamed rule, it should already be started.
select {
case <-g.managerDone:
case <-time.After(2 * g.interval):
g.cleanupStaleSeries(now)
}
}(time.Now())
}
iter() iter()
if g.shouldRestore { if g.shouldRestore {
// If we have to restore, we wait for another Eval to finish. // If we have to restore, we wait for another Eval to finish.
@ -321,7 +355,8 @@ func (g *Group) run(ctx context.Context) {
// we might not have enough data scraped, and recording rules would not // we might not have enough data scraped, and recording rules would not
// have updated the latest values, on which some alerts might depend. // have updated the latest values, on which some alerts might depend.
select { select {
case <-g.done: case stale := <-g.done:
makeStale(stale)
return return
case <-tick.C: case <-tick.C:
missed := (time.Since(evalTimestamp) / g.interval) - 1 missed := (time.Since(evalTimestamp) / g.interval) - 1
@ -339,11 +374,13 @@ func (g *Group) run(ctx context.Context) {
for { for {
select { select {
case <-g.done: case stale := <-g.done:
makeStale(stale)
return return
default: default:
select { select {
case <-g.done: case stale := <-g.done:
makeStale(stale)
return return
case <-tick.C: case <-tick.C:
missed := (time.Since(evalTimestamp) / g.interval) - 1 missed := (time.Since(evalTimestamp) / g.interval) - 1
@ -358,6 +395,11 @@ func (g *Group) run(ctx context.Context) {
} }
} }
func (g *Group) stopAndMakeStale() {
g.done <- true
<-g.terminated
}
func (g *Group) stop() { func (g *Group) stop() {
close(g.done) close(g.done)
<-g.terminated <-g.terminated
@ -596,8 +638,13 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
} }
}(i, rule) }(i, rule)
} }
g.cleanupStaleSeries(ts)
}
if len(g.staleSeries) != 0 { func (g *Group) cleanupStaleSeries(ts time.Time) {
if len(g.staleSeries) == 0 {
return
}
app, err := g.opts.Appendable.Appender() app, err := g.opts.Appendable.Appender()
if err != nil { if err != nil {
level.Warn(g.logger).Log("msg", "creating appender failed", "err", err) level.Warn(g.logger).Log("msg", "creating appender failed", "err", err)
@ -621,7 +668,6 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
g.staleSeries = nil g.staleSeries = nil
} }
} }
}
// RestoreForState restores the 'for' state of the alerts // RestoreForState restores the 'for' state of the alerts
// by looking up last ActiveAt from storage. // by looking up last ActiveAt from storage.
@ -784,6 +830,7 @@ type Manager struct {
groups map[string]*Group groups map[string]*Group
mtx sync.RWMutex mtx sync.RWMutex
block chan struct{} block chan struct{}
done chan struct{}
restored bool restored bool
logger log.Logger logger log.Logger
@ -825,6 +872,7 @@ func NewManager(o *ManagerOptions) *Manager {
groups: map[string]*Group{}, groups: map[string]*Group{},
opts: o, opts: o,
block: make(chan struct{}), block: make(chan struct{}),
done: make(chan struct{}),
logger: o.Logger, logger: o.Logger,
} }
@ -848,6 +896,10 @@ func (m *Manager) Stop() {
eg.stop() eg.stop()
} }
// Shut down the groups waiting multiple evaluation intervals to write
// staleness markers.
close(m.done)
level.Info(m.logger).Log("msg", "Rule manager stopped") level.Info(m.logger).Log("msg", "Rule manager stopped")
} }
@ -899,14 +951,18 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels
} }
// Stop remaining old groups. // Stop remaining old groups.
wg.Add(len(m.groups))
for n, oldg := range m.groups { for n, oldg := range m.groups {
oldg.stop() go func(n string, g *Group) {
if m := oldg.metrics; m != nil { g.stopAndMakeStale()
if m := g.metrics; m != nil {
m.groupInterval.DeleteLabelValues(n) m.groupInterval.DeleteLabelValues(n)
m.groupLastEvalTime.DeleteLabelValues(n) m.groupLastEvalTime.DeleteLabelValues(n)
m.groupLastDuration.DeleteLabelValues(n) m.groupLastDuration.DeleteLabelValues(n)
m.groupRules.DeleteLabelValues(n) m.groupRules.DeleteLabelValues(n)
} }
wg.Done()
}(n, oldg)
} }
wg.Wait() wg.Wait()
@ -962,7 +1018,15 @@ func (m *Manager) LoadGroups(
)) ))
} }
groups[groupKey(fn, rg.Name)] = NewGroup(rg.Name, fn, itv, rules, shouldRestore, m.opts) groups[groupKey(fn, rg.Name)] = NewGroup(GroupOptions{
Name: rg.Name,
File: fn,
Interval: itv,
Rules: rules,
ShouldRestore: shouldRestore,
Opts: m.opts,
done: m.done,
})
} }
} }

View file

@ -376,7 +376,13 @@ func TestForStateRestore(t *testing.T) {
nil, nil, true, nil, nil, nil, true, nil,
) )
group := NewGroup("default", "", time.Second, []Rule{rule}, true, opts) group := NewGroup(GroupOptions{
Name: "default",
Interval: time.Second,
Rules: []Rule{rule},
ShouldRestore: true,
Opts: opts,
})
groups := make(map[string]*Group) groups := make(map[string]*Group)
groups["default;"] = group groups["default;"] = group
@ -435,7 +441,13 @@ func TestForStateRestore(t *testing.T) {
labels.FromStrings("severity", "critical"), labels.FromStrings("severity", "critical"),
nil, nil, false, nil, nil, nil, false, nil,
) )
newGroup := NewGroup("default", "", time.Second, []Rule{newRule}, true, opts) newGroup := NewGroup(GroupOptions{
Name: "default",
Interval: time.Second,
Rules: []Rule{newRule},
ShouldRestore: true,
Opts: opts,
})
newGroups := make(map[string]*Group) newGroups := make(map[string]*Group)
newGroups["default;"] = newGroup newGroups["default;"] = newGroup
@ -519,7 +531,13 @@ func TestStaleness(t *testing.T) {
expr, err := promql.ParseExpr("a + 1") expr, err := promql.ParseExpr("a + 1")
testutil.Ok(t, err) testutil.Ok(t, err)
rule := NewRecordingRule("a_plus_one", expr, labels.Labels{}) rule := NewRecordingRule("a_plus_one", expr, labels.Labels{})
group := NewGroup("default", "", time.Second, []Rule{rule}, true, opts) group := NewGroup(GroupOptions{
Name: "default",
Interval: time.Second,
Rules: []Rule{rule},
ShouldRestore: true,
Opts: opts,
})
// A time series that has two samples and then goes stale. // A time series that has two samples and then goes stale.
app, _ := storage.Appender() app, _ := storage.Appender()
@ -842,7 +860,13 @@ func TestNotify(t *testing.T) {
expr, err := promql.ParseExpr("a > 1") expr, err := promql.ParseExpr("a > 1")
testutil.Ok(t, err) testutil.Ok(t, err)
rule := NewAlertingRule("aTooHigh", expr, 0, labels.Labels{}, labels.Labels{}, nil, true, log.NewNopLogger()) rule := NewAlertingRule("aTooHigh", expr, 0, labels.Labels{}, labels.Labels{}, nil, true, log.NewNopLogger())
group := NewGroup("alert", "", time.Second, []Rule{rule}, true, opts) group := NewGroup(GroupOptions{
Name: "alert",
Interval: time.Second,
Rules: []Rule{rule},
ShouldRestore: true,
Opts: opts,
})
app, _ := storage.Appender() app, _ := storage.Appender()
app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 1000, 2) app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 1000, 2)
@ -948,3 +972,152 @@ func TestMetricsUpdate(t *testing.T) {
testutil.Equals(t, c.metrics, countMetrics(), "test %d: invalid count of metrics", i) testutil.Equals(t, c.metrics, countMetrics(), "test %d: invalid count of metrics", i)
} }
} }
func TestGroupStalenessOnRemoval(t *testing.T) {
if testing.Short() {
t.Skip("skipping test in short mode.")
}
files := []string{"fixtures/rules2.yaml"}
sameFiles := []string{"fixtures/rules2_copy.yaml"}
storage := teststorage.New(t)
defer storage.Close()
opts := promql.EngineOpts{
Logger: nil,
Reg: nil,
MaxSamples: 10,
Timeout: 10 * time.Second,
}
engine := promql.NewEngine(opts)
ruleManager := NewManager(&ManagerOptions{
Appendable: storage,
TSDB: storage,
QueryFunc: EngineQueryFunc(engine, storage),
Context: context.Background(),
Logger: log.NewNopLogger(),
})
var stopped bool
ruleManager.Run()
defer func() {
if !stopped {
ruleManager.Stop()
}
}()
cases := []struct {
files []string
staleNaN int
}{
{
files: files,
staleNaN: 0,
},
{
// When we remove the files, it should produce a staleness marker.
files: files[:0],
staleNaN: 1,
},
{
// Rules that produce the same metrics but in a different file
// should not produce staleness marker.
files: sameFiles,
staleNaN: 0,
},
{
// Staleness marker should be present as we don't have any rules
// loaded anymore.
files: files[:0],
staleNaN: 1,
},
{
// Add rules back so we have rules loaded when we stop the manager
// and check for the absence of staleness markers.
files: sameFiles,
staleNaN: 0,
},
}
var totalStaleNaN int
for i, c := range cases {
err := ruleManager.Update(time.Second, c.files, nil)
testutil.Ok(t, err)
time.Sleep(3 * time.Second)
totalStaleNaN += c.staleNaN
testutil.Equals(t, totalStaleNaN, countStaleNaN(t, storage), "test %d/%q: invalid count of staleness markers", i, c.files)
}
ruleManager.Stop()
stopped = true
testutil.Equals(t, totalStaleNaN, countStaleNaN(t, storage), "invalid count of staleness markers after stopping the engine")
}
func TestMetricsStalenessOnManagerShutdown(t *testing.T) {
if testing.Short() {
t.Skip("skipping test in short mode.")
}
files := []string{"fixtures/rules2.yaml"}
storage := teststorage.New(t)
defer storage.Close()
opts := promql.EngineOpts{
Logger: nil,
Reg: nil,
MaxSamples: 10,
Timeout: 10 * time.Second,
}
engine := promql.NewEngine(opts)
ruleManager := NewManager(&ManagerOptions{
Appendable: storage,
TSDB: storage,
QueryFunc: EngineQueryFunc(engine, storage),
Context: context.Background(),
Logger: log.NewNopLogger(),
})
var stopped bool
ruleManager.Run()
defer func() {
if !stopped {
ruleManager.Stop()
}
}()
err := ruleManager.Update(2*time.Second, files, nil)
time.Sleep(4 * time.Second)
testutil.Ok(t, err)
start := time.Now()
err = ruleManager.Update(3*time.Second, files[:0], nil)
testutil.Ok(t, err)
ruleManager.Stop()
stopped = true
testutil.Assert(t, time.Since(start) < 1*time.Second, "rule manager does not stop early")
time.Sleep(5 * time.Second)
testutil.Equals(t, 0, countStaleNaN(t, storage), "invalid count of staleness markers after stopping the engine")
}
func countStaleNaN(t *testing.T, storage storage.Storage) int {
var c int
querier, err := storage.Querier(context.Background(), 0, time.Now().Unix()*1000)
testutil.Ok(t, err)
defer querier.Close()
matcher, err := labels.NewMatcher(labels.MatchEqual, model.MetricNameLabel, "test_2")
testutil.Ok(t, err)
set, _, err := querier.Select(nil, matcher)
testutil.Ok(t, err)
samples, err := readSeriesSet(set)
testutil.Ok(t, err)
metric := labels.FromStrings(model.MetricNameLabel, "test_2").String()
metricSample, ok := samples[metric]
testutil.Assert(t, ok, "Series %s not returned.", metric)
for _, s := range metricSample {
if value.IsStaleNaN(s.V) {
c++
}
}
return c
}

View file

@ -258,7 +258,14 @@ func (m rulesRetrieverMock) RuleGroups() []*rules.Group {
recordingRule := rules.NewRecordingRule("recording-rule-1", recordingExpr, labels.Labels{}) recordingRule := rules.NewRecordingRule("recording-rule-1", recordingExpr, labels.Labels{})
r = append(r, recordingRule) r = append(r, recordingRule)
group := rules.NewGroup("grp", "/path/to/file", time.Second, r, false, opts) group := rules.NewGroup(rules.GroupOptions{
Name: "grp",
File: "/path/to/file",
Interval: time.Second,
Rules: r,
ShouldRestore: false,
Opts: opts,
})
return []*rules.Group{group} return []*rules.Group{group}
} }