Stop rule manager before TSDB is stopped (#10680)

During shutdown TSDB is stopped before rule manager is stopped. Since  TSDB shutdown can take a long time (minutes or 10s of minutes) it keeps rule manager running while parts of Prometheus are already stopped (most notebly scrape manager). This can cause false positive alerts to fire, mostly those that rely on absent() calls since new sample appends will stop while alert queries are still evaluated.
Stop rules before stopping TSDB and scrape manager to avoid this problem.

Signed-off-by: Łukasz Mierzwa <l.mierzwa@gmail.com>
This commit is contained in:
Łukasz Mierzwa 2022-05-20 22:26:06 +01:00 committed by GitHub
parent 42f574d5ac
commit d3c9c4f574
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 16 additions and 12 deletions

View file

@ -838,6 +838,19 @@ func main() {
},
)
}
if !agentMode {
// Rule manager.
g.Add(
func() error {
<-reloadReady.C
ruleManager.Run()
return nil
},
func(err error) {
ruleManager.Stop()
},
)
}
{
// Scrape manager.
g.Add(
@ -855,6 +868,8 @@ func main() {
func(err error) {
// Scrape manager needs to be stopped before closing the local TSDB
// so that it doesn't try to write samples to a closed storage.
// We should also wait for rule manager to be fully stopped to ensure
// we don't trigger any false positive alerts for rules using absent().
level.Info(logger).Log("msg", "Stopping scrape manager...")
scrapeManager.Stop()
},
@ -940,18 +955,6 @@ func main() {
)
}
if !agentMode {
// Rule manager.
g.Add(
func() error {
<-reloadReady.C
ruleManager.Run()
return nil
},
func(err error) {
ruleManager.Stop()
},
)
// TSDB.
opts := cfg.tsdb.ToTSDBOptions()
cancel := make(chan struct{})

View file

@ -934,6 +934,7 @@ func NewManager(o *ManagerOptions) *Manager {
// Run starts processing of the rule manager. It is blocking.
func (m *Manager) Run() {
level.Info(m.logger).Log("msg", "Starting rule manager...")
m.start()
<-m.done
}