Add option to auto-detect configuration changes.

Fixes https://github.com/prometheus/prometheus/issues/9783#issuecomment-1669341273

Signed-off-by: bwplotka <bwplotka@gmail.com>
This commit is contained in:
bwplotka 2023-08-08 08:56:43 +01:00
parent 8d47b3d497
commit 9254279be1
2 changed files with 145 additions and 1 deletions

View file

@ -47,6 +47,7 @@ import (
promlogflag "github.com/prometheus/common/promlog/flag" promlogflag "github.com/prometheus/common/promlog/flag"
"github.com/prometheus/common/version" "github.com/prometheus/common/version"
toolkit_web "github.com/prometheus/exporter-toolkit/web" toolkit_web "github.com/prometheus/exporter-toolkit/web"
"github.com/prometheus/prometheus/util/fswatch"
"go.uber.org/atomic" "go.uber.org/atomic"
"go.uber.org/automaxprocs/maxprocs" "go.uber.org/automaxprocs/maxprocs"
"k8s.io/klog" "k8s.io/klog"
@ -128,7 +129,9 @@ func agentOnlyFlag(app *kingpin.Application, name, help string) *kingpin.FlagCla
} }
type flagConfig struct { type flagConfig struct {
configFile string configFile string
configWatchInterval model.Duration
configWatchDelay model.Duration
agentStoragePath string agentStoragePath string
serverStoragePath string serverStoragePath string
@ -253,6 +256,19 @@ func main() {
a.Flag("config.file", "Prometheus configuration file path."). a.Flag("config.file", "Prometheus configuration file path.").
Default("prometheus.yml").StringVar(&cfg.configFile) Default("prometheus.yml").StringVar(&cfg.configFile)
a.Flag("config.watch-interval", "For non-zero duration, Prometheus will watch for configuration file changes, "+
"as well as, previously specified rule changes and scrape config file changes. Once changes are noticed,"+
"and after delay specified in --config.watch-delay, Prometheus will self-reload. "+
"Change detection is done through filesystem inotify with the regular interval specified in the flag, as well as, checksum validation."+
"With this flag, there is no need to reload Prometheus on configuration changes from the outside.").
Default("0").SetValue(&cfg.configWatchInterval)
a.Flag("config.watch-delay", "The duration between noticing the configuration changes "+
"and Prometheus self-reloading. Needed for throttling reloads which can be expensive."+
"For automation that updates configuration it's common to update file one by one within seconds."+
"Delay allows waiting some time to perform one reload for multiple small changes within short period").
Default("30s").SetValue(&cfg.configWatchDelay)
a.Flag("web.listen-address", "Address to listen on for UI, API, and telemetry."). a.Flag("web.listen-address", "Address to listen on for UI, API, and telemetry.").
Default("0.0.0.0:9090").StringVar(&cfg.web.ListenAddress) Default("0.0.0.0:9090").StringVar(&cfg.web.ListenAddress)
@ -723,6 +739,18 @@ func main() {
// This is passed to ruleManager.Update(). // This is passed to ruleManager.Update().
externalURL := cfg.web.ExternalURL.String() externalURL := cfg.web.ExternalURL.String()
var configWatcher *fswatch.Watch
if cfg.configWatchInterval > 0 {
configWatcher = fswatch.New(
prometheus.DefaultRegisterer,
"config",
logger,
time.Duration(cfg.configWatchInterval),
time.Duration(cfg.configWatchDelay),
)
configWatcher.AddFiles(context.TODO(), cfg.configFile)
}
reloaders := []reloader{ reloaders := []reloader{
{ {
name: "db_storage", name: "db_storage",
@ -815,6 +843,44 @@ func main() {
}, },
} }
if configWatcher != nil {
reloaders = append(reloaders, reloader{
name: configWatcher.Name(),
reloader: func(c *config.Config) error {
if agentMode {
// No-op in Agent mode
return nil
}
ctx := context.TODO()
if err := configWatcher.Reset(ctx); err != nil {
return err
}
if err := configWatcher.AddFiles(ctx, cfg.configFile); err != nil {
return err
}
// Get all rule files matching the configuration paths.
var files []string
for _, pat := range c.RuleFiles {
fs, err := filepath.Glob(pat)
if err != nil {
// The only error can be a bad pattern.
return fmt.Errorf("error retrieving rule files for %s: %w", pat, err)
}
files = append(files, fs...)
}
if err := configWatcher.AddFiles(ctx, files...); err != nil {
return err
}
// TODO: Add scrape config files
return nil
},
})
}
prometheus.MustRegister(configSuccess) prometheus.MustRegister(configSuccess)
prometheus.MustRegister(configSuccessTime) prometheus.MustRegister(configSuccessTime)
@ -941,6 +1007,20 @@ func main() {
}, },
) )
} }
// Optional configuration file watcher.
if configWatcher != nil {
ctx, cancel := context.WithCancel(context.Background())
g.Add(
func() error {
<-reloadReady.C
configWatcher.Run(ctx)
return nil
},
func(err error) {
cancel()
},
)
}
{ {
// Tracing manager. // Tracing manager.
g.Add( g.Add(
@ -971,6 +1051,7 @@ func main() {
case <-hup: case <-hup:
if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil { if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil {
level.Error(logger).Log("msg", "Error reloading config", "err", err) level.Error(logger).Log("msg", "Error reloading config", "err", err)
// TODO: metric?
} }
case rc := <-webHandler.Reload(): case rc := <-webHandler.Reload():
if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil { if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil {
@ -979,6 +1060,11 @@ func main() {
} else { } else {
rc <- nil rc <- nil
} }
case <-configWatcher.FilesChanged():
if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil {
level.Error(logger).Log("msg", "Error reloading config after watcher notification", "err", err)
// TODO: metric?
}
case <-cancel: case <-cancel:
return nil return nil
} }

58
util/fswatch/fswatch.go Normal file
View file

@ -0,0 +1,58 @@
package fswatch
import (
"context"
"errors"
"time"
"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
)
// Watch carries the state of a named file watcher: its logger, the interval
// and delay values it was given, and the channel handed out by FilesChanged.
type Watch struct {
	name   string
	logger log.Logger

	interval, delay time.Duration

	// TODO: Close it, handle it.
	notifyCh chan struct{}
}
// New builds a Watch from the given name, logger, interval and delay.
//
// The notification channel is created with a buffer of one element. The reg
// argument is accepted but not used yet.
func New(
	reg prometheus.Registerer,
	name string,
	logger log.Logger,
	interval time.Duration,
	delay time.Duration,
) *Watch {
	// TODO: Add metrics
	w := new(Watch)
	w.name = name
	w.logger = logger
	w.interval = interval
	w.delay = delay
	w.notifyCh = make(chan struct{}, 1)
	return w
}
// FilesChanged returns the receive-only notification channel of the watcher.
// On a nil *Watch it returns a nil channel; a receive from a nil channel
// blocks forever, so a select case reading from it never fires.
//
// Only one caller at a time can use this.
//
// TODO(bwplotka): In future we could consider string slice channel to mention which files changed. For now
// reload does not care (it reloads all).
func (w *Watch) FilesChanged() <-chan struct{} {
	if w != nil {
		return w.notifyCh
	}
	return nil
}
// Name returns the name stored in the watcher.
func (w *Watch) Name() string {
	return w.name
}
// AddFiles is not implemented yet: it ignores its arguments and always
// returns a "not implemented" error.
func (w *Watch) AddFiles(ctx context.Context, file ...string) error {
	notImplemented := errors.New("not implemented")
	return notImplemented
}
// Reset is not implemented yet: it ignores ctx and always returns a
// "not implemented" error.
func (w *Watch) Reset(ctx context.Context) error {
	var err error = errors.New("not implemented")
	return err
}
// Run is not implemented yet: it ignores ctx and always returns a
// "not implemented" error.
//
// Intended contract: an error from Run means the watcher stopped working, and
// it should be safe to restart it by calling Run again.
func (w *Watch) Run(ctx context.Context) error {
	// TODO: Copy the code from https://github.com/thanos-io/thanos/blob/main/pkg/reloader/reloader.go and adjust based on requirements
	err := errors.New("not implemented")
	return err
}