From 9254279be130b96019188b5915543a283b3c0f7e Mon Sep 17 00:00:00 2001 From: bwplotka Date: Tue, 8 Aug 2023 08:56:43 +0100 Subject: [PATCH] Add option to auto detect configuration changes. Fixes https://github.com/prometheus/prometheus/issues/9783#issuecomment-1669341273 Signed-off-by: bwplotka --- cmd/prometheus/main.go | 88 ++++++++++++++++++++++++++++++++++++++++- util/fswatch/fswatch.go | 58 +++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 1 deletion(-) create mode 100644 util/fswatch/fswatch.go diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index d48898b94..7d7b56569 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -47,6 +47,7 @@ import ( promlogflag "github.com/prometheus/common/promlog/flag" "github.com/prometheus/common/version" toolkit_web "github.com/prometheus/exporter-toolkit/web" + "github.com/prometheus/prometheus/util/fswatch" "go.uber.org/atomic" "go.uber.org/automaxprocs/maxprocs" "k8s.io/klog" @@ -128,7 +129,9 @@ func agentOnlyFlag(app *kingpin.Application, name, help string) *kingpin.FlagCla } type flagConfig struct { - configFile string + configFile string + configWatchInterval model.Duration + configWatchDelay model.Duration agentStoragePath string serverStoragePath string @@ -253,6 +256,19 @@ func main() { a.Flag("config.file", "Prometheus configuration file path."). Default("prometheus.yml").StringVar(&cfg.configFile) + a.Flag("config.watch-interval", "For non-zero duration, Prometheus will watch for configuration file changes, "+ + "as well as, previously specified rule changes and scrape config file changes. Once changes are noticed,"+ + "and after delay specified in --config.watch-delay, Prometheus will self-reload. "+ + "Change detection is done through filesystem inotify with the regular interval specified in the flag, as well as, checksum validation."+ + "With this flag, there is no need to reload Prometheus on configuration changes from the outside."). + Default("0").SetValue(&cfg.configWatchInterval) + + a.Flag("config.watch-delay", "The duration between noticing the configuration changes "+ + "and Prometheus self-reloading. Needed for throttling reloads which can be expensive."+ + "For automation that updates configuration it's common to update file one by one within seconds."+ + "Delay allows waiting some time to perform one reload for multiple small changes within short period"). + Default("30s").SetValue(&cfg.configWatchDelay) + a.Flag("web.listen-address", "Address to listen on for UI, API, and telemetry."). Default("0.0.0.0:9090").StringVar(&cfg.web.ListenAddress) @@ -723,6 +739,18 @@ func main() { // This is passed to ruleManager.Update(). externalURL := cfg.web.ExternalURL.String() + var configWatcher *fswatch.Watch + if cfg.configWatchInterval > 0 { + configWatcher = fswatch.New( + prometheus.DefaultRegisterer, + "config", + logger, + time.Duration(cfg.configWatchInterval), + time.Duration(cfg.configWatchDelay), + ) + configWatcher.AddFiles(context.TODO(), cfg.configFile) + } + reloaders := []reloader{ { name: "db_storage", @@ -815,6 +843,44 @@ func main() { }, } + if configWatcher != nil { + reloaders = append(reloaders, reloader{ + name: configWatcher.Name(), + reloader: func(c *config.Config) error { + if agentMode { + // No-op in Agent mode + return nil + } + + ctx := context.TODO() + if err := configWatcher.Reset(ctx); err != nil { + return err + } + if err := configWatcher.AddFiles(ctx, cfg.configFile); err != nil { + return err + } + + // Get all rule files matching the configuration paths. + var files []string + for _, pat := range c.RuleFiles { + fs, err := filepath.Glob(pat) + if err != nil { + // The only error can be a bad pattern. + return fmt.Errorf("error retrieving rule files for %s: %w", pat, err) + } + files = append(files, fs...) + } + + if err := configWatcher.AddFiles(ctx, files...); err != nil { + return err + } + + // TODO: Add scrape config files + return nil + }, + }) + } + prometheus.MustRegister(configSuccess) prometheus.MustRegister(configSuccessTime) @@ -941,6 +1007,20 @@ func main() { }, ) } + // Optional configuration file watcher. + if configWatcher != nil { + ctx, cancel := context.WithCancel(context.Background()) + g.Add( + func() error { + <-reloadReady.C + configWatcher.Run(ctx) + return nil + }, + func(err error) { + cancel() + }, + ) + } { // Tracing manager. g.Add( @@ -971,6 +1051,7 @@ func main() { case <-hup: if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil { level.Error(logger).Log("msg", "Error reloading config", "err", err) + // TODO: metric? } case rc := <-webHandler.Reload(): if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil { @@ -979,6 +1060,11 @@ func main() { } else { rc <- nil } + case <-configWatcher.FilesChanged(): + if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil { + level.Error(logger).Log("msg", "Error reloading config after watcher notification", "err", err) + // TODO: metric? + } case <-cancel: return nil } diff --git a/util/fswatch/fswatch.go b/util/fswatch/fswatch.go new file mode 100644 index 000000000..a0849398e --- /dev/null +++ b/util/fswatch/fswatch.go @@ -0,0 +1,58 @@ +package fswatch + +import ( + "context" + "errors" + "time" + + "github.com/go-kit/log" + "github.com/prometheus/client_golang/prometheus" +) + +type Watch struct { + name string + logger log.Logger + interval time.Duration + delay time.Duration + + // TODO: Close it, handle it. + notifyCh chan struct{} +} + +func New( + reg prometheus.Registerer, + name string, + logger log.Logger, + interval time.Duration, + delay time.Duration, +) *Watch { + // TODO: Add metrics + + return &Watch{name: name, logger: logger, interval: interval, delay: delay, notifyCh: make(chan struct{}, 1)} +} + +// TODO(bwplotka): In future we could consider string slice channel to mention which files changed. For now +// reload does not care (it reloads all). +// Only one caller at the time can use this. +func (w *Watch) FilesChanged() <-chan struct{} { + if w == nil { + return nil // A receive from a nil channel blocks forever. + } + return w.notifyCh +} + +func (w *Watch) Name() string { return w.name } + +func (w *Watch) AddFiles(ctx context.Context, file ...string) error { + return errors.New("not implemented") +} + +func (w *Watch) Reset(ctx context.Context) error { + return errors.New("not implemented") +} + +// Run errors means run stopped working, it should be safe to restart by calling Run again. +func (w *Watch) Run(ctx context.Context) error { + // TODO: Copy the code from https://github.com/thanos-io/thanos/blob/main/pkg/reloader/reloader.go and adjust based on requirements + return errors.New("not implemented") +}