Implement config reloading on SIGHUP.

With this commit, sending SIGHUP to the Prometheus process will reload
and apply the configuration file. The different components attempt
to handle failing changes gracefully.
This commit is contained in:
Fabian Reinartz 2015-05-12 16:52:56 +02:00
parent 3b0777ff84
commit bb540fd9fd
6 changed files with 103 additions and 73 deletions

53
main.go
View file

@ -92,13 +92,6 @@ type prometheus struct {
// NewPrometheus creates a new prometheus object based on flag values.
// Call Serve() to start serving and Close() for clean shutdown.
func NewPrometheus() *prometheus {
conf, err := config.LoadFromFile(*configFile)
if err != nil {
glog.Errorf("Couldn't load configuration (-config.file=%s): %v", *configFile, err)
glog.Errorf("Note: The configuration format has changed with version 0.14, please check the documentation.")
os.Exit(2)
}
notificationHandler := notification.NewNotificationHandler(*alertmanagerURL, *notificationQueueCapacity)
var syncStrategy local.SyncStrategy
@ -155,26 +148,17 @@ func NewPrometheus() *prometheus {
sampleAppender = fanout
}
targetManager, err := retrieval.NewTargetManager(conf, sampleAppender)
if err != nil {
glog.Errorf("Error creating target manager: %s", err)
os.Exit(1)
}
targetManager := retrieval.NewTargetManager(sampleAppender)
queryEngine := promql.NewEngine(memStorage)
ruleManager := rules.NewManager(&rules.ManagerOptions{
SampleAppender: sampleAppender,
NotificationHandler: notificationHandler,
EvaluationInterval: time.Duration(conf.GlobalConfig.EvaluationInterval),
QueryEngine: queryEngine,
PrometheusURL: web.MustBuildServerURL(*pathPrefix),
PathPrefix: *pathPrefix,
})
if err := ruleManager.LoadRuleFiles(conf.RuleFiles...); err != nil {
glog.Errorf("Error loading rule files: %s", err)
os.Exit(1)
}
flags := map[string]string{}
flag.VisitAll(func(f *flag.Flag) {
@ -182,7 +166,6 @@ func NewPrometheus() *prometheus {
})
prometheusStatus := &web.PrometheusStatusHandler{
BuildInfo: BuildInfo,
Config: conf.String(),
RuleManager: ruleManager,
TargetPools: targetManager.Pools,
Flags: flags,
@ -229,9 +212,27 @@ func NewPrometheus() *prometheus {
webService: webService,
}
webService.QuitChan = make(chan struct{})
p.reloadConfig()
return p
}
// reloadConfig re-reads the configuration file and hands the result to every
// component that consumes it. If the file cannot be loaded, an error is
// logged and the previously applied configuration stays in effect.
func (p *prometheus) reloadConfig() {
	glog.Infof("Loading configuration file %s", *configFile)

	cfg, err := config.LoadFromFile(*configFile)
	if err != nil {
		glog.Errorf("Couldn't load configuration (-config.file=%s): %v", *configFile, err)
		glog.Errorf("Note: The configuration format has changed with version 0.14, please check the documentation.")
		return
	}

	// Propagate the new configuration to the status page, the scrape
	// target manager, and the rule manager, in that order.
	p.webService.StatusHandler.ApplyConfig(cfg)
	p.targetManager.ApplyConfig(cfg)
	p.ruleManager.ApplyConfig(cfg)
}
// Serve starts the Prometheus server. It returns after the server has been shut
// down. The method installs an interrupt handler, allowing a shutdown to be
// triggered by sending SIGTERM to the process.
@ -252,15 +253,25 @@ func (p *prometheus) Serve() {
}
}()
notifier := make(chan os.Signal)
signal.Notify(notifier, os.Interrupt, syscall.SIGTERM)
hup := make(chan os.Signal)
signal.Notify(hup, syscall.SIGHUP)
go func() {
for range hup {
p.reloadConfig()
}
}()
term := make(chan os.Signal)
signal.Notify(term, os.Interrupt, syscall.SIGTERM)
select {
case <-notifier:
case <-term:
glog.Warning("Received SIGTERM, exiting gracefully...")
case <-p.webService.QuitChan:
glog.Warning("Received termination request via web service, exiting gracefully...")
}
close(hup)
p.targetManager.Stop()
p.ruleManager.Stop()
p.queryEngine.Stop()

View file

@ -285,6 +285,7 @@ func (t *target) RunScraper(sampleAppender storage.SampleAppender) {
// On changed scrape interval the new interval becomes effective
// after the next scrape.
if lastScrapeInterval != t.scrapeInterval {
ticker.Stop()
ticker = time.NewTicker(t.scrapeInterval)
lastScrapeInterval = t.scrapeInterval
}

View file

@ -62,16 +62,13 @@ type TargetManager struct {
providers map[*config.ScrapeConfig][]TargetProvider
}
// NewTargetManager creates a new TargetManager based on the given config.
func NewTargetManager(cfg *config.Config, sampleAppender storage.SampleAppender) (*TargetManager, error) {
// NewTargetManager creates a new TargetManager.
func NewTargetManager(sampleAppender storage.SampleAppender) *TargetManager {
tm := &TargetManager{
sampleAppender: sampleAppender,
targets: make(map[string][]Target),
}
if err := tm.applyConfig(cfg); err != nil {
return nil, err
}
return tm, nil
return tm
}
// Run starts background processing to handle target updates.
@ -129,19 +126,17 @@ func fullSource(cfg *config.ScrapeConfig, src string) string {
// Stop all background processing.
func (tm *TargetManager) Stop() {
tm.stop(true)
tm.m.Lock()
defer tm.m.Unlock()
if tm.running {
tm.stop(true)
}
}
// stop background processing of the target manager. If removeTargets is true,
// existing targets will be stopped and removed.
func (tm *TargetManager) stop(removeTargets bool) {
tm.m.Lock()
defer tm.m.Unlock()
if !tm.running {
return
}
glog.Info("Stopping target manager...")
defer glog.Info("Target manager stopped.")
@ -273,35 +268,23 @@ func (tm *TargetManager) Pools() map[string][]Target {
// ApplyConfig resets the manager's target providers and job configurations as defined
// by the new cfg. The state of targets that are valid in the new configuration remains unchanged.
func (tm *TargetManager) ApplyConfig(cfg *config.Config) error {
tm.stop(false)
// Even if updating the config failed, we want to continue rather than stop scraping anything.
defer tm.Run()
if err := tm.applyConfig(cfg); err != nil {
glog.Warningf("Error updating config, changes not applied: %s", err)
return err
}
return nil
}
func (tm *TargetManager) applyConfig(cfg *config.Config) error {
// Only apply changes if everything was successful.
providers := map[*config.ScrapeConfig][]TargetProvider{}
for _, scfg := range cfg.ScrapeConfigs {
provs, err := ProvidersFromConfig(scfg)
if err != nil {
return err
}
providers[scfg] = provs
}
func (tm *TargetManager) ApplyConfig(cfg *config.Config) {
tm.m.Lock()
defer tm.m.Unlock()
if tm.running {
tm.stop(false)
// Even if updating the config failed, we want to continue rather than stop scraping anything.
defer tm.Run()
}
providers := map[*config.ScrapeConfig][]TargetProvider{}
for _, scfg := range cfg.ScrapeConfigs {
providers[scfg] = ProvidersFromConfig(scfg)
}
tm.globalLabels = cfg.GlobalConfig.Labels
tm.providers = providers
return nil
}
// targetsFromGroup builds targets based on the given TargetGroup and config.
@ -335,7 +318,7 @@ func (tm *TargetManager) targetsFromGroup(tg *config.TargetGroup, cfg *config.Sc
labels, err := Relabel(labels, cfg.RelabelConfigs...)
if err != nil {
return nil, fmt.Errorf("error while relabelling instance %d in target group %s: %s", i, tg, err)
return nil, fmt.Errorf("error while relabeling instance %d in target group %s: %s", i, tg, err)
}
// Check if the target was dropped.
if labels == nil {
@ -357,7 +340,7 @@ func (tm *TargetManager) targetsFromGroup(tg *config.TargetGroup, cfg *config.Sc
}
// ProvidersFromConfig returns all TargetProviders configured in cfg.
func ProvidersFromConfig(cfg *config.ScrapeConfig) ([]TargetProvider, error) {
func ProvidersFromConfig(cfg *config.ScrapeConfig) []TargetProvider {
var providers []TargetProvider
for _, dnscfg := range cfg.DNSSDConfigs {
@ -367,7 +350,7 @@ func ProvidersFromConfig(cfg *config.ScrapeConfig) ([]TargetProvider, error) {
if len(cfg.TargetGroups) > 0 {
providers = append(providers, NewStaticProvider(cfg.TargetGroups))
}
return providers, nil
return providers
}
// StaticProvider holds a list of target groups that never change.

View file

@ -277,19 +277,15 @@ func TestTargetManagerConfigUpdate(t *testing.T) {
}
conf := &config.Config{DefaultedConfig: config.DefaultConfig}
targetManager, err := NewTargetManager(conf, nopAppender{})
if err != nil {
t.Fatal(err)
}
targetManager := NewTargetManager(nopAppender{})
targetManager.ApplyConfig(conf)
targetManager.Run()
defer targetManager.Stop()
for i, step := range sequence {
conf.ScrapeConfigs = step.scrapeConfigs
err := targetManager.ApplyConfig(conf)
if err != nil {
t.Fatal(err)
}
targetManager.ApplyConfig(conf)
<-time.After(1 * time.Millisecond)

View file

@ -24,6 +24,7 @@ import (
clientmodel "github.com/prometheus/client_golang/model"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/notification"
"github.com/prometheus/prometheus/promql"
"github.com/prometheus/prometheus/storage"
@ -120,7 +121,11 @@ func NewManager(o *ManagerOptions) *Manager {
func (m *Manager) Run() {
defer glog.Info("Rule manager stopped.")
ticker := time.NewTicker(m.interval)
m.Lock()
lastInterval := m.interval
m.Unlock()
ticker := time.NewTicker(lastInterval)
defer ticker.Stop()
for {
@ -137,6 +142,14 @@ func (m *Manager) Run() {
start := time.Now()
m.runIteration()
iterationDuration.Observe(float64(time.Since(start) / time.Millisecond))
m.Lock()
if lastInterval != m.interval {
ticker.Stop()
ticker = time.NewTicker(m.interval)
lastInterval = m.interval
}
m.Unlock()
case <-m.done:
return
}
@ -255,11 +268,27 @@ func (m *Manager) runIteration() {
wg.Wait()
}
// LoadRuleFiles loads alerting and recording rules from the given files.
func (m *Manager) LoadRuleFiles(filenames ...string) error {
// ApplyConfig updates the rule manager's state as the config requires. If
// loading the new rules failed the old rule set is restored.
func (m *Manager) ApplyConfig(conf *config.Config) {
	m.Lock()
	defer m.Unlock()

	m.interval = time.Duration(conf.GlobalConfig.EvaluationInterval)

	// Snapshot the active rules so they can be restored should the new
	// rule files fail to load.
	prevRules := make([]Rule, len(m.rules))
	copy(prevRules, m.rules)
	m.rules = m.rules[:0]

	if err := m.loadRuleFiles(conf.RuleFiles...); err == nil {
		return
	} else {
		// Loading failed; fall back to the previous rule set.
		m.rules = prevRules
		glog.Errorf("Error loading rules, previous rule set restored: %s", err)
	}
}
// loadRuleFiles loads alerting and recording rules from the given files.
func (m *Manager) loadRuleFiles(filenames ...string) error {
for _, fn := range filenames {
content, err := ioutil.ReadFile(fn)
if err != nil {

View file

@ -18,6 +18,7 @@ import (
"sync"
"time"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/retrieval"
"github.com/prometheus/prometheus/rules"
)
@ -47,5 +48,14 @@ func (h *PrometheusStatusHandler) TargetStateToClass() map[retrieval.TargetState
}
// ServeHTTP renders the status page. The read lock protects the handler's
// mutable state (notably Config, which ApplyConfig rewrites) while the
// template executes.
func (h *PrometheusStatusHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	h.mu.RLock()
	defer h.mu.RUnlock()

	executeTemplate(w, "status", h, h.PathPrefix)
}
// ApplyConfig updates the status handler's state as the new config requires.
// The rendered configuration string is swapped under the write lock so that
// concurrent ServeHTTP calls always observe a consistent value.
func (h *PrometheusStatusHandler) ApplyConfig(conf *config.Config) {
	h.mu.Lock()
	defer h.mu.Unlock()

	h.Config = conf.String()
}