Implement config reloading on SIGHUP.

With this commit, sending SIGHUP to the Prometheus process will reload
and apply the configuration file. The different components attempt
to handle failing changes gracefully.
This commit is contained in:
Fabian Reinartz 2015-05-12 16:52:56 +02:00
parent 3b0777ff84
commit bb540fd9fd
6 changed files with 103 additions and 73 deletions

53
main.go
View file

@ -92,13 +92,6 @@ type prometheus struct {
// NewPrometheus creates a new prometheus object based on flag values.
// Call Serve() to start serving and Close() for clean shutdown.
func NewPrometheus() *prometheus {
conf, err := config.LoadFromFile(*configFile)
if err != nil {
glog.Errorf("Couldn't load configuration (-config.file=%s): %v", *configFile, err)
glog.Errorf("Note: The configuration format has changed with version 0.14, please check the documentation.")
os.Exit(2)
}
notificationHandler := notification.NewNotificationHandler(*alertmanagerURL, *notificationQueueCapacity)
var syncStrategy local.SyncStrategy
@ -155,26 +148,17 @@ func NewPrometheus() *prometheus {
sampleAppender = fanout
}
targetManager, err := retrieval.NewTargetManager(conf, sampleAppender)
if err != nil {
glog.Errorf("Error creating target manager: %s", err)
os.Exit(1)
}
targetManager := retrieval.NewTargetManager(sampleAppender)
queryEngine := promql.NewEngine(memStorage)
ruleManager := rules.NewManager(&rules.ManagerOptions{
SampleAppender: sampleAppender,
NotificationHandler: notificationHandler,
EvaluationInterval: time.Duration(conf.GlobalConfig.EvaluationInterval),
QueryEngine: queryEngine,
PrometheusURL: web.MustBuildServerURL(*pathPrefix),
PathPrefix: *pathPrefix,
})
if err := ruleManager.LoadRuleFiles(conf.RuleFiles...); err != nil {
glog.Errorf("Error loading rule files: %s", err)
os.Exit(1)
}
flags := map[string]string{}
flag.VisitAll(func(f *flag.Flag) {
@ -182,7 +166,6 @@ func NewPrometheus() *prometheus {
})
prometheusStatus := &web.PrometheusStatusHandler{
BuildInfo: BuildInfo,
Config: conf.String(),
RuleManager: ruleManager,
TargetPools: targetManager.Pools,
Flags: flags,
@ -229,9 +212,27 @@ func NewPrometheus() *prometheus {
webService: webService,
}
webService.QuitChan = make(chan struct{})
p.reloadConfig()
return p
}
// reloadConfig re-reads the configuration file and hands the result to every
// component that consumes it. If the file cannot be loaded, an error is
// logged and the previously applied configuration stays in effect.
func (p *prometheus) reloadConfig() {
	glog.Infof("Loading configuration file %s", *configFile)

	cfg, err := config.LoadFromFile(*configFile)
	if err != nil {
		glog.Errorf("Couldn't load configuration (-config.file=%s): %v", *configFile, err)
		glog.Errorf("Note: The configuration format has changed with version 0.14, please check the documentation.")
		return
	}

	// Propagate the new configuration to the status page, the scrape
	// target manager, and the rule manager, in that order.
	p.webService.StatusHandler.ApplyConfig(cfg)
	p.targetManager.ApplyConfig(cfg)
	p.ruleManager.ApplyConfig(cfg)
}
// Serve starts the Prometheus server. It returns after the server has been shut
// down. The method installs an interrupt handler, allowing a shutdown to be
// triggered by sending SIGTERM to the process.
@ -252,15 +253,25 @@ func (p *prometheus) Serve() {
}
}()
notifier := make(chan os.Signal)
signal.Notify(notifier, os.Interrupt, syscall.SIGTERM)
hup := make(chan os.Signal)
signal.Notify(hup, syscall.SIGHUP)
go func() {
for range hup {
p.reloadConfig()
}
}()
term := make(chan os.Signal)
signal.Notify(term, os.Interrupt, syscall.SIGTERM)
select {
case <-notifier:
case <-term:
glog.Warning("Received SIGTERM, exiting gracefully...")
case <-p.webService.QuitChan:
glog.Warning("Received termination request via web service, exiting gracefully...")
}
close(hup)
p.targetManager.Stop()
p.ruleManager.Stop()
p.queryEngine.Stop()

View file

@ -285,6 +285,7 @@ func (t *target) RunScraper(sampleAppender storage.SampleAppender) {
// On changed scrape interval the new interval becomes effective
// after the next scrape.
if lastScrapeInterval != t.scrapeInterval {
ticker.Stop()
ticker = time.NewTicker(t.scrapeInterval)
lastScrapeInterval = t.scrapeInterval
}

View file

@ -62,16 +62,13 @@ type TargetManager struct {
providers map[*config.ScrapeConfig][]TargetProvider
}
// NewTargetManager creates a new TargetManager based on the given config.
func NewTargetManager(cfg *config.Config, sampleAppender storage.SampleAppender) (*TargetManager, error) {
// NewTargetManager creates a new TargetManager.
func NewTargetManager(sampleAppender storage.SampleAppender) *TargetManager {
tm := &TargetManager{
sampleAppender: sampleAppender,
targets: make(map[string][]Target),
}
if err := tm.applyConfig(cfg); err != nil {
return nil, err
}
return tm, nil
return tm
}
// Run starts background processing to handle target updates.
@ -129,19 +126,17 @@ func fullSource(cfg *config.ScrapeConfig, src string) string {
// Stop all background processing.
func (tm *TargetManager) Stop() {
tm.stop(true)
tm.m.Lock()
defer tm.m.Unlock()
if tm.running {
tm.stop(true)
}
}
// stop background processing of the target manager. If removeTargets is true,
// existing targets will be stopped and removed.
func (tm *TargetManager) stop(removeTargets bool) {
tm.m.Lock()
defer tm.m.Unlock()
if !tm.running {
return
}
glog.Info("Stopping target manager...")
defer glog.Info("Target manager stopped.")
@ -273,35 +268,23 @@ func (tm *TargetManager) Pools() map[string][]Target {
// ApplyConfig resets the manager's target providers and job configurations as defined
// by the new cfg. The state of targets that are valid in the new configuration remains unchanged.
func (tm *TargetManager) ApplyConfig(cfg *config.Config) error {
tm.stop(false)
// Even if updating the config failed, we want to continue rather than stop scraping anything.
defer tm.Run()
if err := tm.applyConfig(cfg); err != nil {
glog.Warningf("Error updating config, changes not applied: %s", err)
return err
}
return nil
}
func (tm *TargetManager) applyConfig(cfg *config.Config) error {
// Only apply changes if everything was successful.
providers := map[*config.ScrapeConfig][]TargetProvider{}
for _, scfg := range cfg.ScrapeConfigs {
provs, err := ProvidersFromConfig(scfg)
if err != nil {
return err
}
providers[scfg] = provs
}
func (tm *TargetManager) ApplyConfig(cfg *config.Config) {
tm.m.Lock()
defer tm.m.Unlock()
if tm.running {
tm.stop(false)
// Even if updating the config failed, we want to continue rather than stop scraping anything.
defer tm.Run()
}
providers := map[*config.ScrapeConfig][]TargetProvider{}
for _, scfg := range cfg.ScrapeConfigs {
providers[scfg] = ProvidersFromConfig(scfg)
}
tm.globalLabels = cfg.GlobalConfig.Labels
tm.providers = providers
return nil
}
// targetsFromGroup builds targets based on the given TargetGroup and config.
@ -335,7 +318,7 @@ func (tm *TargetManager) targetsFromGroup(tg *config.TargetGroup, cfg *config.Sc
labels, err := Relabel(labels, cfg.RelabelConfigs...)
if err != nil {
return nil, fmt.Errorf("error while relabelling instance %d in target group %s: %s", i, tg, err)
return nil, fmt.Errorf("error while relabeling instance %d in target group %s: %s", i, tg, err)
}
// Check if the target was dropped.
if labels == nil {
@ -357,7 +340,7 @@ func (tm *TargetManager) targetsFromGroup(tg *config.TargetGroup, cfg *config.Sc
}
// ProvidersFromConfig returns all TargetProviders configured in cfg.
func ProvidersFromConfig(cfg *config.ScrapeConfig) ([]TargetProvider, error) {
func ProvidersFromConfig(cfg *config.ScrapeConfig) []TargetProvider {
var providers []TargetProvider
for _, dnscfg := range cfg.DNSSDConfigs {
@ -367,7 +350,7 @@ func ProvidersFromConfig(cfg *config.ScrapeConfig) ([]TargetProvider, error) {
if len(cfg.TargetGroups) > 0 {
providers = append(providers, NewStaticProvider(cfg.TargetGroups))
}
return providers, nil
return providers
}
// StaticProvider holds a list of target groups that never change.

View file

@ -277,19 +277,15 @@ func TestTargetManagerConfigUpdate(t *testing.T) {
}
conf := &config.Config{DefaultedConfig: config.DefaultConfig}
targetManager, err := NewTargetManager(conf, nopAppender{})
if err != nil {
t.Fatal(err)
}
targetManager := NewTargetManager(nopAppender{})
targetManager.ApplyConfig(conf)
targetManager.Run()
defer targetManager.Stop()
for i, step := range sequence {
conf.ScrapeConfigs = step.scrapeConfigs
err := targetManager.ApplyConfig(conf)
if err != nil {
t.Fatal(err)
}
targetManager.ApplyConfig(conf)
<-time.After(1 * time.Millisecond)

View file

@ -24,6 +24,7 @@ import (
clientmodel "github.com/prometheus/client_golang/model"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/notification"
"github.com/prometheus/prometheus/promql"
"github.com/prometheus/prometheus/storage"
@ -120,7 +121,11 @@ func NewManager(o *ManagerOptions) *Manager {
func (m *Manager) Run() {
defer glog.Info("Rule manager stopped.")
ticker := time.NewTicker(m.interval)
m.Lock()
lastInterval := m.interval
m.Unlock()
ticker := time.NewTicker(lastInterval)
defer ticker.Stop()
for {
@ -137,6 +142,14 @@ func (m *Manager) Run() {
start := time.Now()
m.runIteration()
iterationDuration.Observe(float64(time.Since(start) / time.Millisecond))
m.Lock()
if lastInterval != m.interval {
ticker.Stop()
ticker = time.NewTicker(m.interval)
lastInterval = m.interval
}
m.Unlock()
case <-m.done:
return
}
@ -255,11 +268,27 @@ func (m *Manager) runIteration() {
wg.Wait()
}
// LoadRuleFiles loads alerting and recording rules from the given files.
func (m *Manager) LoadRuleFiles(filenames ...string) error {
// ApplyConfig updates the rule manager's state as the config requires. If
// loading the new rules failed the old rule set is restored.
func (m *Manager) ApplyConfig(conf *config.Config) {
	m.Lock()
	defer m.Unlock()

	m.interval = time.Duration(conf.GlobalConfig.EvaluationInterval)

	// Snapshot the active rules so they can be restored should the new
	// rule files fail to load.
	prevRules := make([]Rule, len(m.rules))
	copy(prevRules, m.rules)
	m.rules = m.rules[:0]

	if err := m.loadRuleFiles(conf.RuleFiles...); err == nil {
		return
	} else {
		// Loading failed; fall back to the previous rule set.
		m.rules = prevRules
		glog.Errorf("Error loading rules, previous rule set restored: %s", err)
	}
}
// loadRuleFiles loads alerting and recording rules from the given files.
func (m *Manager) loadRuleFiles(filenames ...string) error {
for _, fn := range filenames {
content, err := ioutil.ReadFile(fn)
if err != nil {

View file

@ -18,6 +18,7 @@ import (
"sync"
"time"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/retrieval"
"github.com/prometheus/prometheus/rules"
)
@ -47,5 +48,14 @@ func (h *PrometheusStatusHandler) TargetStateToClass() map[retrieval.TargetState
}
// ServeHTTP renders the status page. The read lock protects the handler's
// mutable state (notably Config, which ApplyConfig rewrites) while the
// template executes.
func (h *PrometheusStatusHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	h.mu.RLock()
	defer h.mu.RUnlock()

	executeTemplate(w, "status", h, h.PathPrefix)
}
// ApplyConfig updates the status handler's state as the new config requires.
// The rendered configuration string is swapped under the write lock so that
// concurrent ServeHTTP calls always observe a consistent value.
func (h *PrometheusStatusHandler) ApplyConfig(conf *config.Config) {
	h.mu.Lock()
	defer h.mu.Unlock()

	h.Config = conf.String()
}