diff --git a/discovery/manager.go b/discovery/manager.go index 4625e42a3..5457bd9b2 100644 --- a/discovery/manager.go +++ b/discovery/manager.go @@ -41,10 +41,10 @@ import ( ) var ( - failedConfigs = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: "prometheus_sd_configs_failed_total", - Help: "Total number of service discovery configurations that failed to load.", + failedConfigs = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "prometheus_sd_failed_configs", + Help: "Current number of service discovery configurations that failed to load.", }, []string{"name"}, ) @@ -194,10 +194,14 @@ func (m *Manager) ApplyConfig(cfg map[string]sd_config.ServiceDiscoveryConfig) e m.targets = make(map[poolKey]map[string]*targetgroup.Group) m.providers = nil m.discoverCancel = nil + + failedCount := 0 for name, scfg := range cfg { - m.registerProviders(scfg, name) + failedCount += m.registerProviders(scfg, name) discoveredTargets.WithLabelValues(m.name, name).Set(0) } + failedConfigs.WithLabelValues(m.name).Set(float64(failedCount)) + for _, prov := range m.providers { m.startProvider(m.ctx, prov) } @@ -317,8 +321,12 @@ func (m *Manager) allGroups() map[string][]*targetgroup.Group { return tSets } -func (m *Manager) registerProviders(cfg sd_config.ServiceDiscoveryConfig, setName string) { - var added bool +// registerProviders returns the number of failed SD configs. 
+func (m *Manager) registerProviders(cfg sd_config.ServiceDiscoveryConfig, setName string) int { + var ( + failedCount int + added bool + ) add := func(cfg interface{}, newDiscoverer func() (Discoverer, error)) { t := reflect.TypeOf(cfg).String() for _, p := range m.providers { @@ -332,7 +340,7 @@ func (m *Manager) registerProviders(cfg sd_config.ServiceDiscoveryConfig, setNam d, err := newDiscoverer() if err != nil { level.Error(m.logger).Log("msg", "Cannot create service discovery", "err", err, "type", t) - failedConfigs.WithLabelValues(m.name).Inc() + failedCount++ return } @@ -421,6 +429,7 @@ func (m *Manager) registerProviders(cfg sd_config.ServiceDiscoveryConfig, setNam return &StaticProvider{TargetGroups: []*targetgroup.Group{{}}}, nil }) } + return failedCount } // StaticProvider holds a list of target groups that never change. diff --git a/discovery/manager_test.go b/discovery/manager_test.go index b2bff1fc4..9e5b229fb 100644 --- a/discovery/manager_test.go +++ b/discovery/manager_test.go @@ -25,6 +25,8 @@ import ( "time" "github.com/go-kit/kit/log" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/model" "github.com/prometheus/prometheus/config" sd_config "github.com/prometheus/prometheus/discovery/config" @@ -949,6 +951,91 @@ scrape_configs: } } +func TestGaugeFailedConfigs(t *testing.T) { + var ( + fcGauge prometheus.Gauge + err error + ) + + cfgOneText := ` +scrape_configs: +- job_name: prometheus + consul_sd_configs: + - server: "foo:8500" + tls_config: + cert_file: "/tmp/non_existent" + - server: "bar:8500" + tls_config: + cert_file: "/tmp/non_existent" + - server: "foo2:8500" + tls_config: + cert_file: "/tmp/non_existent" +` + cfgOne := &config.Config{} + + err = yaml.UnmarshalStrict([]byte(cfgOneText), cfgOne) + if err != nil { + t.Fatalf("Unable to load YAML config cfgOne: %s", err) + } + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + 
discoveryManager := NewManager(ctx, log.NewNopLogger()) + discoveryManager.updatert = 100 * time.Millisecond + go discoveryManager.Run() + + c := make(map[string]sd_config.ServiceDiscoveryConfig) + for _, v := range cfgOne.ScrapeConfigs { + c[v.JobName] = v.ServiceDiscoveryConfig + } + + discoveryManager.ApplyConfig(c) + <-discoveryManager.SyncCh() + + metricOne := &dto.Metric{} + fcGauge, err = failedConfigs.GetMetricWithLabelValues(discoveryManager.name) + if err != nil { + t.Fatal(err) + } + + fcGauge.Write(metricOne) + + failedCount := metricOne.GetGauge().GetValue() + if failedCount != 3 { + t.Fatalf("Expected to have 3 failed configs, got: %v", failedCount) + } + + cfgTwoText := ` +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ["foo:9090"] +` + cfgTwo := &config.Config{} + if err := yaml.UnmarshalStrict([]byte(cfgTwoText), cfgTwo); err != nil { + t.Fatalf("Unable to load YAML config cfgTwo: %s", err) + } + c = make(map[string]sd_config.ServiceDiscoveryConfig) + for _, v := range cfgTwo.ScrapeConfigs { + c[v.JobName] = v.ServiceDiscoveryConfig + } + + discoveryManager.ApplyConfig(c) + <-discoveryManager.SyncCh() + + metricTwo := &dto.Metric{} + fcGauge, err = failedConfigs.GetMetricWithLabelValues(discoveryManager.name) + if err != nil { + t.Fatal(err) + } + fcGauge.Write(metricTwo) + + failedCount = metricTwo.GetGauge().GetValue() + if failedCount != 0 { + t.Fatalf("Expected to get no failed config, got: %v", failedCount) + } + +} + func TestCoordinationWithReceiver(t *testing.T) { updateDelay := 100 * time.Millisecond