diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index d4face577..898a39c60 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -275,6 +275,20 @@ description: 'Prometheus %(prometheusName)s has dropped {{ printf "%%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.' % $._config, }, }, + { + alert: 'PrometheusTargetSyncFailure', + expr: ||| + increase(prometheus_target_sync_failed_total{%(prometheusSelector)s}[30m]) > 0 + ||| % $._config, + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'Prometheus has failed to sync targets.', + description: '{{ printf "%%.0f" $value }} targets in Prometheus %(prometheusName)s have failed to sync because invalid configuration was supplied.' % $._config, + }, + }, ] + if $._config.prometheusHAGroupLabels == '' then self.rulesWithoutHA else self.rulesWithHA, rulesWithoutHA:: [ { diff --git a/scrape/scrape.go b/scrape/scrape.go index 20600a1e0..63295e2a2 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -176,6 +176,13 @@ var ( Help: "Total number of times scrape pools hit the label limits, during sync or config reload.", }, ) + targetSyncFailed = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "prometheus_target_sync_failed_total", + Help: "Total number of target sync failures.", + }, + []string{"scrape_job"}, + ) ) func init() { @@ -199,6 +206,7 @@ func init() { targetMetadataCache, targetScrapeExemplarOutOfOrder, targetScrapePoolExceededLabelLimits, + targetSyncFailed, ) } @@ -346,6 +354,7 @@ func (sp *scrapePool) stop() { targetScrapePoolTargetLimit.DeleteLabelValues(sp.config.JobName) targetScrapePoolTargetsAdded.DeleteLabelValues(sp.config.JobName) targetSyncIntervalLength.DeleteLabelValues(sp.config.JobName) + targetSyncFailed.DeleteLabelValues(sp.config.JobName) } } @@ -445,11 +454,11 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) { var all []*Target sp.droppedTargets = []*Target{} for _, tg := range tgs { - targets, err := targetsFromGroup(tg, sp.config) - if err != nil { - level.Error(sp.logger).Log("msg", "creating targets failed", "err", err) - continue + targets, failures := targetsFromGroup(tg, sp.config) + for _, err := range failures { + level.Error(sp.logger).Log("msg", "Creating target failed", "err", err) } + targetSyncFailed.WithLabelValues(sp.config.JobName).Add(float64(len(failures))) for _, t := range targets { if t.Labels().Len() > 0 { all = append(all, t) diff --git a/scrape/target.go b/scrape/target.go index f3dd2d0c0..4a7b6eb0f 100644 --- a/scrape/target.go +++ b/scrape/target.go @@ -414,8 +414,9 @@ func populateLabels(lset labels.Labels, cfg *config.ScrapeConfig) (res, orig lab } // targetsFromGroup builds targets based on the given TargetGroup and config. -func targetsFromGroup(tg *targetgroup.Group, cfg *config.ScrapeConfig) ([]*Target, error) { +func targetsFromGroup(tg *targetgroup.Group, cfg *config.ScrapeConfig) ([]*Target, []error) { targets := make([]*Target, 0, len(tg.Targets)) + failures := []error{} for i, tlset := range tg.Targets { lbls := make([]labels.Label, 0, len(tlset)+len(tg.Labels)) @@ -433,11 +434,11 @@ func targetsFromGroup(tg *targetgroup.Group, cfg *config.ScrapeConfig) ([]*Targe lbls, origLabels, err := populateLabels(lset, cfg) if err != nil { - return nil, errors.Wrapf(err, "instance %d in group %s", i, tg) + failures = append(failures, errors.Wrapf(err, "instance %d in group %s", i, tg)) } if lbls != nil || origLabels != nil { targets = append(targets, NewTarget(lbls, origLabels, cfg.Params)) } } - return targets, nil + return targets, failures } diff --git a/scrape/target_test.go b/scrape/target_test.go index cc6a02291..6a7a77fec 100644 --- a/scrape/target_test.go +++ b/scrape/target_test.go @@ -29,6 +29,8 @@ import ( "github.com/prometheus/common/model" "github.com/stretchr/testify/require" + "github.com/prometheus/prometheus/config" + "github.com/prometheus/prometheus/discovery/targetgroup" "github.com/prometheus/prometheus/pkg/labels" ) @@ -365,3 +367,18 @@ func TestNewClientWithBadTLSConfig(t *testing.T) { t.Fatalf("Expected error, got nil.") } } + +func TestTargetsFromGroup(t *testing.T) { + expectedError := "instance 0 in group : no address" + + targets, failures := targetsFromGroup(&targetgroup.Group{Targets: []model.LabelSet{{}, {model.AddressLabel: "localhost:9090"}}}, &config.ScrapeConfig{}) + if len(targets) != 1 { + t.Fatalf("Expected 1 target, got %v", len(targets)) + } + if len(failures) != 1 { + t.Fatalf("Expected 1 failure, got %v", len(failures)) + } + if failures[0].Error() != expectedError { + t.Fatalf("Expected error %s, got %s", expectedError, failures[0]) + } +}