mirror of
https://github.com/prometheus/prometheus.git
synced 2025-01-11 22:07:27 -08:00
SD: Add target creation failure counter and change failure handling (#8786)
* Added metric and changed failure/drop strategy
Signed-off-by: Levi Harrison <git@leviharrison.dev>
This commit is contained in:
parent
ae086c73cb
commit
2826fbeeb7
|
@ -275,6 +275,20 @@
|
||||||
description: 'Prometheus %(prometheusName)s has dropped {{ printf "%%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.' % $._config,
|
description: 'Prometheus %(prometheusName)s has dropped {{ printf "%%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.' % $._config,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
alert: 'PrometheusTargetSyncFailure',
|
||||||
|
expr: |||
|
||||||
|
increase(prometheus_target_sync_failed_total{%(prometheusSelector)s}[30m]) > 0
|
||||||
|
||| % $._config,
|
||||||
|
'for': '5m',
|
||||||
|
labels: {
|
||||||
|
severity: 'critical',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'Prometheus has failed to sync targets.',
|
||||||
|
description: '{{ printf "%%.0f" $value }} targets in Prometheus %(prometheusName)s have failed to sync because invalid configuration was supplied.' % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
] + if $._config.prometheusHAGroupLabels == '' then self.rulesWithoutHA else self.rulesWithHA,
|
] + if $._config.prometheusHAGroupLabels == '' then self.rulesWithoutHA else self.rulesWithHA,
|
||||||
rulesWithoutHA:: [
|
rulesWithoutHA:: [
|
||||||
{
|
{
|
||||||
|
|
|
@ -176,6 +176,13 @@ var (
|
||||||
Help: "Total number of times scrape pools hit the label limits, during sync or config reload.",
|
Help: "Total number of times scrape pools hit the label limits, during sync or config reload.",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
targetSyncFailed = prometheus.NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Name: "prometheus_target_sync_failed_total",
|
||||||
|
Help: "Total number of target sync failures.",
|
||||||
|
},
|
||||||
|
[]string{"scrape_job"},
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
|
@ -199,6 +206,7 @@ func init() {
|
||||||
targetMetadataCache,
|
targetMetadataCache,
|
||||||
targetScrapeExemplarOutOfOrder,
|
targetScrapeExemplarOutOfOrder,
|
||||||
targetScrapePoolExceededLabelLimits,
|
targetScrapePoolExceededLabelLimits,
|
||||||
|
targetSyncFailed,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -346,6 +354,7 @@ func (sp *scrapePool) stop() {
|
||||||
targetScrapePoolTargetLimit.DeleteLabelValues(sp.config.JobName)
|
targetScrapePoolTargetLimit.DeleteLabelValues(sp.config.JobName)
|
||||||
targetScrapePoolTargetsAdded.DeleteLabelValues(sp.config.JobName)
|
targetScrapePoolTargetsAdded.DeleteLabelValues(sp.config.JobName)
|
||||||
targetSyncIntervalLength.DeleteLabelValues(sp.config.JobName)
|
targetSyncIntervalLength.DeleteLabelValues(sp.config.JobName)
|
||||||
|
targetSyncFailed.DeleteLabelValues(sp.config.JobName)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -445,11 +454,11 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) {
|
||||||
var all []*Target
|
var all []*Target
|
||||||
sp.droppedTargets = []*Target{}
|
sp.droppedTargets = []*Target{}
|
||||||
for _, tg := range tgs {
|
for _, tg := range tgs {
|
||||||
targets, err := targetsFromGroup(tg, sp.config)
|
targets, failures := targetsFromGroup(tg, sp.config)
|
||||||
if err != nil {
|
for _, err := range failures {
|
||||||
level.Error(sp.logger).Log("msg", "creating targets failed", "err", err)
|
level.Error(sp.logger).Log("msg", "Creating target failed", "err", err)
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
|
targetSyncFailed.WithLabelValues(sp.config.JobName).Add(float64(len(failures)))
|
||||||
for _, t := range targets {
|
for _, t := range targets {
|
||||||
if t.Labels().Len() > 0 {
|
if t.Labels().Len() > 0 {
|
||||||
all = append(all, t)
|
all = append(all, t)
|
||||||
|
|
|
@ -414,8 +414,9 @@ func populateLabels(lset labels.Labels, cfg *config.ScrapeConfig) (res, orig lab
|
||||||
}
|
}
|
||||||
|
|
||||||
// targetsFromGroup builds targets based on the given TargetGroup and config.
|
// targetsFromGroup builds targets based on the given TargetGroup and config.
|
||||||
func targetsFromGroup(tg *targetgroup.Group, cfg *config.ScrapeConfig) ([]*Target, error) {
|
func targetsFromGroup(tg *targetgroup.Group, cfg *config.ScrapeConfig) ([]*Target, []error) {
|
||||||
targets := make([]*Target, 0, len(tg.Targets))
|
targets := make([]*Target, 0, len(tg.Targets))
|
||||||
|
failures := []error{}
|
||||||
|
|
||||||
for i, tlset := range tg.Targets {
|
for i, tlset := range tg.Targets {
|
||||||
lbls := make([]labels.Label, 0, len(tlset)+len(tg.Labels))
|
lbls := make([]labels.Label, 0, len(tlset)+len(tg.Labels))
|
||||||
|
@ -433,11 +434,11 @@ func targetsFromGroup(tg *targetgroup.Group, cfg *config.ScrapeConfig) ([]*Targe
|
||||||
|
|
||||||
lbls, origLabels, err := populateLabels(lset, cfg)
|
lbls, origLabels, err := populateLabels(lset, cfg)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, errors.Wrapf(err, "instance %d in group %s", i, tg)
|
failures = append(failures, errors.Wrapf(err, "instance %d in group %s", i, tg))
|
||||||
}
|
}
|
||||||
if lbls != nil || origLabels != nil {
|
if lbls != nil || origLabels != nil {
|
||||||
targets = append(targets, NewTarget(lbls, origLabels, cfg.Params))
|
targets = append(targets, NewTarget(lbls, origLabels, cfg.Params))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return targets, nil
|
return targets, failures
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,6 +29,8 @@ import (
|
||||||
"github.com/prometheus/common/model"
|
"github.com/prometheus/common/model"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
|
"github.com/prometheus/prometheus/config"
|
||||||
|
"github.com/prometheus/prometheus/discovery/targetgroup"
|
||||||
"github.com/prometheus/prometheus/pkg/labels"
|
"github.com/prometheus/prometheus/pkg/labels"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -365,3 +367,18 @@ func TestNewClientWithBadTLSConfig(t *testing.T) {
|
||||||
t.Fatalf("Expected error, got nil.")
|
t.Fatalf("Expected error, got nil.")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestTargetsFromGroup(t *testing.T) {
|
||||||
|
expectedError := "instance 0 in group : no address"
|
||||||
|
|
||||||
|
targets, failures := targetsFromGroup(&targetgroup.Group{Targets: []model.LabelSet{{}, {model.AddressLabel: "localhost:9090"}}}, &config.ScrapeConfig{})
|
||||||
|
if len(targets) != 1 {
|
||||||
|
t.Fatalf("Expected 1 target, got %v", len(targets))
|
||||||
|
}
|
||||||
|
if len(failures) != 1 {
|
||||||
|
t.Fatalf("Expected 1 failure, got %v", len(failures))
|
||||||
|
}
|
||||||
|
if failures[0].Error() != expectedError {
|
||||||
|
t.Fatalf("Expected error %s, got %s", expectedError, failures[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue