From b50f9c1c8486b403c0299e84498eeec73f475e1a Mon Sep 17 00:00:00 2001 From: Damien Grisonnet Date: Thu, 6 May 2021 10:56:21 +0200 Subject: [PATCH] Add label scrape limits (#8777) * scrape: add label limits per scrape Add three new limits to the scrape configuration to provide some mechanism to defend against an unbounded number of labels and excessive label lengths. If any of these limits are broken by a sample from a scrape, the whole scrape will fail. For all of these configuration options, a zero value means no limit. The `label_limit` configuration will provide a mechanism to bound the number of labels per-scrape of a certain sample to a user defined limit. This limit will be tested against the sample labels plus the discovery labels, but it will exclude the __name__ from the count since it is a mandatory Prometheus label to which applying constraints isn't meaningful. The `label_name_length_limit` and `label_value_length_limit` will prevent having labels of excessive lengths. These limits also skip the __name__ label for the same reasons as the `label_limit` option and will also make the scrape fail if any sample has a label name/value length that exceeds the predefined limits. Signed-off-by: Damien Grisonnet * scrape: add metrics and alert to label limits Add three gauges, one for each label limit, to easily access the limit set by a certain scrape target. Also add a counter to count the number of targets that exceeded the label limits and thus were dropped. This is useful for the `PrometheusLabelLimitHit` alert that will notify the users that scraping some targets failed because they had samples exceeding the label limits defined in the scrape configuration. Signed-off-by: Damien Grisonnet * scrape: apply label limits to __name__ label Apply limits to the __name__ label that was previously skipped and truncate the label names and values in the error messages as they can be very very long. 
Signed-off-by: Damien Grisonnet * scrape: remove label limits gauges and refactor Remove `prometheus_target_scrape_pool_label_limit`, `prometheus_target_scrape_pool_label_name_length_limit`, and `prometheus_target_scrape_pool_label_value_length_limit` as they are not really useful since we don't have the information on the labels in it. Signed-off-by: Damien Grisonnet --- config/config.go | 12 +- docs/configuration/configuration.md | 15 ++ .../prometheus-mixin/alerts.libsonnet | 14 ++ scrape/scrape.go | 98 +++++++++++-- scrape/scrape_test.go | 133 +++++++++++++++++- 5 files changed, 257 insertions(+), 15 deletions(-) diff --git a/config/config.go b/config/config.go index 56350e1be..e4045eb05 100644 --- a/config/config.go +++ b/config/config.go @@ -382,11 +382,21 @@ type ScrapeConfig struct { MetricsPath string `yaml:"metrics_path,omitempty"` // The URL scheme with which to fetch metrics from targets. Scheme string `yaml:"scheme,omitempty"` - // More than this many samples post metric-relabeling will cause the scrape to fail. + // More than this many samples post metric-relabeling will cause the scrape to + // fail. SampleLimit uint `yaml:"sample_limit,omitempty"` // More than this many targets after the target relabeling will cause the // scrapes to fail. TargetLimit uint `yaml:"target_limit,omitempty"` + // More than this many labels post metric-relabeling will cause the scrape to + // fail. + LabelLimit uint `yaml:"label_limit,omitempty"` + // More than this label name length post metric-relabeling will cause the + // scrape to fail. + LabelNameLengthLimit uint `yaml:"label_name_length_limit,omitempty"` + // More than this label value length post metric-relabeling will cause the + // scrape to fail. + LabelValueLengthLimit uint `yaml:"label_value_length_limit,omitempty"` // We cannot do proper Go type embedding below as the parser will then parse // values arbitrarily into the overflow maps of further-down types. 
diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md index 72135a270..ac45fa82f 100644 --- a/docs/configuration/configuration.md +++ b/docs/configuration/configuration.md @@ -288,6 +288,21 @@ metric_relabel_configs: # the entire scrape will be treated as failed. 0 means no limit. [ sample_limit: | default = 0 ] +# Per-scrape limit on number of labels that will be accepted for a sample. If +# more than this number of labels are present post metric-relabeling, the +# entire scrape will be treated as failed. 0 means no limit. +[ label_limit: | default = 0 ] + +# Per-scrape limit on length of labels name that will be accepted for a sample. +# If a label name is longer than this number post metric-relabeling, the entire +# scrape will be treated as failed. 0 means no limit. +[ label_name_length_limit: | default = 0 ] + +# Per-scrape limit on length of labels value that will be accepted for a sample. +# If a label value is longer than this number post metric-relabeling, the +# entire scrape will be treated as failed. 0 means no limit. +[ label_value_length_limit: | default = 0 ] + # Per-scrape config limit on number of unique targets that will be # accepted. If more than this number of targets are present after target # relabeling, Prometheus will mark the targets as failed without scraping them. diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index f555a4cdf..d4face577 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -261,6 +261,20 @@ description: 'Prometheus %(prometheusName)s has dropped {{ printf "%%.0f" $value }} targets because the number of targets exceeded the configured target_limit.' 
% $._config, }, }, + { + alert: 'PrometheusLabelLimitHit', + expr: ||| + increase(prometheus_target_scrape_pool_exceeded_label_limits_total{%(prometheusSelector)s}[5m]) > 0 + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Prometheus has dropped targets because some scrape configs have exceeded the labels limit.', + description: 'Prometheus %(prometheusName)s has dropped {{ printf "%%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.' % $._config, + }, + }, ] + if $._config.prometheusHAGroupLabels == '' then self.rulesWithoutHA else self.rulesWithHA, rulesWithoutHA:: [ { diff --git a/scrape/scrape.go b/scrape/scrape.go index f7888f3ee..0985d2f46 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -170,6 +170,12 @@ var ( Help: "Total number of exemplar rejected due to not being out of the expected order.", }, ) + targetScrapePoolExceededLabelLimits = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_target_scrape_pool_exceeded_label_limits_total", + Help: "Total number of times scrape pools hit the label limits, during sync or config reload.", + }, + ) ) func init() { @@ -192,6 +198,7 @@ func init() { targetScrapeCacheFlushForced, targetMetadataCache, targetScrapeExemplarOutOfOrder, + targetScrapePoolExceededLabelLimits, ) } @@ -218,10 +225,17 @@ type scrapePool struct { newLoop func(scrapeLoopOptions) loop } +type labelLimits struct { + labelLimit int + labelNameLengthLimit int + labelValueLengthLimit int +} + type scrapeLoopOptions struct { target *Target scraper scraper - limit int + sampleLimit int + labelLimits *labelLimits honorLabels bool honorTimestamps bool mrc []*relabel.Config @@ -273,10 +287,11 @@ func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, jitterSeed return mutateSampleLabels(l, opts.target, opts.honorLabels, opts.mrc) }, func(l labels.Labels) labels.Labels { return 
mutateReportSampleLabels(l, opts.target) }, - func(ctx context.Context) storage.Appender { return appender(app.Appender(ctx), opts.limit) }, + func(ctx context.Context) storage.Appender { return appender(app.Appender(ctx), opts.sampleLimit) }, cache, jitterSeed, opts.honorTimestamps, + opts.labelLimits, ) } @@ -357,10 +372,15 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error { targetScrapePoolTargetLimit.WithLabelValues(sp.config.JobName).Set(float64(sp.config.TargetLimit)) var ( - wg sync.WaitGroup - interval = time.Duration(sp.config.ScrapeInterval) - timeout = time.Duration(sp.config.ScrapeTimeout) - limit = int(sp.config.SampleLimit) + wg sync.WaitGroup + interval = time.Duration(sp.config.ScrapeInterval) + timeout = time.Duration(sp.config.ScrapeTimeout) + sampleLimit = int(sp.config.SampleLimit) + labelLimits = &labelLimits{ + labelLimit: int(sp.config.LabelLimit), + labelNameLengthLimit: int(sp.config.LabelNameLengthLimit), + labelValueLengthLimit: int(sp.config.LabelValueLengthLimit), + } honorLabels = sp.config.HonorLabels honorTimestamps = sp.config.HonorTimestamps mrc = sp.config.MetricRelabelConfigs @@ -383,7 +403,8 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error { newLoop = sp.newLoop(scrapeLoopOptions{ target: t, scraper: s, - limit: limit, + sampleLimit: sampleLimit, + labelLimits: labelLimits, honorLabels: honorLabels, honorTimestamps: honorTimestamps, mrc: mrc, @@ -451,10 +472,15 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) { // It returns after all stopped scrape loops terminated. 
func (sp *scrapePool) sync(targets []*Target) { var ( - uniqueLoops = make(map[uint64]loop) - interval = time.Duration(sp.config.ScrapeInterval) - timeout = time.Duration(sp.config.ScrapeTimeout) - limit = int(sp.config.SampleLimit) + uniqueLoops = make(map[uint64]loop) + interval = time.Duration(sp.config.ScrapeInterval) + timeout = time.Duration(sp.config.ScrapeTimeout) + sampleLimit = int(sp.config.SampleLimit) + labelLimits = &labelLimits{ + labelLimit: int(sp.config.LabelLimit), + labelNameLengthLimit: int(sp.config.LabelNameLengthLimit), + labelValueLengthLimit: int(sp.config.LabelValueLengthLimit), + } honorLabels = sp.config.HonorLabels honorTimestamps = sp.config.HonorTimestamps mrc = sp.config.MetricRelabelConfigs @@ -469,7 +495,8 @@ func (sp *scrapePool) sync(targets []*Target) { l := sp.newLoop(scrapeLoopOptions{ target: t, scraper: s, - limit: limit, + sampleLimit: sampleLimit, + labelLimits: labelLimits, honorLabels: honorLabels, honorTimestamps: honorTimestamps, mrc: mrc, @@ -544,6 +571,41 @@ func (sp *scrapePool) refreshTargetLimitErr() error { return err } +func verifyLabelLimits(lset labels.Labels, limits *labelLimits) error { + if limits == nil { + return nil + } + + met := lset.Get(labels.MetricName) + if limits.labelLimit > 0 { + nbLabels := len(lset) + if nbLabels > int(limits.labelLimit) { + return fmt.Errorf("label_limit exceeded (metric: %.50s, number of label: %d, limit: %d)", met, nbLabels, limits.labelLimit) + } + } + + if limits.labelNameLengthLimit == 0 && limits.labelValueLengthLimit == 0 { + return nil + } + + for _, l := range lset { + if limits.labelNameLengthLimit > 0 { + nameLength := len(l.Name) + if nameLength > int(limits.labelNameLengthLimit) { + return fmt.Errorf("label_name_length_limit exceeded (metric: %.50s, label: %.50v, name length: %d, limit: %d)", met, l, nameLength, limits.labelNameLengthLimit) + } + } + + if limits.labelValueLengthLimit > 0 { + valueLength := len(l.Value) + if valueLength > 
int(limits.labelValueLengthLimit) { + return fmt.Errorf("label_value_length_limit exceeded (metric: %.50s, label: %.50v, value length: %d, limit: %d)", met, l, valueLength, limits.labelValueLengthLimit) + } + } + } + return nil +} + func mutateSampleLabels(lset labels.Labels, target *Target, honor bool, rc []*relabel.Config) labels.Labels { lb := labels.NewBuilder(lset) @@ -707,6 +769,7 @@ type scrapeLoop struct { honorTimestamps bool forcedErr error forcedErrMtx sync.Mutex + labelLimits *labelLimits appender func(ctx context.Context) storage.Appender sampleMutator labelsMutator @@ -974,6 +1037,7 @@ func newScrapeLoop(ctx context.Context, cache *scrapeCache, jitterSeed uint64, honorTimestamps bool, + labelLimits *labelLimits, ) *scrapeLoop { if l == nil { l = log.NewNopLogger() @@ -996,6 +1060,7 @@ func newScrapeLoop(ctx context.Context, l: l, parentCtx: ctx, honorTimestamps: honorTimestamps, + labelLimits: labelLimits, } sl.ctx, sl.cancel = context.WithCancel(ctx) @@ -1346,6 +1411,12 @@ loop: err = errNameLabelMandatory break loop } + + // If any label limits is exceeded the scrape should fail. 
+ if err = verifyLabelLimits(lset, sl.labelLimits); err != nil { + targetScrapePoolExceededLabelLimits.Inc() + break loop + } } ref, err = app.Append(ref, lset, t, v) @@ -1577,6 +1648,9 @@ func zeroConfig(c *config.ScrapeConfig) *config.ScrapeConfig { z.ScrapeInterval = 0 z.ScrapeTimeout = 0 z.SampleLimit = 0 + z.LabelLimit = 0 + z.LabelNameLengthLimit = 0 + z.LabelValueLengthLimit = 0 z.HTTPClientConfig = config_util.HTTPClientConfig{} return &z } diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go index f21d10157..93877035e 100644 --- a/scrape/scrape_test.go +++ b/scrape/scrape_test.go @@ -464,8 +464,8 @@ func TestScrapePoolAppender(t *testing.T) { require.True(t, ok, "Expected base appender but got %T", tl.Appender) loop = sp.newLoop(scrapeLoopOptions{ - target: &Target{}, - limit: 100, + target: &Target{}, + sampleLimit: 100, }) appl, ok = loop.(*scrapeLoop) require.True(t, ok, "Expected scrapeLoop but got %T", loop) @@ -577,6 +577,7 @@ func TestScrapeLoopStopBeforeRun(t *testing.T) { nopMutator, nil, nil, 0, true, + nil, ) // The scrape pool synchronizes on stopping scrape loops. However, new scrape @@ -641,6 +642,7 @@ func TestScrapeLoopStop(t *testing.T) { nil, 0, true, + nil, ) // Terminate loop after 2 scrapes. @@ -708,6 +710,7 @@ func TestScrapeLoopRun(t *testing.T) { nil, 0, true, + nil, ) // The loop must terminate during the initial offset if the context @@ -755,6 +758,7 @@ func TestScrapeLoopRun(t *testing.T) { nil, 0, true, + nil, ) go func() { @@ -806,6 +810,7 @@ func TestScrapeLoopForcedErr(t *testing.T) { nil, 0, true, + nil, ) forcedErr := fmt.Errorf("forced err") @@ -856,6 +861,7 @@ func TestScrapeLoopMetadata(t *testing.T) { cache, 0, true, + nil, ) defer cancel() @@ -905,6 +911,7 @@ func TestScrapeLoopSeriesAdded(t *testing.T) { nil, 0, true, + nil, ) defer cancel() @@ -943,6 +950,7 @@ func TestScrapeLoopRunCreatesStaleMarkersOnFailedScrape(t *testing.T) { nil, 0, true, + nil, ) // Succeed once, several failures, then stop. 
numScrapes := 0 @@ -997,6 +1005,7 @@ func TestScrapeLoopRunCreatesStaleMarkersOnParseFailure(t *testing.T) { nil, 0, true, + nil, ) // Succeed once, several failures, then stop. @@ -1055,6 +1064,7 @@ func TestScrapeLoopCache(t *testing.T) { nil, 0, true, + nil, ) numScrapes := 0 @@ -1129,6 +1139,7 @@ func TestScrapeLoopCacheMemoryExhaustionProtection(t *testing.T) { nil, 0, true, + nil, ) numScrapes := 0 @@ -1235,6 +1246,7 @@ func TestScrapeLoopAppend(t *testing.T) { nil, 0, true, + nil, ) now := time.Now() @@ -1276,6 +1288,7 @@ func TestScrapeLoopAppendCacheEntryButErrNotFound(t *testing.T) { nil, 0, true, + nil, ) fakeRef := uint64(1) @@ -1325,6 +1338,7 @@ func TestScrapeLoopAppendSampleLimit(t *testing.T) { nil, 0, true, + nil, ) // Get the value of the Counter before performing the append. @@ -1394,6 +1408,7 @@ func TestScrapeLoop_ChangingMetricString(t *testing.T) { nil, 0, true, + nil, ) now := time.Now() @@ -1434,6 +1449,7 @@ func TestScrapeLoopAppendStaleness(t *testing.T) { nil, 0, true, + nil, ) now := time.Now() @@ -1477,6 +1493,7 @@ func TestScrapeLoopAppendNoStalenessIfTimestamp(t *testing.T) { nil, 0, true, + nil, ) now := time.Now() @@ -1578,6 +1595,7 @@ metric_total{n="2"} 2 # {t="2"} 2.0 20000 nil, 0, true, + nil, ) now := time.Now() @@ -1635,6 +1653,7 @@ func TestScrapeLoopAppendExemplarSeries(t *testing.T) { nil, 0, true, + nil, ) now := time.Now() @@ -1679,6 +1698,7 @@ func TestScrapeLoopRunReportsTargetDownOnScrapeError(t *testing.T) { nil, 0, true, + nil, ) scraper.scrapeFunc = func(ctx context.Context, w io.Writer) error { @@ -1707,6 +1727,7 @@ func TestScrapeLoopRunReportsTargetDownOnInvalidUTF8(t *testing.T) { nil, 0, true, + nil, ) scraper.scrapeFunc = func(ctx context.Context, w io.Writer) error { @@ -1748,6 +1769,7 @@ func TestScrapeLoopAppendGracefullyIfAmendOrOutOfOrderOrOutOfBounds(t *testing.T nil, 0, true, + nil, ) now := time.Unix(1, 0) @@ -1785,6 +1807,7 @@ func TestScrapeLoopOutOfBoundsTimeError(t *testing.T) { nil, 0, true, + 
nil, ) now := time.Now().Add(20 * time.Minute) @@ -1972,6 +1995,7 @@ func TestScrapeLoop_RespectTimestamps(t *testing.T) { func(ctx context.Context) storage.Appender { return capp }, nil, 0, true, + nil, ) now := time.Now() @@ -2005,6 +2029,7 @@ func TestScrapeLoop_DiscardTimestamps(t *testing.T) { func(ctx context.Context) storage.Appender { return capp }, nil, 0, false, + nil, ) now := time.Now() @@ -2037,6 +2062,7 @@ func TestScrapeLoopDiscardDuplicateLabels(t *testing.T) { nil, 0, true, + nil, ) defer cancel() @@ -2087,6 +2113,7 @@ func TestScrapeLoopDiscardUnnamedMetrics(t *testing.T) { nil, 0, true, + nil, ) defer cancel() @@ -2304,6 +2331,7 @@ func TestScrapeAddFast(t *testing.T) { nil, 0, true, + nil, ) defer cancel() @@ -2387,6 +2415,7 @@ func TestScrapeReportSingleAppender(t *testing.T) { nil, 0, true, + nil, ) numScrapes := 0 @@ -2430,3 +2459,103 @@ func TestScrapeReportSingleAppender(t *testing.T) { t.Fatalf("Scrape wasn't stopped.") } } + +func TestScrapeLoopLabelLimit(t *testing.T) { + tests := []struct { + title string + scrapeLabels string + discoveryLabels []string + labelLimits labelLimits + expectErr bool + }{ + { + title: "Valid number of labels", + scrapeLabels: `metric{l1="1", l2="2"} 0`, + discoveryLabels: nil, + labelLimits: labelLimits{labelLimit: 5}, + expectErr: false, + }, { + title: "Too many labels", + scrapeLabels: `metric{l1="1", l2="2", l3="3", l4="4", l5="5", l6="6"} 0`, + discoveryLabels: nil, + labelLimits: labelLimits{labelLimit: 5}, + expectErr: true, + }, { + title: "Too many labels including discovery labels", + scrapeLabels: `metric{l1="1", l2="2", l3="3", l4="4"} 0`, + discoveryLabels: []string{"l5", "5", "l6", "6"}, + labelLimits: labelLimits{labelLimit: 5}, + expectErr: true, + }, { + title: "Valid labels name length", + scrapeLabels: `metric{l1="1", l2="2"} 0`, + discoveryLabels: nil, + labelLimits: labelLimits{labelNameLengthLimit: 10}, + expectErr: false, + }, { + title: "Label name too long", + scrapeLabels: 
`metric{label_name_too_long="0"} 0`, + discoveryLabels: nil, + labelLimits: labelLimits{labelNameLengthLimit: 10}, + expectErr: true, + }, { + title: "Discovery label name too long", + scrapeLabels: `metric{l1="1", l2="2"} 0`, + discoveryLabels: []string{"label_name_too_long", "0"}, + labelLimits: labelLimits{labelNameLengthLimit: 10}, + expectErr: true, + }, { + title: "Valid labels value length", + scrapeLabels: `metric{l1="1", l2="2"} 0`, + discoveryLabels: nil, + labelLimits: labelLimits{labelValueLengthLimit: 10}, + expectErr: false, + }, { + title: "Label value too long", + scrapeLabels: `metric{l1="label_value_too_long"} 0`, + discoveryLabels: nil, + labelLimits: labelLimits{labelValueLengthLimit: 10}, + expectErr: true, + }, { + title: "Discovery label value too long", + scrapeLabels: `metric{l1="1", l2="2"} 0`, + discoveryLabels: []string{"l1", "label_value_too_long"}, + labelLimits: labelLimits{labelValueLengthLimit: 10}, + expectErr: true, + }, + } + + for _, test := range tests { + app := &collectResultAppender{} + + discoveryLabels := &Target{ + labels: labels.FromStrings(test.discoveryLabels...), + } + + sl := newScrapeLoop(context.Background(), + nil, nil, nil, + func(l labels.Labels) labels.Labels { + return mutateSampleLabels(l, discoveryLabels, false, nil) + }, + func(l labels.Labels) labels.Labels { + return mutateReportSampleLabels(l, discoveryLabels) + }, + func(ctx context.Context) storage.Appender { return app }, + nil, + 0, + true, + &test.labelLimits, + ) + + slApp := sl.appender(context.Background()) + _, _, _, err := sl.append(slApp, []byte(test.scrapeLabels), "", time.Now()) + + t.Logf("Test:%s", test.title) + if test.expectErr { + require.Error(t, err) + } else { + require.NoError(t, err) + require.NoError(t, slApp.Commit()) + } + } +}