Treat bucket limit like sample limit and make it fail the whole scrape and return an error

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
2025-03-05 20:59:13 -08:00 · 2023-04-22 03:14:19 +08:00 · 2023-04-22 03:14:19 +08:00 · 2ad39baa72
parent 071426f72f
commit 2ad39baa72
6 changed files with 37 additions and 28 deletions
--- a/config/config.go
+++ b/config/config.go
@ -489,8 +489,8 @@ type ScrapeConfig struct {
 	// More than this label value length post metric-relabeling will cause the
 	// scrape to fail.
 	LabelValueLengthLimit uint `yaml:"label_value_length_limit,omitempty"`
-	// More than this many buckets in a native histogram will cause the histogram
-	// to be ignored, but it will not make the whole scrape fail.
+	// More than this many buckets in a native histogram will cause the scrape to
+	// fail.
 	NativeHistogramBucketLimit uint `yaml:"bucket_limit,omitempty"`

 	// We cannot do proper Go type embedding below as the parser will then parse
--- a/docs/configuration/configuration.md
+++ b/docs/configuration/configuration.md
@ -378,8 +378,8 @@ metric_relabel_configs:
 [ target_limit: <int> | default = 0 ]

 # Limit on total number of positive and negative buckets allowed in a native
-# histogram. If this is exceeded, the histogram will be ignored, but this will
-# not make the scrape fail. 0 means no limit.
+# histogram. If this is exceeded, the entire scrape will be treated as failed.
+# 0 means no limit.
 [ sample_limit: <int> | default = 0 ]
 ```

--- a/scrape/scrape.go
+++ b/scrape/scrape.go
@ -194,7 +194,7 @@ var (
 	targetScrapeNativeHistogramBucketLimit = prometheus.NewCounter(
 		prometheus.CounterOpts{
 			Name: "prometheus_target_scrapes_histogram_exceeded_bucket_limit_total",
-			Help: "Total number of native histograms rejected due to exceeding the bucket limit.",
+			Help: "Total number of scrapes that hit the native histograms bucket limit and were rejected.",
 		},
 	)
 )
@ -1485,11 +1485,10 @@ func (sl *scrapeLoop) getCache() *scrapeCache {
 }

 type appendErrors struct {
-	numOutOfOrder           int
-	numDuplicates           int
-	numOutOfBounds          int
-	numExemplarOutOfOrder   int
-	numHistogramBucketLimit int
+	numOutOfOrder         int
+	numDuplicates         int
+	numOutOfBounds        int
+	numExemplarOutOfOrder int
 }

 func (sl *scrapeLoop) append(app storage.Appender, b []byte, contentType string, ts time.Time) (total, added, seriesAdded int, err error) {
@ -1506,6 +1505,7 @@ func (sl *scrapeLoop) append(app storage.Appender, b []byte, contentType string,
 		defTime         = timestamp.FromTime(ts)
 		appErrs         = appendErrors{}
 		sampleLimitErr  error
+		bucketLimitErr  error
 		e               exemplar.Exemplar // escapes to heap so hoisted out of loop
 		meta            metadata.Metadata
 		metadataChanged bool
@ -1655,7 +1655,7 @@ loop:
 		} else {
 			ref, err = app.Append(ref, lset, t, val)
 		}
-		sampleAdded, err = sl.checkAddError(ce, met, parsedTimestamp, err, &sampleLimitErr, &appErrs)
+		sampleAdded, err = sl.checkAddError(ce, met, parsedTimestamp, err, &sampleLimitErr, &bucketLimitErr, &appErrs)
 		if err != nil {
 			if err != storage.ErrNotFound {
 				level.Debug(sl.l).Log("msg", "Unexpected error", "series", string(met), "err", err)
@ -1669,7 +1669,7 @@ loop:
 				sl.cache.trackStaleness(hash, lset)
 			}
 			sl.cache.addRef(met, ref, lset, hash)
-			if sampleAdded && sampleLimitErr == nil {
+			if sampleAdded && sampleLimitErr == nil && bucketLimitErr == nil {
 				seriesAdded++
 			}
 		}
@ -1705,6 +1705,13 @@ loop:
 		// We only want to increment this once per scrape, so this is Inc'd outside the loop.
 		targetScrapeSampleLimit.Inc()
 	}
+	if bucketLimitErr != nil {
+		if err == nil {
+			err = bucketLimitErr // if sample limit is hit, that error takes precedence
+		}
+		// We only want to increment this once per scrape, so this is Inc'd outside the loop.
+		targetScrapeNativeHistogramBucketLimit.Inc()
+	}
 	if appErrs.numOutOfOrder > 0 {
 		level.Warn(sl.l).Log("msg", "Error on ingesting out-of-order samples", "num_dropped", appErrs.numOutOfOrder)
 	}
@ -1717,9 +1724,6 @@ loop:
 	if appErrs.numExemplarOutOfOrder > 0 {
 		level.Warn(sl.l).Log("msg", "Error on ingesting out-of-order exemplars", "num_dropped", appErrs.numExemplarOutOfOrder)
 	}
-	if appErrs.numHistogramBucketLimit > 0 {
-		level.Warn(sl.l).Log("msg", "Error on ingesting native histograms that exceeded bucket limit", "num_dropped", appErrs.numHistogramBucketLimit)
-	}
 	if err == nil {
 		sl.cache.forEachStale(func(lset labels.Labels) bool {
 			// Series no longer exposed, mark it stale.
@ -1737,8 +1741,8 @@ loop:
 }

 // Adds samples to the appender, checking the error, and then returns the # of samples added,
-// whether the caller should continue to process more samples, and any sample limit errors.
-func (sl *scrapeLoop) checkAddError(ce *cacheEntry, met []byte, tp *int64, err error, sampleLimitErr *error, appErrs *appendErrors) (bool, error) {
+// whether the caller should continue to process more samples, and any sample or bucket limit errors.
+func (sl *scrapeLoop) checkAddError(ce *cacheEntry, met []byte, tp *int64, err error, sampleLimitErr, bucketLimitErr *error, appErrs *appendErrors) (bool, error) {
 	switch errors.Cause(err) {
 	case nil:
 		if tp == nil && ce != nil {
@ -1762,16 +1766,16 @@ func (sl *scrapeLoop) checkAddError(ce *cacheEntry, met []byte, tp *int64, err e
 		level.Debug(sl.l).Log("msg", "Out of bounds metric", "series", string(met))
 		targetScrapeSampleOutOfBounds.Inc()
 		return false, nil
-	case storage.ErrHistogramBucketLimit:
-		appErrs.numHistogramBucketLimit++
-		level.Debug(sl.l).Log("msg", "Exceeded bucket limit for native histograms", "series", string(met))
-		targetScrapeNativeHistogramBucketLimit.Inc()
-		return false, nil
 	case errSampleLimit:
 		// Keep on parsing output if we hit the limit, so we report the correct
 		// total number of samples scraped.
 		*sampleLimitErr = err
 		return false, nil
+	case errBucketLimit:
+		// Keep on parsing output if we hit the limit, so we report the correct
+		// total number of samples scraped.
+		*bucketLimitErr = err
+		return false, nil
 	default:
 		return false, err
 	}
--- a/scrape/scrape_test.go
+++ b/scrape/scrape_test.go
@ -1788,7 +1788,10 @@ func TestScrapeLoop_HistogramBucketLimit(t *testing.T) {

 	now = time.Now()
 	total, added, seriesAdded, err = sl.append(app, msg, "application/vnd.google.protobuf", now)
-	require.NoError(t, err)
+	if err != errBucketLimit {
+		t.Fatalf("Did not see expected histogram bucket limit error: %s", err)
+	}
+	require.NoError(t, app.Rollback())
 	require.Equal(t, 1, total)
 	require.Equal(t, 1, added)
 	require.Equal(t, 0, seriesAdded)
@ -3010,7 +3013,7 @@ func TestReuseCacheRace(*testing.T) {
 func TestCheckAddError(t *testing.T) {
 	var appErrs appendErrors
 	sl := scrapeLoop{l: log.NewNopLogger()}
-	sl.checkAddError(nil, nil, nil, storage.ErrOutOfOrderSample, nil, &appErrs)
+	sl.checkAddError(nil, nil, nil, storage.ErrOutOfOrderSample, nil, nil, &appErrs)
 	require.Equal(t, 1, appErrs.numOutOfOrder)
 }

--- a/scrape/target.go
+++ b/scrape/target.go
@ -314,7 +314,10 @@ func (ts Targets) Len() int           { return len(ts) }
 func (ts Targets) Less(i, j int) bool { return ts[i].URL().String() < ts[j].URL().String() }
 func (ts Targets) Swap(i, j int)      { ts[i], ts[j] = ts[j], ts[i] }

-var errSampleLimit = errors.New("sample limit exceeded")
+var (
+	errSampleLimit = errors.New("sample limit exceeded")
+	errBucketLimit = errors.New("histogram bucket limit exceeded")
+)

 // limitAppender limits the number of total appended samples in a batch.
 type limitAppender struct {
@ -366,12 +369,12 @@ type bucketLimitAppender struct {
 func (app *bucketLimitAppender) AppendHistogram(ref storage.SeriesRef, lset labels.Labels, t int64, h *histogram.Histogram, fh *histogram.FloatHistogram) (storage.SeriesRef, error) {
 	if h != nil {
 		if len(h.PositiveBuckets)+len(h.NegativeBuckets) > app.limit {
-			return 0, storage.ErrHistogramBucketLimit
+			return 0, errBucketLimit
 		}
 	}
 	if fh != nil {
 		if len(fh.PositiveBuckets)+len(fh.NegativeBuckets) > app.limit {
-			return 0, storage.ErrHistogramBucketLimit
+			return 0, errBucketLimit
 		}
 	}
 	ref, err := app.Appender.AppendHistogram(ref, lset, t, h, fh)
--- a/storage/interface.go
+++ b/storage/interface.go
@ -46,7 +46,6 @@ var (
 	ErrHistogramNegativeBucketCount  = errors.New("histogram has a bucket whose observation count is negative")
 	ErrHistogramSpanNegativeOffset   = errors.New("histogram has a span whose offset is negative")
 	ErrHistogramSpansBucketsMismatch = errors.New("histogram spans specify different number of buckets than provided")
-	ErrHistogramBucketLimit          = errors.New("histogram bucket limit exceeded")
 )

 // SeriesRef is a generic series reference. In prometheus it is either a