mirror of
https://github.com/prometheus/prometheus.git
synced 2024-11-09 23:24:05 -08:00
Treat bucket limit like sample limit and make it fail the whole scrape and return an error
Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
This commit is contained in:
parent
071426f72f
commit
2ad39baa72
|
@ -489,8 +489,8 @@ type ScrapeConfig struct {
|
|||
// More than this label value length post metric-relabeling will cause the
|
||||
// scrape to fail.
|
||||
LabelValueLengthLimit uint `yaml:"label_value_length_limit,omitempty"`
|
||||
// More than this many buckets in a native histogram will cause the histogram
|
||||
// to be ignored, but it will not make the whole scrape fail.
|
||||
// More than this many buckets in a native histogram will cause the scrape to
|
||||
// fail.
|
||||
NativeHistogramBucketLimit uint `yaml:"bucket_limit,omitempty"`
|
||||
|
||||
// We cannot do proper Go type embedding below as the parser will then parse
|
||||
|
|
|
@ -378,8 +378,8 @@ metric_relabel_configs:
|
|||
[ target_limit: <int> | default = 0 ]
|
||||
|
||||
# Limit on total number of positive and negative buckets allowed in a native
|
||||
# histogram. If this is exceeded, the histogram will be ignored, but this will
|
||||
# not make the scrape fail. 0 means no limit.
|
||||
# histogram. If this is exceeded, the entire scrape will be treated as failed.
|
||||
# 0 means no limit.
|
||||
[ sample_limit: <int> | default = 0 ]
|
||||
```
|
||||
|
||||
|
|
|
@ -194,7 +194,7 @@ var (
|
|||
targetScrapeNativeHistogramBucketLimit = prometheus.NewCounter(
|
||||
prometheus.CounterOpts{
|
||||
Name: "prometheus_target_scrapes_histogram_exceeded_bucket_limit_total",
|
||||
Help: "Total number of native histograms rejected due to exceeding the bucket limit.",
|
||||
Help: "Total number of scrapes that hit the native histograms bucket limit and were rejected.",
|
||||
},
|
||||
)
|
||||
)
|
||||
|
@ -1485,11 +1485,10 @@ func (sl *scrapeLoop) getCache() *scrapeCache {
|
|||
}
|
||||
|
||||
type appendErrors struct {
|
||||
numOutOfOrder int
|
||||
numDuplicates int
|
||||
numOutOfBounds int
|
||||
numExemplarOutOfOrder int
|
||||
numHistogramBucketLimit int
|
||||
numOutOfOrder int
|
||||
numDuplicates int
|
||||
numOutOfBounds int
|
||||
numExemplarOutOfOrder int
|
||||
}
|
||||
|
||||
func (sl *scrapeLoop) append(app storage.Appender, b []byte, contentType string, ts time.Time) (total, added, seriesAdded int, err error) {
|
||||
|
@ -1506,6 +1505,7 @@ func (sl *scrapeLoop) append(app storage.Appender, b []byte, contentType string,
|
|||
defTime = timestamp.FromTime(ts)
|
||||
appErrs = appendErrors{}
|
||||
sampleLimitErr error
|
||||
bucketLimitErr error
|
||||
e exemplar.Exemplar // escapes to heap so hoisted out of loop
|
||||
meta metadata.Metadata
|
||||
metadataChanged bool
|
||||
|
@ -1655,7 +1655,7 @@ loop:
|
|||
} else {
|
||||
ref, err = app.Append(ref, lset, t, val)
|
||||
}
|
||||
sampleAdded, err = sl.checkAddError(ce, met, parsedTimestamp, err, &sampleLimitErr, &appErrs)
|
||||
sampleAdded, err = sl.checkAddError(ce, met, parsedTimestamp, err, &sampleLimitErr, &bucketLimitErr, &appErrs)
|
||||
if err != nil {
|
||||
if err != storage.ErrNotFound {
|
||||
level.Debug(sl.l).Log("msg", "Unexpected error", "series", string(met), "err", err)
|
||||
|
@ -1669,7 +1669,7 @@ loop:
|
|||
sl.cache.trackStaleness(hash, lset)
|
||||
}
|
||||
sl.cache.addRef(met, ref, lset, hash)
|
||||
if sampleAdded && sampleLimitErr == nil {
|
||||
if sampleAdded && sampleLimitErr == nil && bucketLimitErr == nil {
|
||||
seriesAdded++
|
||||
}
|
||||
}
|
||||
|
@ -1705,6 +1705,13 @@ loop:
|
|||
// We only want to increment this once per scrape, so this is Inc'd outside the loop.
|
||||
targetScrapeSampleLimit.Inc()
|
||||
}
|
||||
if bucketLimitErr != nil {
|
||||
if err == nil {
|
||||
err = bucketLimitErr // if sample limit is hit, that error takes precedence
|
||||
}
|
||||
// We only want to increment this once per scrape, so this is Inc'd outside the loop.
|
||||
targetScrapeNativeHistogramBucketLimit.Inc()
|
||||
}
|
||||
if appErrs.numOutOfOrder > 0 {
|
||||
level.Warn(sl.l).Log("msg", "Error on ingesting out-of-order samples", "num_dropped", appErrs.numOutOfOrder)
|
||||
}
|
||||
|
@ -1717,9 +1724,6 @@ loop:
|
|||
if appErrs.numExemplarOutOfOrder > 0 {
|
||||
level.Warn(sl.l).Log("msg", "Error on ingesting out-of-order exemplars", "num_dropped", appErrs.numExemplarOutOfOrder)
|
||||
}
|
||||
if appErrs.numHistogramBucketLimit > 0 {
|
||||
level.Warn(sl.l).Log("msg", "Error on ingesting native histograms that exceeded bucket limit", "num_dropped", appErrs.numHistogramBucketLimit)
|
||||
}
|
||||
if err == nil {
|
||||
sl.cache.forEachStale(func(lset labels.Labels) bool {
|
||||
// Series no longer exposed, mark it stale.
|
||||
|
@ -1737,8 +1741,8 @@ loop:
|
|||
}
|
||||
|
||||
// Adds samples to the appender, checking the error, and then returns the # of samples added,
|
||||
// whether the caller should continue to process more samples, and any sample limit errors.
|
||||
func (sl *scrapeLoop) checkAddError(ce *cacheEntry, met []byte, tp *int64, err error, sampleLimitErr *error, appErrs *appendErrors) (bool, error) {
|
||||
// whether the caller should continue to process more samples, and any sample or bucket limit errors.
|
||||
func (sl *scrapeLoop) checkAddError(ce *cacheEntry, met []byte, tp *int64, err error, sampleLimitErr, bucketLimitErr *error, appErrs *appendErrors) (bool, error) {
|
||||
switch errors.Cause(err) {
|
||||
case nil:
|
||||
if tp == nil && ce != nil {
|
||||
|
@ -1762,16 +1766,16 @@ func (sl *scrapeLoop) checkAddError(ce *cacheEntry, met []byte, tp *int64, err e
|
|||
level.Debug(sl.l).Log("msg", "Out of bounds metric", "series", string(met))
|
||||
targetScrapeSampleOutOfBounds.Inc()
|
||||
return false, nil
|
||||
case storage.ErrHistogramBucketLimit:
|
||||
appErrs.numHistogramBucketLimit++
|
||||
level.Debug(sl.l).Log("msg", "Exceeded bucket limit for native histograms", "series", string(met))
|
||||
targetScrapeNativeHistogramBucketLimit.Inc()
|
||||
return false, nil
|
||||
case errSampleLimit:
|
||||
// Keep on parsing output if we hit the limit, so we report the correct
|
||||
// total number of samples scraped.
|
||||
*sampleLimitErr = err
|
||||
return false, nil
|
||||
case errBucketLimit:
|
||||
// Keep on parsing output if we hit the limit, so we report the correct
|
||||
// total number of samples scraped.
|
||||
*bucketLimitErr = err
|
||||
return false, nil
|
||||
default:
|
||||
return false, err
|
||||
}
|
||||
|
|
|
@ -1788,7 +1788,10 @@ func TestScrapeLoop_HistogramBucketLimit(t *testing.T) {
|
|||
|
||||
now = time.Now()
|
||||
total, added, seriesAdded, err = sl.append(app, msg, "application/vnd.google.protobuf", now)
|
||||
require.NoError(t, err)
|
||||
if err != errBucketLimit {
|
||||
t.Fatalf("Did not see expected histogram bucket limit error: %s", err)
|
||||
}
|
||||
require.NoError(t, app.Rollback())
|
||||
require.Equal(t, 1, total)
|
||||
require.Equal(t, 1, added)
|
||||
require.Equal(t, 0, seriesAdded)
|
||||
|
@ -3010,7 +3013,7 @@ func TestReuseCacheRace(*testing.T) {
|
|||
func TestCheckAddError(t *testing.T) {
|
||||
var appErrs appendErrors
|
||||
sl := scrapeLoop{l: log.NewNopLogger()}
|
||||
sl.checkAddError(nil, nil, nil, storage.ErrOutOfOrderSample, nil, &appErrs)
|
||||
sl.checkAddError(nil, nil, nil, storage.ErrOutOfOrderSample, nil, nil, &appErrs)
|
||||
require.Equal(t, 1, appErrs.numOutOfOrder)
|
||||
}
|
||||
|
||||
|
|
|
@ -314,7 +314,10 @@ func (ts Targets) Len() int { return len(ts) }
|
|||
func (ts Targets) Less(i, j int) bool { return ts[i].URL().String() < ts[j].URL().String() }
|
||||
func (ts Targets) Swap(i, j int) { ts[i], ts[j] = ts[j], ts[i] }
|
||||
|
||||
var errSampleLimit = errors.New("sample limit exceeded")
|
||||
var (
|
||||
errSampleLimit = errors.New("sample limit exceeded")
|
||||
errBucketLimit = errors.New("histogram bucket limit exceeded")
|
||||
)
|
||||
|
||||
// limitAppender limits the number of total appended samples in a batch.
|
||||
type limitAppender struct {
|
||||
|
@ -366,12 +369,12 @@ type bucketLimitAppender struct {
|
|||
func (app *bucketLimitAppender) AppendHistogram(ref storage.SeriesRef, lset labels.Labels, t int64, h *histogram.Histogram, fh *histogram.FloatHistogram) (storage.SeriesRef, error) {
|
||||
if h != nil {
|
||||
if len(h.PositiveBuckets)+len(h.NegativeBuckets) > app.limit {
|
||||
return 0, storage.ErrHistogramBucketLimit
|
||||
return 0, errBucketLimit
|
||||
}
|
||||
}
|
||||
if fh != nil {
|
||||
if len(fh.PositiveBuckets)+len(fh.NegativeBuckets) > app.limit {
|
||||
return 0, storage.ErrHistogramBucketLimit
|
||||
return 0, errBucketLimit
|
||||
}
|
||||
}
|
||||
ref, err := app.Appender.AppendHistogram(ref, lset, t, h, fh)
|
||||
|
|
|
@ -46,7 +46,6 @@ var (
|
|||
ErrHistogramNegativeBucketCount = errors.New("histogram has a bucket whose observation count is negative")
|
||||
ErrHistogramSpanNegativeOffset = errors.New("histogram has a span whose offset is negative")
|
||||
ErrHistogramSpansBucketsMismatch = errors.New("histogram spans specify different number of buckets than provided")
|
||||
ErrHistogramBucketLimit = errors.New("histogram bucket limit exceeded")
|
||||
)
|
||||
|
||||
// SeriesRef is a generic series reference. In prometheus it is either a
|
||||
|
|
Loading…
Reference in a new issue