From f0a26266c0103c2100515c28603329dcff063049 Mon Sep 17 00:00:00 2001
From: Ɓukasz Mierzwa
Date: Fri, 3 Sep 2021 14:37:42 +0100
Subject: [PATCH] Add scrape_sample_limit metric
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds a new metric exposing the per-target scrape sample_limit value.
The metric is only exposed if the extra-scrape-metrics feature flag is
enabled.

scrape_sample_limit will make it easy to monitor and alert on targets
getting close to their configured sample_limit, which is important given
that exceeding sample_limit results in the entire scrape being rejected.

Signed-off-by: Ɓukasz Mierzwa
---
 docs/feature_flags.md |  8 +++++---
 scrape/scrape.go      | 11 +++++++++++
 scrape/scrape_test.go | 30 ++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/docs/feature_flags.md b/docs/feature_flags.md
index 67ce75be8..07424d731 100644
--- a/docs/feature_flags.md
+++ b/docs/feature_flags.md
@@ -34,7 +34,7 @@ that PromQL does not look ahead of the evaluation time for samples.
 
 `--enable-feature=promql-negative-offset`
 
 In contrast to the positive offset modifier, the negative offset modifier lets
-one shift a vector selector into the future.  An example in which one may want
+one shift a vector selector into the future. An example in which one may want
 to use a negative offset is reviewing past data and making temporal comparisons
 with more recent data.
@@ -59,7 +59,7 @@ Exemplar storage is implemented as a fixed size circular buffer that stores exem
 `--enable-feature=memory-snapshot-on-shutdown`
 
 This takes the snapshot of the chunks that are in memory along with the series information when shutting down and stores
-it on disk.  This will reduce the startup time since the memory state can be restored with this snapshot and m-mapped
+it on disk. This will reduce the startup time since the memory state can be restored with this snapshot and m-mapped
 chunks without the need of WAL replay.
 
 ## Extra Scrape Metrics
@@ -68,4 +68,6 @@ chunks without the need of WAL replay.
 
 When enabled, for each instance scrape, Prometheus stores a sample in the following additional time series:
 
-* `scrape_timeout_seconds`. The configured `scrape_timeout` for a target. This allows you to measure each target to find out how close they are to timing out with `scrape_duration_seconds / scrape_timeout_seconds`.
+- `scrape_timeout_seconds`. The configured `scrape_timeout` for a target. This allows you to measure each target to find out how close they are to timing out with `scrape_duration_seconds / scrape_timeout_seconds`.
+- `scrape_sample_limit`. The configured `sample_limit` for a target. This allows you to measure each target
+  to find out how close they are to reaching the limit with `scrape_samples_post_metric_relabeling / scrape_sample_limit`. Note that `scrape_sample_limit` can be zero if there is no limit configured, which means that the query above can return `+Inf` for targets with no limit (as we divide by zero). If you want to query only for targets that do have a sample limit use this query: `scrape_samples_post_metric_relabeling / (scrape_sample_limit > 0)`.
diff --git a/scrape/scrape.go b/scrape/scrape.go
index 736c1ef84..5344e663a 100644
--- a/scrape/scrape.go
+++ b/scrape/scrape.go
@@ -308,6 +308,7 @@ func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, jitterSeed
 			cache,
 			jitterSeed,
 			opts.honorTimestamps,
+			opts.sampleLimit,
 			opts.labelLimits,
 			opts.interval,
 			opts.timeout,
@@ -815,6 +816,7 @@ type scrapeLoop struct {
 	honorTimestamps bool
 	forcedErr       error
 	forcedErrMtx    sync.Mutex
+	sampleLimit     int
 	labelLimits     *labelLimits
 	interval        time.Duration
 	timeout         time.Duration
@@ -1087,6 +1089,7 @@ func newScrapeLoop(ctx context.Context,
 	cache *scrapeCache,
 	jitterSeed uint64,
 	honorTimestamps bool,
+	sampleLimit int,
 	labelLimits *labelLimits,
 	interval time.Duration,
 	timeout time.Duration,
@@ -1113,6 +1116,7 @@ func newScrapeLoop(ctx context.Context,
 		l:               l,
 		parentCtx:       ctx,
 		honorTimestamps: honorTimestamps,
+		sampleLimit:     sampleLimit,
 		labelLimits:     labelLimits,
 		interval:        interval,
 		timeout:         timeout,
@@ -1610,6 +1614,7 @@ const (
 	samplesPostRelabelMetricName = "scrape_samples_post_metric_relabeling" + "\xff"
 	scrapeSeriesAddedMetricName  = "scrape_series_added" + "\xff"
 	scrapeTimeoutMetricName      = "scrape_timeout_seconds" + "\xff"
+	scrapeSampleLimitMetricName  = "scrape_sample_limit" + "\xff"
 )
 
 func (sl *scrapeLoop) report(app storage.Appender, start time.Time, timeout, duration time.Duration, scraped, added, seriesAdded int, scrapeErr error) (err error) {
@@ -1641,6 +1646,9 @@ func (sl *scrapeLoop) report(app storage.Appender, start time.Time, timeout, dur
 		if err = sl.addReportSample(app, scrapeTimeoutMetricName, ts, timeout.Seconds()); err != nil {
 			return
 		}
+		if err = sl.addReportSample(app, scrapeSampleLimitMetricName, ts, float64(sl.sampleLimit)); err != nil {
+			return
+		}
 	}
 	return
 }
@@ -1669,6 +1677,9 @@ func (sl *scrapeLoop) reportStale(app storage.Appender, start time.Time) (err er
 		if err = sl.addReportSample(app, scrapeTimeoutMetricName, ts, stale); err != nil {
 			return
 		}
+		if err = sl.addReportSample(app, scrapeSampleLimitMetricName, ts, stale); err != nil {
+			return
+		}
 	}
 	return
 }
diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go
index 9b27e9783..fc2b51ab7 100644
--- a/scrape/scrape_test.go
+++ b/scrape/scrape_test.go
@@ -586,6 +586,7 @@ func TestScrapeLoopStopBeforeRun(t *testing.T) {
 		nopMutator,
 		nil, nil, 0,
 		true,
+		0,
 		nil,
 		1,
 		0,
@@ -654,6 +655,7 @@ func TestScrapeLoopStop(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		10*time.Millisecond,
 		time.Hour,
@@ -725,6 +727,7 @@ func TestScrapeLoopRun(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		time.Second,
 		time.Hour,
@@ -776,6 +779,7 @@ func TestScrapeLoopRun(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		time.Second,
 		100*time.Millisecond,
@@ -831,6 +835,7 @@ func TestScrapeLoopForcedErr(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		time.Second,
 		time.Hour,
@@ -885,6 +890,7 @@ func TestScrapeLoopMetadata(t *testing.T) {
 		cache,
 		0,
 		true,
+		0,
 		nil,
 		0,
 		0,
@@ -938,6 +944,7 @@ func TestScrapeLoopSeriesAdded(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		0,
 		0,
@@ -980,6 +987,7 @@ func TestScrapeLoopRunCreatesStaleMarkersOnFailedScrape(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		10*time.Millisecond,
 		time.Hour,
@@ -1038,6 +1046,7 @@ func TestScrapeLoopRunCreatesStaleMarkersOnParseFailure(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		10*time.Millisecond,
 		time.Hour,
@@ -1100,6 +1109,7 @@ func TestScrapeLoopCache(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		10*time.Millisecond,
 		time.Hour,
@@ -1178,6 +1188,7 @@ func TestScrapeLoopCacheMemoryExhaustionProtection(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		10*time.Millisecond,
 		time.Hour,
@@ -1288,6 +1299,7 @@ func TestScrapeLoopAppend(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		0,
 		0,
@@ -1333,6 +1345,7 @@ func TestScrapeLoopAppendCacheEntryButErrNotFound(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		0,
 		0,
@@ -1386,6 +1399,7 @@ func TestScrapeLoopAppendSampleLimit(t *testing.T) {
 		nil,
 		0,
 		true,
+		app.limit,
 		nil,
 		0,
 		0,
@@ -1459,6 +1473,7 @@ func TestScrapeLoop_ChangingMetricString(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		0,
 		0,
@@ -1503,6 +1518,7 @@ func TestScrapeLoopAppendStaleness(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		0,
 		0,
@@ -1550,6 +1566,7 @@ func TestScrapeLoopAppendNoStalenessIfTimestamp(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		0,
 		0,
@@ -1655,6 +1672,7 @@ metric_total{n="2"} 2 # {t="2"} 2.0 20000
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		0,
 		0,
@@ -1716,6 +1734,7 @@ func TestScrapeLoopAppendExemplarSeries(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		0,
 		0,
@@ -1764,6 +1783,7 @@ func TestScrapeLoopRunReportsTargetDownOnScrapeError(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		10*time.Millisecond,
 		time.Hour,
@@ -1796,6 +1816,7 @@ func TestScrapeLoopRunReportsTargetDownOnInvalidUTF8(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		10*time.Millisecond,
 		time.Hour,
@@ -1841,6 +1862,7 @@ func TestScrapeLoopAppendGracefullyIfAmendOrOutOfOrderOrOutOfBounds(t *testing.T
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		0,
 		0,
@@ -1882,6 +1904,7 @@ func TestScrapeLoopOutOfBoundsTimeError(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		0,
 		0,
@@ -2136,6 +2159,7 @@ func TestScrapeLoop_RespectTimestamps(t *testing.T) {
 		func(ctx context.Context) storage.Appender { return capp },
 		nil, 0,
 		true,
+		0,
 		nil,
 		0,
 		0,
@@ -2173,6 +2197,7 @@ func TestScrapeLoop_DiscardTimestamps(t *testing.T) {
 		func(ctx context.Context) storage.Appender { return capp },
 		nil, 0,
 		false,
+		0,
 		nil,
 		0,
 		0,
@@ -2209,6 +2234,7 @@ func TestScrapeLoopDiscardDuplicateLabels(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		0,
 		0,
@@ -2263,6 +2289,7 @@ func TestScrapeLoopDiscardUnnamedMetrics(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		0,
 		0,
@@ -2484,6 +2511,7 @@ func TestScrapeAddFast(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		0,
 		0,
@@ -2571,6 +2599,7 @@ func TestScrapeReportSingleAppender(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		nil,
 		10*time.Millisecond,
 		time.Hour,
@@ -2703,6 +2732,7 @@ func TestScrapeLoopLabelLimit(t *testing.T) {
 		nil,
 		0,
 		true,
+		0,
 		&test.labelLimits,
 		0,
 		0,
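
A usage sketch for the new metric (illustrative only, not part of the patch itself;
the 0.9 threshold is just an example, and both expressions assume Prometheus runs
with --enable-feature=extra-scrape-metrics):

    # Fraction of the configured sample_limit each target is currently using.
    # The "> 0" filter drops targets with no limit (scrape_sample_limit == 0),
    # which would otherwise divide by zero and return +Inf.
    scrape_samples_post_metric_relabeling / (scrape_sample_limit > 0)

    # Example alerting expression: targets using more than 90% of their sample_limit.
    scrape_samples_post_metric_relabeling / (scrape_sample_limit > 0) > 0.9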