2015-05-12 03:21:24 -07:00
|
|
|
# Two histograms with 4 buckets each (x_sum and x_count not included,
|
|
|
|
# only buckets). Lowest bucket for one histogram < 0, for the other >
|
|
|
|
# 0. They have the same name, just separated by label. Not useful in
|
|
|
|
# practice, but can happen (if clients change bucketing), and the
|
|
|
|
# server has to cope with it.
|
|
|
|
|
|
|
|
# Test histogram.
|
2024-04-24 00:36:05 -07:00
|
|
|
load_with_nhcb 5m
|
2015-05-12 03:21:24 -07:00
|
|
|
testhistogram_bucket{le="0.1", start="positive"} 0+5x10
|
|
|
|
testhistogram_bucket{le=".2", start="positive"} 0+7x10
|
|
|
|
testhistogram_bucket{le="1e0", start="positive"} 0+11x10
|
|
|
|
testhistogram_bucket{le="+Inf", start="positive"} 0+12x10
|
|
|
|
testhistogram_bucket{le="-.2", start="negative"} 0+1x10
|
|
|
|
testhistogram_bucket{le="-0.1", start="negative"} 0+2x10
|
|
|
|
testhistogram_bucket{le="0.3", start="negative"} 0+2x10
|
|
|
|
testhistogram_bucket{le="+Inf", start="negative"} 0+3x10
|
|
|
|
|
2021-10-20 07:13:36 -07:00
|
|
|
# Another test histogram, where q(1/6), q(1/2), and q(5/6) are each in
|
|
|
|
# the middle of a bucket and should therefore be 1, 3, and 5,
|
|
|
|
# respectively.
|
2024-04-24 00:36:05 -07:00
|
|
|
load_with_nhcb 5m
|
2021-10-20 07:13:36 -07:00
|
|
|
testhistogram2_bucket{le="0"} 0+0x10
|
|
|
|
testhistogram2_bucket{le="2"} 0+1x10
|
|
|
|
testhistogram2_bucket{le="4"} 0+2x10
|
|
|
|
testhistogram2_bucket{le="6"} 0+3x10
|
|
|
|
testhistogram2_bucket{le="+Inf"} 0+3x10
|
2015-05-12 03:21:24 -07:00
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
# Another test histogram, with 0 counts in every bucket that has
|
|
|
|
# an infinite bound, allowing us to calculate standard deviation
|
|
|
|
# and variance properly.
|
|
|
|
load_with_nhcb 5m
|
|
|
|
testhistogram3_bucket{le="0", start="positive"} 0+0x10
|
|
|
|
testhistogram3_bucket{le="0.1", start="positive"} 0+5x10
|
|
|
|
testhistogram3_bucket{le=".2", start="positive"} 0+7x10
|
|
|
|
testhistogram3_bucket{le="1e0", start="positive"} 0+11x10
|
|
|
|
testhistogram3_bucket{le="+Inf", start="positive"} 0+11x10
|
|
|
|
testhistogram3_sum{start="positive"} 0+33x10
|
|
|
|
testhistogram3_count{start="positive"} 0+11x10
|
|
|
|
testhistogram3_bucket{le="-.25", start="negative"} 0+0x10
|
|
|
|
testhistogram3_bucket{le="-.2", start="negative"} 0+1x10
|
|
|
|
testhistogram3_bucket{le="-0.1", start="negative"} 0+2x10
|
|
|
|
testhistogram3_bucket{le="0.3", start="negative"} 0+2x10
|
|
|
|
testhistogram3_bucket{le="+Inf", start="negative"} 0+2x10
|
|
|
|
testhistogram3_sum{start="negative"} 0+8x10
|
|
|
|
testhistogram3_count{start="negative"} 0+2x10
|
|
|
|
|
2015-05-12 03:21:24 -07:00
|
|
|
# Now a more realistic histogram per job and instance to test aggregation.
|
2024-04-24 00:36:05 -07:00
|
|
|
load_with_nhcb 5m
|
2015-05-12 03:21:24 -07:00
|
|
|
request_duration_seconds_bucket{job="job1", instance="ins1", le="0.1"} 0+1x10
|
|
|
|
request_duration_seconds_bucket{job="job1", instance="ins1", le="0.2"} 0+3x10
|
|
|
|
request_duration_seconds_bucket{job="job1", instance="ins1", le="+Inf"} 0+4x10
|
|
|
|
request_duration_seconds_bucket{job="job1", instance="ins2", le="0.1"} 0+2x10
|
|
|
|
request_duration_seconds_bucket{job="job1", instance="ins2", le="0.2"} 0+5x10
|
|
|
|
request_duration_seconds_bucket{job="job1", instance="ins2", le="+Inf"} 0+6x10
|
|
|
|
request_duration_seconds_bucket{job="job2", instance="ins1", le="0.1"} 0+3x10
|
|
|
|
request_duration_seconds_bucket{job="job2", instance="ins1", le="0.2"} 0+4x10
|
|
|
|
request_duration_seconds_bucket{job="job2", instance="ins1", le="+Inf"} 0+6x10
|
|
|
|
request_duration_seconds_bucket{job="job2", instance="ins2", le="0.1"} 0+4x10
|
|
|
|
request_duration_seconds_bucket{job="job2", instance="ins2", le="0.2"} 0+7x10
|
|
|
|
request_duration_seconds_bucket{job="job2", instance="ins2", le="+Inf"} 0+9x10
|
|
|
|
|
2019-02-01 02:22:44 -08:00
|
|
|
# Different le representations in one histogram.
|
2024-04-24 00:36:05 -07:00
|
|
|
load_with_nhcb 5m
|
2019-02-01 02:22:44 -08:00
|
|
|
mixed_bucket{job="job1", instance="ins1", le="0.1"} 0+1x10
|
|
|
|
mixed_bucket{job="job1", instance="ins1", le="0.2"} 0+1x10
|
|
|
|
mixed_bucket{job="job1", instance="ins1", le="2e-1"} 0+1x10
|
|
|
|
mixed_bucket{job="job1", instance="ins1", le="2.0e-1"} 0+1x10
|
|
|
|
mixed_bucket{job="job1", instance="ins1", le="+Inf"} 0+4x10
|
|
|
|
mixed_bucket{job="job1", instance="ins2", le="+inf"} 0+0x10
|
|
|
|
mixed_bucket{job="job1", instance="ins2", le="+Inf"} 0+0x10
|
2015-05-12 03:21:24 -07:00
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
# Test histogram_count.
|
|
|
|
eval instant at 50m histogram_count(testhistogram3)
|
|
|
|
{start="positive"} 110
|
|
|
|
{start="negative"} 20
|
|
|
|
|
|
|
|
# Test histogram_sum.
|
|
|
|
eval instant at 50m histogram_sum(testhistogram3)
|
|
|
|
{start="positive"} 330
|
|
|
|
{start="negative"} 80
|
|
|
|
|
|
|
|
# Test histogram_avg.
|
|
|
|
eval instant at 50m histogram_avg(testhistogram3)
|
|
|
|
{start="positive"} 3
|
|
|
|
{start="negative"} 4
|
|
|
|
|
|
|
|
# Test histogram_stddev.
|
|
|
|
eval instant at 50m histogram_stddev(testhistogram3)
|
|
|
|
{start="positive"} 2.8189265757336734
|
|
|
|
{start="negative"} 4.182715937754936
|
|
|
|
|
|
|
|
# Test histogram_stdvar.
|
|
|
|
eval instant at 50m histogram_stdvar(testhistogram3)
|
|
|
|
{start="positive"} 7.946347039377573
|
|
|
|
{start="negative"} 17.495112615949154
|
|
|
|
|
|
|
|
# Test histogram_fraction.
|
|
|
|
|
|
|
|
eval instant at 50m histogram_fraction(0, 0.2, testhistogram3)
|
|
|
|
{start="positive"} 0.6363636363636364
|
|
|
|
{start="negative"} 0
|
|
|
|
|
|
|
|
eval instant at 50m histogram_fraction(0, 0.2, rate(testhistogram3[5m]))
|
|
|
|
{start="positive"} 0.6363636363636364
|
|
|
|
{start="negative"} 0
|
|
|
|
|
|
|
|
# Test histogram_quantile.
|
|
|
|
|
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0, testhistogram3_bucket)
|
|
|
|
{start="positive"} 0
|
|
|
|
{start="negative"} -0.25
|
|
|
|
|
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.25, testhistogram3_bucket)
|
|
|
|
{start="positive"} 0.055
|
|
|
|
{start="negative"} -0.225
|
|
|
|
|
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.5, testhistogram3_bucket)
|
|
|
|
{start="positive"} 0.125
|
|
|
|
{start="negative"} -0.2
|
|
|
|
|
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.75, testhistogram3_bucket)
|
|
|
|
{start="positive"} 0.45
|
|
|
|
{start="negative"} -0.15
|
|
|
|
|
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(1, testhistogram3_bucket)
|
|
|
|
{start="positive"} 1
|
|
|
|
{start="negative"} -0.1
|
|
|
|
|
2015-05-12 03:21:24 -07:00
|
|
|
# Quantile too low.
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(-0.1, testhistogram_bucket)
|
2015-05-12 03:21:24 -07:00
|
|
|
{start="positive"} -Inf
|
|
|
|
{start="negative"} -Inf
|
|
|
|
|
|
|
|
# Quantile too high.
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(1.01, testhistogram_bucket)
|
2015-05-12 03:21:24 -07:00
|
|
|
{start="positive"} +Inf
|
|
|
|
{start="negative"} +Inf
|
|
|
|
|
2022-02-13 05:59:03 -08:00
|
|
|
# Quantile invalid.
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(NaN, testhistogram_bucket)
|
2022-02-13 05:59:03 -08:00
|
|
|
{start="positive"} NaN
|
|
|
|
{start="negative"} NaN
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
# Quantile value in lowest bucket.
|
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0, testhistogram_bucket)
|
2015-05-12 03:21:24 -07:00
|
|
|
{start="positive"} 0
|
|
|
|
{start="negative"} -0.2
|
|
|
|
|
|
|
|
# Quantile value in highest bucket.
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(1, testhistogram_bucket)
|
2015-05-12 03:21:24 -07:00
|
|
|
{start="positive"} 1
|
|
|
|
{start="negative"} 0.3
|
|
|
|
|
|
|
|
# Finally some useful quantiles.
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.2, testhistogram_bucket)
|
2015-05-12 03:21:24 -07:00
|
|
|
{start="positive"} 0.048
|
|
|
|
{start="negative"} -0.2
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.5, testhistogram_bucket)
|
2015-05-12 03:21:24 -07:00
|
|
|
{start="positive"} 0.15
|
|
|
|
{start="negative"} -0.15
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.8, testhistogram_bucket)
|
2015-05-12 03:21:24 -07:00
|
|
|
{start="positive"} 0.72
|
|
|
|
{start="negative"} 0.3
|
|
|
|
|
|
|
|
# More realistic with rates.
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.2, rate(testhistogram_bucket[5m]))
|
2015-05-12 03:21:24 -07:00
|
|
|
{start="positive"} 0.048
|
|
|
|
{start="negative"} -0.2
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.5, rate(testhistogram_bucket[5m]))
|
2015-05-12 03:21:24 -07:00
|
|
|
{start="positive"} 0.15
|
|
|
|
{start="negative"} -0.15
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.8, rate(testhistogram_bucket[5m]))
|
2015-05-12 03:21:24 -07:00
|
|
|
{start="positive"} 0.72
|
|
|
|
{start="negative"} 0.3
|
|
|
|
|
2021-10-20 07:13:36 -07:00
|
|
|
# Want results exactly in the middle of the bucket.
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 7m histogram_quantile(1./6., testhistogram2_bucket)
|
2021-10-20 07:13:36 -07:00
|
|
|
{} 1
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 7m histogram_quantile(0.5, testhistogram2_bucket)
|
2021-10-20 07:13:36 -07:00
|
|
|
{} 3
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 7m histogram_quantile(5./6., testhistogram2_bucket)
|
2021-10-20 07:13:36 -07:00
|
|
|
{} 5
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 47m histogram_quantile(1./6., rate(testhistogram2_bucket[15m]))
|
2021-10-20 07:13:36 -07:00
|
|
|
{} 1
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 47m histogram_quantile(0.5, rate(testhistogram2_bucket[15m]))
|
2021-10-20 07:13:36 -07:00
|
|
|
{} 3
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 47m histogram_quantile(5./6., rate(testhistogram2_bucket[15m]))
|
2021-10-20 07:13:36 -07:00
|
|
|
{} 5
|
|
|
|
|
2015-05-12 03:21:24 -07:00
|
|
|
# Aggregated histogram: Everything in one.
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.3, sum(rate(request_duration_seconds_bucket[5m])) by (le))
|
2015-05-12 03:21:24 -07:00
|
|
|
{} 0.075
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.5, sum(rate(request_duration_seconds_bucket[5m])) by (le))
|
2015-05-12 03:21:24 -07:00
|
|
|
{} 0.1277777777777778
|
|
|
|
|
|
|
|
# Aggregated histogram: Everything in one. Now with avg, which does not change anything.
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.3, avg(rate(request_duration_seconds_bucket[5m])) by (le))
|
2015-05-12 03:21:24 -07:00
|
|
|
{} 0.075
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.5, avg(rate(request_duration_seconds_bucket[5m])) by (le))
|
2015-05-12 03:21:24 -07:00
|
|
|
{} 0.12777777777777778
|
|
|
|
|
2021-01-20 02:57:39 -08:00
|
|
|
# Aggregated histogram: By instance.
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.3, sum(rate(request_duration_seconds_bucket[5m])) by (le, instance))
|
2015-05-12 03:21:24 -07:00
|
|
|
{instance="ins1"} 0.075
|
|
|
|
{instance="ins2"} 0.075
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.5, sum(rate(request_duration_seconds_bucket[5m])) by (le, instance))
|
2015-05-12 03:21:24 -07:00
|
|
|
{instance="ins1"} 0.1333333333
|
|
|
|
{instance="ins2"} 0.125
|
|
|
|
|
2021-01-20 02:57:39 -08:00
|
|
|
# Aggregated histogram: By job.
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.3, sum(rate(request_duration_seconds_bucket[5m])) by (le, job))
|
2015-05-12 03:21:24 -07:00
|
|
|
{job="job1"} 0.1
|
|
|
|
{job="job2"} 0.0642857142857143
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.5, sum(rate(request_duration_seconds_bucket[5m])) by (le, job))
|
2015-05-12 03:21:24 -07:00
|
|
|
{job="job1"} 0.14
|
|
|
|
{job="job2"} 0.1125
|
|
|
|
|
|
|
|
# Aggregated histogram: By job and instance.
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.3, sum(rate(request_duration_seconds_bucket[5m])) by (le, job, instance))
|
2015-05-12 03:21:24 -07:00
|
|
|
{instance="ins1", job="job1"} 0.11
|
|
|
|
{instance="ins2", job="job1"} 0.09
|
|
|
|
{instance="ins1", job="job2"} 0.06
|
|
|
|
{instance="ins2", job="job2"} 0.0675
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.5, sum(rate(request_duration_seconds_bucket[5m])) by (le, job, instance))
|
2015-05-12 03:21:24 -07:00
|
|
|
{instance="ins1", job="job1"} 0.15
|
|
|
|
{instance="ins2", job="job1"} 0.1333333333333333
|
|
|
|
{instance="ins1", job="job2"} 0.1
|
|
|
|
{instance="ins2", job="job2"} 0.1166666666666667
|
|
|
|
|
|
|
|
# The unaggregated histogram for comparison. Same result as the previous one.
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.3, rate(request_duration_seconds_bucket[5m]))
|
2015-05-12 03:21:24 -07:00
|
|
|
{instance="ins1", job="job1"} 0.11
|
|
|
|
{instance="ins2", job="job1"} 0.09
|
|
|
|
{instance="ins1", job="job2"} 0.06
|
|
|
|
{instance="ins2", job="job2"} 0.0675
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.5, rate(request_duration_seconds_bucket[5m]))
|
2015-05-12 03:21:24 -07:00
|
|
|
{instance="ins1", job="job1"} 0.15
|
|
|
|
{instance="ins2", job="job1"} 0.13333333333333333
|
|
|
|
{instance="ins1", job="job2"} 0.1
|
2015-05-12 04:41:57 -07:00
|
|
|
{instance="ins2", job="job2"} 0.11666666666666667
|
Force buckets in a histogram to be monotonic for quantile estimation (#2610)
* Force buckets in a histogram to be monotonic for quantile estimation
The assumption that bucket counts increase monotonically with increasing
upperBound may be violated during:
* Recording rule evaluation of histogram_quantile, especially when rate()
has been applied to the underlying bucket timeseries.
* Evaluation of histogram_quantile computed over federated bucket
timeseries, especially when rate() has been applied
This is because scraped data is not made available to RR evaluation or
federation atomically, so some buckets are computed with data from the N
most recent scrapes, but the other buckets are missing the most recent
observations.
Monotonicity is usually guaranteed because if a bucket with upper bound
u1 has count c1, then any bucket with a higher upper bound u > u1 must
have counted all c1 observations and perhaps more, so that c >= c1.
Randomly interspersed partial sampling breaks that guarantee, and rate()
exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from
4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The
monotonicity is broken. It is exacerbated by rate() because under normal
operation, cumulative counting of buckets will cause the bucket counts to
diverge such that small differences from missing samples are not a problem.
rate() removes this divergence.
bucketQuantile depends on that monotonicity to do a binary search for the
bucket with the qth percentile count, so breaking the monotonicity
guarantee causes bucketQuantile() to return undefined (nonsense) results.
As a somewhat hacky solution until the Prometheus project is ready to
accept the changes required to make scrapes atomic, we calculate the
"envelope" of the histogram buckets, essentially removing any decreases
in the count between successive buckets.
* Fix up comment docs for ensureMonotonic
* ensureMonotonic: Use switch statement
Use switch statement rather than if/else for better readability.
Process the most frequent cases first.
2017-04-14 07:21:49 -07:00
|
|
|
|
|
|
|
# A histogram with nonmonotonic bucket counts. This may happen when recording
|
|
|
|
# rule evaluation or federation races scrape ingestion, causing some bucket
|
2020-06-15 03:32:10 -07:00
|
|
|
# counts to be derived from fewer samples.
|
Force buckets in a histogram to be monotonic for quantile estimation (#2610)
* Force buckets in a histogram to be monotonic for quantile estimation
The assumption that bucket counts increase monotonically with increasing
upperBound may be violated during:
* Recording rule evaluation of histogram_quantile, especially when rate()
has been applied to the underlying bucket timeseries.
* Evaluation of histogram_quantile computed over federated bucket
timeseries, especially when rate() has been applied
This is because scraped data is not made available to RR evaluation or
federation atomically, so some buckets are computed with data from the N
most recent scrapes, but the other buckets are missing the most recent
observations.
Monotonicity is usually guaranteed because if a bucket with upper bound
u1 has count c1, then any bucket with a higher upper bound u > u1 must
have counted all c1 observations and perhaps more, so that c >= c1.
Randomly interspersed partial sampling breaks that guarantee, and rate()
exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from
4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The
monotonicity is broken. It is exacerbated by rate() because under normal
operation, cumulative counting of buckets will cause the bucket counts to
diverge such that small differences from missing samples are not a problem.
rate() removes this divergence.
bucketQuantile depends on that monotonicity to do a binary search for the
bucket with the qth percentile count, so breaking the monotonicity
guarantee causes bucketQuantile() to return undefined (nonsense) results.
As a somewhat hacky solution until the Prometheus project is ready to
accept the changes required to make scrapes atomic, we calculate the
"envelope" of the histogram buckets, essentially removing any decreases
in the count between successive buckets.
* Fix up comment docs for ensureMonotonic
* ensureMonotonic: Use switch statement
Use switch statement rather than if/else for better readability.
Process the most frequent cases first.
2017-04-14 07:21:49 -07:00
|
|
|
|
|
|
|
load 5m
|
2020-06-15 03:32:10 -07:00
|
|
|
nonmonotonic_bucket{le="0.1"} 0+2x10
|
|
|
|
nonmonotonic_bucket{le="1"} 0+1x10
|
|
|
|
nonmonotonic_bucket{le="10"} 0+5x10
|
|
|
|
nonmonotonic_bucket{le="100"} 0+4x10
|
Force buckets in a histogram to be monotonic for quantile estimation (#2610)
* Force buckets in a histogram to be monotonic for quantile estimation
The assumption that bucket counts increase monotonically with increasing
upperBound may be violated during:
* Recording rule evaluation of histogram_quantile, especially when rate()
has been applied to the underlying bucket timeseries.
* Evaluation of histogram_quantile computed over federated bucket
timeseries, especially when rate() has been applied
This is because scraped data is not made available to RR evaluation or
federation atomically, so some buckets are computed with data from the N
most recent scrapes, but the other buckets are missing the most recent
observations.
Monotonicity is usually guaranteed because if a bucket with upper bound
u1 has count c1, then any bucket with a higher upper bound u > u1 must
have counted all c1 observations and perhaps more, so that c >= c1.
Randomly interspersed partial sampling breaks that guarantee, and rate()
exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from
4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The
monotonicity is broken. It is exacerbated by rate() because under normal
operation, cumulative counting of buckets will cause the bucket counts to
diverge such that small differences from missing samples are not a problem.
rate() removes this divergence.
bucketQuantile depends on that monotonicity to do a binary search for the
bucket with the qth percentile count, so breaking the monotonicity
guarantee causes bucketQuantile() to return undefined (nonsense) results.
As a somewhat hacky solution until the Prometheus project is ready to
accept the changes required to make scrapes atomic, we calculate the
"envelope" of the histogram buckets, essentially removing any decreases
in the count between successive buckets.
* Fix up comment docs for ensureMonotonic
* ensureMonotonic: Use switch statement
Use switch statement rather than if/else for better readability.
Process the most frequent cases first.
2017-04-14 07:21:49 -07:00
|
|
|
nonmonotonic_bucket{le="1000"} 0+9x10
|
2020-06-15 03:32:10 -07:00
|
|
|
nonmonotonic_bucket{le="+Inf"} 0+8x10
|
Force buckets in a histogram to be monotonic for quantile estimation (#2610)
* Force buckets in a histogram to be monotonic for quantile estimation
The assumption that bucket counts increase monotonically with increasing
upperBound may be violated during:
* Recording rule evaluation of histogram_quantile, especially when rate()
has been applied to the underlying bucket timeseries.
* Evaluation of histogram_quantile computed over federated bucket
timeseries, especially when rate() has been applied
This is because scraped data is not made available to RR evaluation or
federation atomically, so some buckets are computed with data from the N
most recent scrapes, but the other buckets are missing the most recent
observations.
Monotonicity is usually guaranteed because if a bucket with upper bound
u1 has count c1, then any bucket with a higher upper bound u > u1 must
have counted all c1 observations and perhaps more, so that c >= c1.
Randomly interspersed partial sampling breaks that guarantee, and rate()
exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from
4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The
monotonicity is broken. It is exacerbated by rate() because under normal
operation, cumulative counting of buckets will cause the bucket counts to
diverge such that small differences from missing samples are not a problem.
rate() removes this divergence.
bucketQuantile depends on that monotonicity to do a binary search for the
bucket with the qth percentile count, so breaking the monotonicity
guarantee causes bucketQuantile() to return undefined (nonsense) results.
As a somewhat hacky solution until the Prometheus project is ready to
accept the changes required to make scrapes atomic, we calculate the
"envelope" of the histogram buckets, essentially removing any decreases
in the count between successive buckets.
* Fix up comment docs for ensureMonotonic
* ensureMonotonic: Use switch statement
Use switch statement rather than if/else for better readability.
Process the most frequent cases first.
2017-04-14 07:21:49 -07:00
|
|
|
|
|
|
|
# Nonmonotonic buckets
|
2020-06-15 03:32:10 -07:00
|
|
|
eval instant at 50m histogram_quantile(0.01, nonmonotonic_bucket)
|
|
|
|
{} 0.0045
|
|
|
|
|
|
|
|
eval instant at 50m histogram_quantile(0.5, nonmonotonic_bucket)
|
|
|
|
{} 8.5
|
|
|
|
|
Force buckets in a histogram to be monotonic for quantile estimation (#2610)
* Force buckets in a histogram to be monotonic for quantile estimation
The assumption that bucket counts increase monotonically with increasing
upperBound may be violated during:
* Recording rule evaluation of histogram_quantile, especially when rate()
has been applied to the underlying bucket timeseries.
* Evaluation of histogram_quantile computed over federated bucket
timeseries, especially when rate() has been applied
This is because scraped data is not made available to RR evaluation or
federation atomically, so some buckets are computed with data from the N
most recent scrapes, but the other buckets are missing the most recent
observations.
Monotonicity is usually guaranteed because if a bucket with upper bound
u1 has count c1, then any bucket with a higher upper bound u > u1 must
have counted all c1 observations and perhaps more, so that c >= c1.
Randomly interspersed partial sampling breaks that guarantee, and rate()
exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from
4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The
monotonicity is broken. It is exacerbated by rate() because under normal
operation, cumulative counting of buckets will cause the bucket counts to
diverge such that small differences from missing samples are not a problem.
rate() removes this divergence.
bucketQuantile depends on that monotonicity to do a binary search for the
bucket with the qth percentile count, so breaking the monotonicity
guarantee causes bucketQuantile() to return undefined (nonsense) results.
As a somewhat hacky solution until the Prometheus project is ready to
accept the changes required to make scrapes atomic, we calculate the
"envelope" of the histogram buckets, essentially removing any decreases
in the count between successive buckets.
* Fix up comment docs for ensureMonotonic
* ensureMonotonic: Use switch statement
Use switch statement rather than if/else for better readability.
Process the most frequent cases first.
2017-04-14 07:21:49 -07:00
|
|
|
eval instant at 50m histogram_quantile(0.99, nonmonotonic_bucket)
|
2020-06-15 03:32:10 -07:00
|
|
|
{} 979.75
|
2019-02-01 02:22:44 -08:00
|
|
|
|
|
|
|
# Buckets with different representations of the same upper bound.
|
|
|
|
eval instant at 50m histogram_quantile(0.5, rate(mixed_bucket[5m]))
|
|
|
|
{instance="ins1", job="job1"} 0.15
|
|
|
|
{instance="ins2", job="job1"} NaN
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval instant at 50m histogram_quantile(0.5, rate(mixed[5m]))
|
2019-02-01 02:22:44 -08:00
|
|
|
{instance="ins1", job="job1"} 0.2
|
|
|
|
{instance="ins2", job="job1"} NaN
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.75, rate(mixed_bucket[5m]))
|
2019-02-01 02:22:44 -08:00
|
|
|
{instance="ins1", job="job1"} 0.2
|
|
|
|
{instance="ins2", job="job1"} NaN
|
2020-06-01 01:40:39 -07:00
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(1, rate(mixed_bucket[5m]))
|
|
|
|
{instance="ins1", job="job1"} 0.2
|
|
|
|
{instance="ins2", job="job1"} NaN
|
|
|
|
|
|
|
|
load_with_nhcb 5m
|
2020-06-01 01:40:39 -07:00
|
|
|
empty_bucket{le="0.1", job="job1", instance="ins1"} 0x10
|
|
|
|
empty_bucket{le="0.2", job="job1", instance="ins1"} 0x10
|
|
|
|
empty_bucket{le="+Inf", job="job1", instance="ins1"} 0x10
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb instant at 50m histogram_quantile(0.2, rate(empty_bucket[5m]))
|
2021-01-20 02:57:39 -08:00
|
|
|
{instance="ins1", job="job1"} NaN
|
2022-01-07 12:31:37 -08:00
|
|
|
|
|
|
|
# Load a duplicate histogram with a different name to test the failure scenario of multiple histograms sharing the same label set. See:
|
|
|
|
# https://github.com/prometheus/prometheus/issues/9910
|
2024-04-24 00:36:05 -07:00
|
|
|
load_with_nhcb 5m
|
2022-01-07 12:31:37 -08:00
|
|
|
request_duration_seconds2_bucket{job="job1", instance="ins1", le="0.1"} 0+1x10
|
|
|
|
request_duration_seconds2_bucket{job="job1", instance="ins1", le="0.2"} 0+3x10
|
|
|
|
request_duration_seconds2_bucket{job="job1", instance="ins1", le="+Inf"} 0+4x10
|
|
|
|
|
2024-04-24 00:36:05 -07:00
|
|
|
eval_with_nhcb_fail instant at 50m histogram_quantile(0.99, {__name__=~"request_duration_seconds\\d*_bucket$"})
|