prometheus/promql/promqltest/testdata/histograms.test

559 lines
20 KiB
Plaintext
Raw Normal View History

# Two histograms with 4 buckets each (x_sum and x_count not included,
# only buckets). Lowest bucket for one histogram < 0, for the other >
# 0. They have the same name, just separated by label. Not useful in
# practice, but can happen (if clients change bucketing), and the
# server has to cope with it.
# Test histogram.
load_with_nhcb 5m
testhistogram_bucket{le="0.1", start="positive"} 0+5x10
testhistogram_bucket{le=".2", start="positive"} 0+7x10
testhistogram_bucket{le="1e0", start="positive"} 0+11x10
testhistogram_bucket{le="+Inf", start="positive"} 0+12x10
testhistogram_bucket{le="-.2", start="negative"} 0+1x10
testhistogram_bucket{le="-0.1", start="negative"} 0+2x10
testhistogram_bucket{le="0.3", start="negative"} 0+2x10
testhistogram_bucket{le="+Inf", start="negative"} 0+3x10
# Another test histogram, where q(1/6), q(1/2), and q(5/6) are each in
# the middle of a bucket and should therefore be 1, 3, and 5,
# respectively.
load_with_nhcb 5m
testhistogram2_bucket{le="0"} 0+0x10
testhistogram2_bucket{le="2"} 0+1x10
testhistogram2_bucket{le="4"} 0+2x10
testhistogram2_bucket{le="6"} 0+3x10
testhistogram2_bucket{le="+Inf"} 0+3x10
# Another test histogram, this time without any observations in the +Inf bucket.
# This enables a meaningful calculation of standard deviation and variance.
load_with_nhcb 5m
testhistogram3_bucket{le="0", start="positive"} 0+0x10
testhistogram3_bucket{le="0.1", start="positive"} 0+5x10
testhistogram3_bucket{le=".2", start="positive"} 0+7x10
testhistogram3_bucket{le="1e0", start="positive"} 0+11x10
testhistogram3_bucket{le="+Inf", start="positive"} 0+11x10
testhistogram3_sum{start="positive"} 0+33x10
testhistogram3_count{start="positive"} 0+11x10
testhistogram3_bucket{le="-.25", start="negative"} 0+0x10
testhistogram3_bucket{le="-.2", start="negative"} 0+1x10
testhistogram3_bucket{le="-0.1", start="negative"} 0+2x10
testhistogram3_bucket{le="0.3", start="negative"} 0+2x10
testhistogram3_bucket{le="+Inf", start="negative"} 0+2x10
testhistogram3_sum{start="negative"} 0+8x10
testhistogram3_count{start="negative"} 0+2x10
# Now a more realistic histogram per job and instance to test aggregation.
load_with_nhcb 5m
request_duration_seconds_bucket{job="job1", instance="ins1", le="0.1"} 0+1x10
request_duration_seconds_bucket{job="job1", instance="ins1", le="0.2"} 0+3x10
request_duration_seconds_bucket{job="job1", instance="ins1", le="+Inf"} 0+4x10
request_duration_seconds_bucket{job="job1", instance="ins2", le="0.1"} 0+2x10
request_duration_seconds_bucket{job="job1", instance="ins2", le="0.2"} 0+5x10
request_duration_seconds_bucket{job="job1", instance="ins2", le="+Inf"} 0+6x10
request_duration_seconds_bucket{job="job2", instance="ins1", le="0.1"} 0+3x10
request_duration_seconds_bucket{job="job2", instance="ins1", le="0.2"} 0+4x10
request_duration_seconds_bucket{job="job2", instance="ins1", le="+Inf"} 0+6x10
request_duration_seconds_bucket{job="job2", instance="ins2", le="0.1"} 0+4x10
request_duration_seconds_bucket{job="job2", instance="ins2", le="0.2"} 0+7x10
request_duration_seconds_bucket{job="job2", instance="ins2", le="+Inf"} 0+9x10
# Different le representations in one histogram.
load_with_nhcb 5m
mixed_bucket{job="job1", instance="ins1", le="0.1"} 0+1x10
mixed_bucket{job="job1", instance="ins1", le="0.2"} 0+1x10
mixed_bucket{job="job1", instance="ins1", le="2e-1"} 0+1x10
mixed_bucket{job="job1", instance="ins1", le="2.0e-1"} 0+1x10
mixed_bucket{job="job1", instance="ins1", le="+Inf"} 0+4x10
mixed_bucket{job="job1", instance="ins2", le="+inf"} 0+0x10
mixed_bucket{job="job1", instance="ins2", le="+Inf"} 0+0x10
# Test histogram_count.
eval instant at 50m histogram_count(testhistogram3)
{start="positive"} 110
{start="negative"} 20
# Classic way of accessing the count still works.
eval instant at 50m testhistogram3_count
testhistogram3_count{start="positive"} 110
testhistogram3_count{start="negative"} 20
# Test histogram_sum.
eval instant at 50m histogram_sum(testhistogram3)
{start="positive"} 330
{start="negative"} 80
# Classic way of accessing the sum still works.
eval instant at 50m testhistogram3_sum
testhistogram3_sum{start="positive"} 330
testhistogram3_sum{start="negative"} 80
# Test histogram_avg. This has no classic equivalent.
eval instant at 50m histogram_avg(testhistogram3)
{start="positive"} 3
{start="negative"} 4
# Test histogram_stddev. This has no classic equivalent.
eval instant at 50m histogram_stddev(testhistogram3)
{start="positive"} 2.8189265757336734
{start="negative"} 4.182715937754936
# Test histogram_stdvar. This has no classic equivalent.
eval instant at 50m histogram_stdvar(testhistogram3)
{start="positive"} 7.946347039377573
{start="negative"} 17.495112615949154
# Test histogram_fraction.
eval instant at 50m histogram_fraction(0, 0.2, testhistogram3)
{start="positive"} 0.6363636363636364
{start="negative"} 0
eval instant at 50m histogram_fraction(0, 0.2, rate(testhistogram3[10m]))
{start="positive"} 0.6363636363636364
{start="negative"} 0
# In the classic histogram, we can access the corresponding bucket (if
# it exists) and divide by the count to get the same result.
eval instant at 50m testhistogram3_bucket{le=".2"} / ignoring(le) testhistogram3_count
{start="positive"} 0.6363636363636364
eval instant at 50m rate(testhistogram3_bucket{le=".2"}[10m]) / ignoring(le) rate(testhistogram3_count[10m])
{start="positive"} 0.6363636363636364
# Test histogram_quantile, native and classic.
eval instant at 50m histogram_quantile(0, testhistogram3)
{start="positive"} 0
{start="negative"} -0.25
eval instant at 50m histogram_quantile(0, testhistogram3_bucket)
{start="positive"} 0
{start="negative"} -0.25
eval instant at 50m histogram_quantile(0.25, testhistogram3)
{start="positive"} 0.055
{start="negative"} -0.225
eval instant at 50m histogram_quantile(0.25, testhistogram3_bucket)
{start="positive"} 0.055
{start="negative"} -0.225
eval instant at 50m histogram_quantile(0.5, testhistogram3)
{start="positive"} 0.125
{start="negative"} -0.2
eval instant at 50m histogram_quantile(0.5, testhistogram3_bucket)
{start="positive"} 0.125
{start="negative"} -0.2
eval instant at 50m histogram_quantile(0.75, testhistogram3)
{start="positive"} 0.45
{start="negative"} -0.15
eval instant at 50m histogram_quantile(0.75, testhistogram3_bucket)
{start="positive"} 0.45
{start="negative"} -0.15
eval instant at 50m histogram_quantile(1, testhistogram3)
{start="positive"} 1
{start="negative"} -0.1
eval instant at 50m histogram_quantile(1, testhistogram3_bucket)
{start="positive"} 1
{start="negative"} -0.1
# Quantile too low.
eval_warn instant at 50m histogram_quantile(-0.1, testhistogram)
{start="positive"} -Inf
{start="negative"} -Inf
eval_warn instant at 50m histogram_quantile(-0.1, testhistogram_bucket)
{start="positive"} -Inf
{start="negative"} -Inf
# Quantile too high.
eval_warn instant at 50m histogram_quantile(1.01, testhistogram)
{start="positive"} +Inf
{start="negative"} +Inf
eval_warn instant at 50m histogram_quantile(1.01, testhistogram_bucket)
{start="positive"} +Inf
{start="negative"} +Inf
# Quantile invalid.
eval_warn instant at 50m histogram_quantile(NaN, testhistogram)
{start="positive"} NaN
{start="negative"} NaN
eval_warn instant at 50m histogram_quantile(NaN, testhistogram_bucket)
{start="positive"} NaN
{start="negative"} NaN
# Quantile value in lowest bucket.
eval instant at 50m histogram_quantile(0, testhistogram)
{start="positive"} 0
{start="negative"} -0.2
eval instant at 50m histogram_quantile(0, testhistogram_bucket)
{start="positive"} 0
{start="negative"} -0.2
# Quantile value in highest bucket.
eval instant at 50m histogram_quantile(1, testhistogram)
{start="positive"} 1
{start="negative"} 0.3
eval instant at 50m histogram_quantile(1, testhistogram_bucket)
{start="positive"} 1
{start="negative"} 0.3
# Finally some useful quantiles.
eval instant at 50m histogram_quantile(0.2, testhistogram)
{start="positive"} 0.048
{start="negative"} -0.2
eval instant at 50m histogram_quantile(0.2, testhistogram_bucket)
{start="positive"} 0.048
{start="negative"} -0.2
eval instant at 50m histogram_quantile(0.5, testhistogram)
{start="positive"} 0.15
{start="negative"} -0.15
eval instant at 50m histogram_quantile(0.5, testhistogram_bucket)
{start="positive"} 0.15
{start="negative"} -0.15
eval instant at 50m histogram_quantile(0.8, testhistogram)
{start="positive"} 0.72
{start="negative"} 0.3
eval instant at 50m histogram_quantile(0.8, testhistogram_bucket)
{start="positive"} 0.72
{start="negative"} 0.3
# More realistic with rates.
eval instant at 50m histogram_quantile(0.2, rate(testhistogram[10m]))
{start="positive"} 0.048
{start="negative"} -0.2
eval instant at 50m histogram_quantile(0.2, rate(testhistogram_bucket[10m]))
{start="positive"} 0.048
{start="negative"} -0.2
eval instant at 50m histogram_quantile(0.5, rate(testhistogram[10m]))
{start="positive"} 0.15
{start="negative"} -0.15
eval instant at 50m histogram_quantile(0.5, rate(testhistogram_bucket[10m]))
{start="positive"} 0.15
{start="negative"} -0.15
eval instant at 50m histogram_quantile(0.8, rate(testhistogram[10m]))
{start="positive"} 0.72
{start="negative"} 0.3
eval instant at 50m histogram_quantile(0.8, rate(testhistogram_bucket[10m]))
{start="positive"} 0.72
{start="negative"} 0.3
# Want results exactly in the middle of the bucket.
eval instant at 7m histogram_quantile(1./6., testhistogram2)
{} 1
eval instant at 7m histogram_quantile(1./6., testhistogram2_bucket)
{} 1
eval instant at 7m histogram_quantile(0.5, testhistogram2)
{} 3
eval instant at 7m histogram_quantile(0.5, testhistogram2_bucket)
{} 3
eval instant at 7m histogram_quantile(5./6., testhistogram2)
{} 5
eval instant at 7m histogram_quantile(5./6., testhistogram2_bucket)
{} 5
eval instant at 47m histogram_quantile(1./6., rate(testhistogram2[15m]))
{} 1
eval instant at 47m histogram_quantile(1./6., rate(testhistogram2_bucket[15m]))
{} 1
eval instant at 47m histogram_quantile(0.5, rate(testhistogram2[15m]))
{} 3
eval instant at 47m histogram_quantile(0.5, rate(testhistogram2_bucket[15m]))
{} 3
eval instant at 47m histogram_quantile(5./6., rate(testhistogram2[15m]))
{} 5
eval instant at 47m histogram_quantile(5./6., rate(testhistogram2_bucket[15m]))
{} 5
# Aggregated histogram: Everything in one. Note how native histograms
# don't require aggregation by le.
eval instant at 50m histogram_quantile(0.3, sum(rate(request_duration_seconds[10m])))
{} 0.075
eval instant at 50m histogram_quantile(0.3, sum(rate(request_duration_seconds_bucket[10m])) by (le))
{} 0.075
eval instant at 50m histogram_quantile(0.5, sum(rate(request_duration_seconds[10m])))
{} 0.1277777777777778
eval instant at 50m histogram_quantile(0.5, sum(rate(request_duration_seconds_bucket[10m])) by (le))
{} 0.1277777777777778
# Aggregated histogram: Everything in one. Now with avg, which does not change anything.
eval instant at 50m histogram_quantile(0.3, avg(rate(request_duration_seconds[10m])))
{} 0.075
eval instant at 50m histogram_quantile(0.3, avg(rate(request_duration_seconds_bucket[10m])) by (le))
{} 0.075
eval instant at 50m histogram_quantile(0.5, avg(rate(request_duration_seconds[10m])))
{} 0.12777777777777778
eval instant at 50m histogram_quantile(0.5, avg(rate(request_duration_seconds_bucket[10m])) by (le))
{} 0.12777777777777778
# Aggregated histogram: By instance.
eval instant at 50m histogram_quantile(0.3, sum(rate(request_duration_seconds[10m])) by (instance))
{instance="ins1"} 0.075
{instance="ins2"} 0.075
eval instant at 50m histogram_quantile(0.3, sum(rate(request_duration_seconds_bucket[10m])) by (le, instance))
{instance="ins1"} 0.075
{instance="ins2"} 0.075
eval instant at 50m histogram_quantile(0.5, sum(rate(request_duration_seconds[10m])) by (instance))
{instance="ins1"} 0.1333333333
{instance="ins2"} 0.125
eval instant at 50m histogram_quantile(0.5, sum(rate(request_duration_seconds_bucket[10m])) by (le, instance))
{instance="ins1"} 0.1333333333
{instance="ins2"} 0.125
# Aggregated histogram: By job.
eval instant at 50m histogram_quantile(0.3, sum(rate(request_duration_seconds[10m])) by (job))
{job="job1"} 0.1
{job="job2"} 0.0642857142857143
eval instant at 50m histogram_quantile(0.3, sum(rate(request_duration_seconds_bucket[10m])) by (le, job))
{job="job1"} 0.1
{job="job2"} 0.0642857142857143
eval instant at 50m histogram_quantile(0.5, sum(rate(request_duration_seconds[10m])) by (job))
{job="job1"} 0.14
{job="job2"} 0.1125
eval instant at 50m histogram_quantile(0.5, sum(rate(request_duration_seconds_bucket[10m])) by (le, job))
{job="job1"} 0.14
{job="job2"} 0.1125
# Aggregated histogram: By job and instance.
eval instant at 50m histogram_quantile(0.3, sum(rate(request_duration_seconds[10m])) by (job, instance))
{instance="ins1", job="job1"} 0.11
{instance="ins2", job="job1"} 0.09
{instance="ins1", job="job2"} 0.06
{instance="ins2", job="job2"} 0.0675
eval instant at 50m histogram_quantile(0.3, sum(rate(request_duration_seconds_bucket[10m])) by (le, job, instance))
{instance="ins1", job="job1"} 0.11
{instance="ins2", job="job1"} 0.09
{instance="ins1", job="job2"} 0.06
{instance="ins2", job="job2"} 0.0675
eval instant at 50m histogram_quantile(0.5, sum(rate(request_duration_seconds[10m])) by (job, instance))
{instance="ins1", job="job1"} 0.15
{instance="ins2", job="job1"} 0.1333333333333333
{instance="ins1", job="job2"} 0.1
{instance="ins2", job="job2"} 0.1166666666666667
eval instant at 50m histogram_quantile(0.5, sum(rate(request_duration_seconds_bucket[10m])) by (le, job, instance))
{instance="ins1", job="job1"} 0.15
{instance="ins2", job="job1"} 0.1333333333333333
{instance="ins1", job="job2"} 0.1
{instance="ins2", job="job2"} 0.1166666666666667
# The unaggregated histogram for comparison. Same result as the previous one.
eval instant at 50m histogram_quantile(0.3, rate(request_duration_seconds[10m]))
{instance="ins1", job="job1"} 0.11
{instance="ins2", job="job1"} 0.09
{instance="ins1", job="job2"} 0.06
{instance="ins2", job="job2"} 0.0675
eval instant at 50m histogram_quantile(0.3, rate(request_duration_seconds_bucket[10m]))
{instance="ins1", job="job1"} 0.11
{instance="ins2", job="job1"} 0.09
{instance="ins1", job="job2"} 0.06
{instance="ins2", job="job2"} 0.0675
eval instant at 50m histogram_quantile(0.5, rate(request_duration_seconds[10m]))
{instance="ins1", job="job1"} 0.15
{instance="ins2", job="job1"} 0.13333333333333333
{instance="ins1", job="job2"} 0.1
{instance="ins2", job="job2"} 0.11666666666666667
eval instant at 50m histogram_quantile(0.5, rate(request_duration_seconds_bucket[10m]))
{instance="ins1", job="job1"} 0.15
{instance="ins2", job="job1"} 0.13333333333333333
{instance="ins1", job="job2"} 0.1
{instance="ins2", job="job2"} 0.11666666666666667
Force buckets in a histogram to be monotonic for quantile estimation (#2610) * Force buckets in a histogram to be monotonic for quantile estimation The assumption that bucket counts increase monotonically with increasing upperBound may be violated during: * Recording rule evaluation of histogram_quantile, especially when rate() has been applied to the underlying bucket timeseries. * Evaluation of histogram_quantile computed over federated bucket timeseries, especially when rate() has been applied This is because scraped data is not made available to RR evalution or federation atomically, so some buckets are computed with data from the N most recent scrapes, but the other buckets are missing the most recent observations. Monotonicity is usually guaranteed because if a bucket with upper bound u1 has count c1, then any bucket with a higher upper bound u > u1 must have counted all c1 observations and perhaps more, so that c >= c1. Randomly interspersed partial sampling breaks that guarantee, and rate() exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from 4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The monotonicity is broken. It is exacerbated by rate() because under normal operation, cumulative counting of buckets will cause the bucket counts to diverge such that small differences from missing samples are not a problem. rate() removes this divergence.) bucketQuantile depends on that monotonicity to do a binary search for the bucket with the qth percentile count, so breaking the monotonicity guarantee causes bucketQuantile() to return undefined (nonsense) results. As a somewhat hacky solution until the Prometheus project is ready to accept the changes required to make scrapes atomic, we calculate the "envelope" of the histogram buckets, essentially removing any decreases in the count between successive buckets. * Fix up comment docs for ensureMonotonic * ensureMonotonic: Use switch statement Use switch statement rather than if/else for better readability. Process the most frequent cases first.
2017-04-14 07:21:49 -07:00
# All NHCBs summed into one.
eval instant at 50m sum(request_duration_seconds)
{} {{schema:-53 count:250 custom_values:[0.1 0.2] buckets:[100 90 60]}}
eval instant at 50m sum(request_duration_seconds{job="job1",instance="ins1"} + ignoring(job,instance) request_duration_seconds{job="job1",instance="ins2"} + ignoring(job,instance) request_duration_seconds{job="job2",instance="ins1"} + ignoring(job,instance) request_duration_seconds{job="job2",instance="ins2"})
{} {{schema:-53 count:250 custom_values:[0.1 0.2] buckets:[100 90 60]}}
eval instant at 50m avg(request_duration_seconds)
{} {{schema:-53 count:62.5 custom_values:[0.1 0.2] buckets:[25 22.5 15]}}
# To verify the result above, calculate from classic histogram as well.
eval instant at 50m avg (request_duration_seconds_bucket{le="0.1"})
{} 25
eval instant at 50m avg (request_duration_seconds_bucket{le="0.2"}) - avg (request_duration_seconds_bucket{le="0.1"})
{} 22.5
eval instant at 50m avg (request_duration_seconds_bucket{le="+Inf"}) - avg (request_duration_seconds_bucket{le="0.2"})
{} 15
eval instant at 50m count(request_duration_seconds)
{} 4
Force buckets in a histogram to be monotonic for quantile estimation (#2610) * Force buckets in a histogram to be monotonic for quantile estimation The assumption that bucket counts increase monotonically with increasing upperBound may be violated during: * Recording rule evaluation of histogram_quantile, especially when rate() has been applied to the underlying bucket timeseries. * Evaluation of histogram_quantile computed over federated bucket timeseries, especially when rate() has been applied This is because scraped data is not made available to RR evalution or federation atomically, so some buckets are computed with data from the N most recent scrapes, but the other buckets are missing the most recent observations. Monotonicity is usually guaranteed because if a bucket with upper bound u1 has count c1, then any bucket with a higher upper bound u > u1 must have counted all c1 observations and perhaps more, so that c >= c1. Randomly interspersed partial sampling breaks that guarantee, and rate() exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from 4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The monotonicity is broken. It is exacerbated by rate() because under normal operation, cumulative counting of buckets will cause the bucket counts to diverge such that small differences from missing samples are not a problem. rate() removes this divergence.) bucketQuantile depends on that monotonicity to do a binary search for the bucket with the qth percentile count, so breaking the monotonicity guarantee causes bucketQuantile() to return undefined (nonsense) results. As a somewhat hacky solution until the Prometheus project is ready to accept the changes required to make scrapes atomic, we calculate the "envelope" of the histogram buckets, essentially removing any decreases in the count between successive buckets. * Fix up comment docs for ensureMonotonic * ensureMonotonic: Use switch statement Use switch statement rather than if/else for better readability. Process the most frequent cases first.
2017-04-14 07:21:49 -07:00
# A histogram with nonmonotonic bucket counts. This may happen when recording
# rule evaluation or federation races scrape ingestion, causing some buckets
# counts to be derived from fewer samples.
Force buckets in a histogram to be monotonic for quantile estimation (#2610) * Force buckets in a histogram to be monotonic for quantile estimation The assumption that bucket counts increase monotonically with increasing upperBound may be violated during: * Recording rule evaluation of histogram_quantile, especially when rate() has been applied to the underlying bucket timeseries. * Evaluation of histogram_quantile computed over federated bucket timeseries, especially when rate() has been applied This is because scraped data is not made available to RR evalution or federation atomically, so some buckets are computed with data from the N most recent scrapes, but the other buckets are missing the most recent observations. Monotonicity is usually guaranteed because if a bucket with upper bound u1 has count c1, then any bucket with a higher upper bound u > u1 must have counted all c1 observations and perhaps more, so that c >= c1. Randomly interspersed partial sampling breaks that guarantee, and rate() exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from 4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The monotonicity is broken. It is exacerbated by rate() because under normal operation, cumulative counting of buckets will cause the bucket counts to diverge such that small differences from missing samples are not a problem. rate() removes this divergence.) bucketQuantile depends on that monotonicity to do a binary search for the bucket with the qth percentile count, so breaking the monotonicity guarantee causes bucketQuantile() to return undefined (nonsense) results. As a somewhat hacky solution until the Prometheus project is ready to accept the changes required to make scrapes atomic, we calculate the "envelope" of the histogram buckets, essentially removing any decreases in the count between successive buckets. * Fix up comment docs for ensureMonotonic * ensureMonotonic: Use switch statement Use switch statement rather than if/else for better readability. Process the most frequent cases first.
2017-04-14 07:21:49 -07:00
load 5m
nonmonotonic_bucket{le="0.1"} 0+2x10
nonmonotonic_bucket{le="1"} 0+1x10
nonmonotonic_bucket{le="10"} 0+5x10
nonmonotonic_bucket{le="100"} 0+4x10
Force buckets in a histogram to be monotonic for quantile estimation (#2610) * Force buckets in a histogram to be monotonic for quantile estimation The assumption that bucket counts increase monotonically with increasing upperBound may be violated during: * Recording rule evaluation of histogram_quantile, especially when rate() has been applied to the underlying bucket timeseries. * Evaluation of histogram_quantile computed over federated bucket timeseries, especially when rate() has been applied This is because scraped data is not made available to RR evalution or federation atomically, so some buckets are computed with data from the N most recent scrapes, but the other buckets are missing the most recent observations. Monotonicity is usually guaranteed because if a bucket with upper bound u1 has count c1, then any bucket with a higher upper bound u > u1 must have counted all c1 observations and perhaps more, so that c >= c1. Randomly interspersed partial sampling breaks that guarantee, and rate() exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from 4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The monotonicity is broken. It is exacerbated by rate() because under normal operation, cumulative counting of buckets will cause the bucket counts to diverge such that small differences from missing samples are not a problem. rate() removes this divergence.) bucketQuantile depends on that monotonicity to do a binary search for the bucket with the qth percentile count, so breaking the monotonicity guarantee causes bucketQuantile() to return undefined (nonsense) results. As a somewhat hacky solution until the Prometheus project is ready to accept the changes required to make scrapes atomic, we calculate the "envelope" of the histogram buckets, essentially removing any decreases in the count between successive buckets. * Fix up comment docs for ensureMonotonic * ensureMonotonic: Use switch statement Use switch statement rather than if/else for better readability. Process the most frequent cases first.
2017-04-14 07:21:49 -07:00
nonmonotonic_bucket{le="1000"} 0+9x10
nonmonotonic_bucket{le="+Inf"} 0+8x10
Force buckets in a histogram to be monotonic for quantile estimation (#2610) * Force buckets in a histogram to be monotonic for quantile estimation The assumption that bucket counts increase monotonically with increasing upperBound may be violated during: * Recording rule evaluation of histogram_quantile, especially when rate() has been applied to the underlying bucket timeseries. * Evaluation of histogram_quantile computed over federated bucket timeseries, especially when rate() has been applied This is because scraped data is not made available to RR evalution or federation atomically, so some buckets are computed with data from the N most recent scrapes, but the other buckets are missing the most recent observations. Monotonicity is usually guaranteed because if a bucket with upper bound u1 has count c1, then any bucket with a higher upper bound u > u1 must have counted all c1 observations and perhaps more, so that c >= c1. Randomly interspersed partial sampling breaks that guarantee, and rate() exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from 4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The monotonicity is broken. It is exacerbated by rate() because under normal operation, cumulative counting of buckets will cause the bucket counts to diverge such that small differences from missing samples are not a problem. rate() removes this divergence.) bucketQuantile depends on that monotonicity to do a binary search for the bucket with the qth percentile count, so breaking the monotonicity guarantee causes bucketQuantile() to return undefined (nonsense) results. As a somewhat hacky solution until the Prometheus project is ready to accept the changes required to make scrapes atomic, we calculate the "envelope" of the histogram buckets, essentially removing any decreases in the count between successive buckets. * Fix up comment docs for ensureMonotonic * ensureMonotonic: Use switch statement Use switch statement rather than if/else for better readability. Process the most frequent cases first.
2017-04-14 07:21:49 -07:00
# Nonmonotonic buckets, triggering an info annotation.
eval_info instant at 50m histogram_quantile(0.01, nonmonotonic_bucket)
{} 0.0045
eval_info instant at 50m histogram_quantile(0.5, nonmonotonic_bucket)
{} 8.5
eval_info instant at 50m histogram_quantile(0.99, nonmonotonic_bucket)
{} 979.75
# Buckets with different representations of the same upper bound.
eval instant at 50m histogram_quantile(0.5, rate(mixed_bucket[10m]))
{instance="ins1", job="job1"} 0.15
{instance="ins2", job="job1"} NaN
eval instant at 50m histogram_quantile(0.5, rate(mixed[10m]))
{instance="ins1", job="job1"} 0.2
{instance="ins2", job="job1"} NaN
eval instant at 50m histogram_quantile(0.75, rate(mixed_bucket[10m]))
{instance="ins1", job="job1"} 0.2
{instance="ins2", job="job1"} NaN
eval instant at 50m histogram_quantile(1, rate(mixed_bucket[10m]))
{instance="ins1", job="job1"} 0.2
{instance="ins2", job="job1"} NaN
load_with_nhcb 5m
empty_bucket{le="0.1", job="job1", instance="ins1"} 0x10
empty_bucket{le="0.2", job="job1", instance="ins1"} 0x10
empty_bucket{le="+Inf", job="job1", instance="ins1"} 0x10
eval instant at 50m histogram_quantile(0.2, rate(empty_bucket[10m]))
{instance="ins1", job="job1"} NaN
# Load a duplicate histogram with a different name to test failure scenario on multiple histograms with the same label set.
# https://github.com/prometheus/prometheus/issues/9910
load_with_nhcb 5m
request_duration_seconds2_bucket{job="job1", instance="ins1", le="0.1"} 0+1x10
request_duration_seconds2_bucket{job="job1", instance="ins1", le="0.2"} 0+3x10
request_duration_seconds2_bucket{job="job1", instance="ins1", le="+Inf"} 0+4x10
eval_fail instant at 50m histogram_quantile(0.99, {__name__=~"request_duration_seconds\\d*_bucket"})
eval_fail instant at 50m histogram_quantile(0.99, {__name__=~"request_duration_seconds\\d*"})
# Histogram with constant buckets.
load_with_nhcb 1m
const_histogram_bucket{le="0.0"} 1 1 1 1 1
const_histogram_bucket{le="1.0"} 1 1 1 1 1
const_histogram_bucket{le="2.0"} 1 1 1 1 1
const_histogram_bucket{le="+Inf"} 1 1 1 1 1
# There is no change to the bucket count over time, thus rate is 0 in each bucket.
eval instant at 5m rate(const_histogram_bucket[5m])
{le="0.0"} 0
{le="1.0"} 0
{le="2.0"} 0
{le="+Inf"} 0
# Native histograms do not represent empty buckets, so here the zeros are implicit.
eval instant at 5m rate(const_histogram[5m])
{} {{schema:-53 sum:0 count:0 custom_values:[0.0 1.0 2.0]}}
# Zero buckets mean no observations, so there is no value that observations fall below,
# which means that any quantile is a NaN.
eval instant at 5m histogram_quantile(1.0, sum by (le) (rate(const_histogram_bucket[5m])))
{} NaN
eval instant at 5m histogram_quantile(1.0, sum(rate(const_histogram[5m])))
{} NaN
load_with_nhcb 1m
histogram_over_time_bucket{le="0"} 0 1 3 9
histogram_over_time_bucket{le="1"} 2 3 3 9
histogram_over_time_bucket{le="2"} 3 8 5 10
histogram_over_time_bucket{le="4"} 3 10 6 18
# Test custom buckets with sum_over_time, avg_over_time.
eval instant at 3m sum_over_time(histogram_over_time[4m:1m])
{} {{schema:-53 count:37 custom_values:[0 1 2 4] buckets:[13 4 9 11]}}
eval instant at 3m avg_over_time(histogram_over_time[4m:1m])
{} {{schema:-53 count:9.25 custom_values:[0 1 2 4] buckets:[3.25 1 2.25 2.75]}}
# Test custom buckets with counter reset
load_with_nhcb 5m
histogram_with_reset_bucket{le="1"} 1 3 9
histogram_with_reset_bucket{le="2"} 3 3 9
histogram_with_reset_bucket{le="4"} 8 5 12
histogram_with_reset_bucket{le="8"} 10 6 18
histogram_with_reset_sum{} 36 16 61
eval instant at 10m increase(histogram_with_reset[15m])
{} {{schema:-53 count:27 sum:91.5 custom_values:[1 2 4 8] counter_reset_hint:gauge buckets:[13.5 0 4.5 9]}}
eval instant at 10m resets(histogram_with_reset[15m])
{} 1
eval instant at 10m histogram_count(increase(histogram_with_reset[15m]))
{} 27
eval instant at 10m histogram_sum(increase(histogram_with_reset[15m]))
{} 91.5