diff --git a/docs/querying/functions.md b/docs/querying/functions.md index 9d73b5344..b6f218180 100644 --- a/docs/querying/functions.md +++ b/docs/querying/functions.md @@ -326,45 +326,70 @@ With native histograms, aggregating everything works as usual without any `by` c histogram_quantile(0.9, sum(rate(http_request_duration_seconds[10m]))) -The `histogram_quantile()` function interpolates quantile values by -assuming a linear distribution within a bucket. +In the (common) case that a quantile value does not coincide with a bucket +boundary, the `histogram_quantile()` function interpolates the quantile value +within the bucket the quantile value falls into. For classic histograms, for +native histograms with custom bucket boundaries, and for the zero bucket of +other native histograms, it assumes a uniform distribution of observations +within the bucket (also called _linear interpolation_). For the +non-zero-buckets of native histograms with a standard exponential bucketing +schema, the interpolation is done under the assumption that the samples within +the bucket are distributed in a way that they would uniformly populate the +buckets in a hypothetical histogram with higher resolution. (This is also +called _exponential interpolation_.) If `b` has 0 observations, `NaN` is returned. For φ < 0, `-Inf` is returned. For φ > 1, `+Inf` is returned. For φ = `NaN`, `NaN` is returned. -The following is only relevant for classic histograms: If `b` contains -fewer than two buckets, `NaN` is returned. The highest bucket must have an -upper bound of `+Inf`. (Otherwise, `NaN` is returned.) If a quantile is located -in the highest bucket, the upper bound of the second highest bucket is -returned. A lower limit of the lowest bucket is assumed to be 0 if the upper -bound of that bucket is greater than -0. In that case, the usual linear interpolation is applied within that -bucket. Otherwise, the upper bound of the lowest bucket is returned for -quantiles located in the lowest bucket. +Special cases for classic histograms: -You can use `histogram_quantile(0, v instant-vector)` to get the estimated minimum value stored in -a histogram. +* If `b` contains fewer than two buckets, `NaN` is returned. +* The highest bucket must have an upper bound of `+Inf`. (Otherwise, `NaN` is + returned.) +* If a quantile is located in the highest bucket, the upper bound of the second + highest bucket is returned. +* The lower limit of the lowest bucket is assumed to be 0 if the upper bound of + that bucket is greater than 0. In that case, the usual linear interpolation + is applied within that bucket. Otherwise, the upper bound of the lowest + bucket is returned for quantiles located in the lowest bucket. -You can use `histogram_quantile(1, v instant-vector)` to get the estimated maximum value stored in -a histogram. +Special cases for native histograms (relevant for the exact interpolation +happening within the zero bucket): -Buckets of classic histograms are cumulative. Therefore, the following should always be the case: +* A zero bucket with finite width is assumed to contain no negative + observations if the histogram has observations in positive buckets, but none + in negative buckets. +* A zero bucket with finite width is assumed to contain no positive + observations if the histogram has observations in negative buckets, but none + in positive buckets. -* The counts in the buckets are monotonically increasing (strictly non-decreasing). -* A lack of observations between the upper limits of two consecutive buckets results in equal counts -in those two buckets. +You can use `histogram_quantile(0, v instant-vector)` to get the estimated +minimum value stored in a histogram. -However, floating point precision issues (e.g. small discrepancies introduced by computing of buckets -with `sum(rate(...))`) or invalid data might violate these assumptions. In that case, -`histogram_quantile` would be unable to return meaningful results. To mitigate the issue, -`histogram_quantile` assumes that tiny relative differences between consecutive buckets are happening -because of floating point precision errors and ignores them. (The threshold to ignore a difference -between two buckets is a trillionth (1e-12) of the sum of both buckets.) Furthermore, if there are -non-monotonic bucket counts even after this adjustment, they are increased to the value of the -previous buckets to enforce monotonicity. The latter is evidence for an actual issue with the input -data and is therefore flagged with an informational annotation reading `input to histogram_quantile -needed to be fixed for monotonicity`. If you encounter this annotation, you should find and remove -the source of the invalid data. +You can use `histogram_quantile(1, v instant-vector)` to get the estimated +maximum value stored in a histogram. + +Buckets of classic histograms are cumulative. Therefore, the following should +always be the case: + +* The counts in the buckets are monotonically increasing (strictly + non-decreasing). +* A lack of observations between the upper limits of two consecutive buckets + results in equal counts in those two buckets. + +However, floating point precision issues (e.g. small discrepancies introduced +by computing of buckets with `sum(rate(...))`) or invalid data might violate +these assumptions. In that case, `histogram_quantile` would be unable to return +meaningful results. To mitigate the issue, `histogram_quantile` assumes that +tiny relative differences between consecutive buckets are happening because of +floating point precision errors and ignores them. (The threshold to ignore a +difference between two buckets is a trillionth (1e-12) of the sum of both +buckets.) Furthermore, if there are non-monotonic bucket counts even after this +adjustment, they are increased to the value of the previous buckets to enforce +monotonicity. The latter is evidence for an actual issue with the input data +and is therefore flagged with an informational annotation reading `input to +histogram_quantile needed to be fixed for monotonicity`. If you encounter this +annotation, you should find and remove the source of the invalid data. ## `histogram_stddev()` and `histogram_stdvar()` diff --git a/promql/promqltest/testdata/native_histograms.test b/promql/promqltest/testdata/native_histograms.test index ee521f9c3..ca4993660 100644 --- a/promql/promqltest/testdata/native_histograms.test +++ b/promql/promqltest/testdata/native_histograms.test @@ -46,9 +46,12 @@ eval instant at 1m histogram_fraction(1, 2, single_histogram) eval instant at 1m histogram_fraction(0, 8, single_histogram) {} 1 -# Median is 1.5 due to linear estimation of the midpoint of the middle bucket, whose values are within range 1 < x <= 2. +# Median is 1.414213562373095 (2**2**-1, or sqrt(2)) due to +# exponential interpolation, i.e. the "midpoint" within range 1 < x <= +# 2 is assumed where the bucket boundary would be if we increased the +# resolution of the histogram by one step. eval instant at 1m histogram_quantile(0.5, single_histogram) - {} 1.5 + {} 1.414213562373095 clear @@ -68,8 +71,9 @@ eval instant at 5m histogram_avg(multi_histogram) eval instant at 5m histogram_fraction(1, 2, multi_histogram) {} 0.5 +# See explanation for exponential interpolation above. eval instant at 5m histogram_quantile(0.5, multi_histogram) - {} 1.5 + {} 1.414213562373095 # Each entry should look the same as the first. @@ -85,8 +89,9 @@ eval instant at 50m histogram_avg(multi_histogram) eval instant at 50m histogram_fraction(1, 2, multi_histogram) {} 0.5 +# See explanation for exponential interpolation above. eval instant at 50m histogram_quantile(0.5, multi_histogram) - {} 1.5 + {} 1.414213562373095 clear @@ -109,8 +114,9 @@ eval instant at 5m histogram_avg(incr_histogram) eval instant at 5m histogram_fraction(1, 2, incr_histogram) {} 0.6 +# See explanation for exponential interpolation above. eval instant at 5m histogram_quantile(0.5, incr_histogram) - {} 1.5 + {} 1.414213562373095 eval instant at 50m incr_histogram @@ -129,16 +135,18 @@ eval instant at 50m histogram_avg(incr_histogram) eval instant at 50m histogram_fraction(1, 2, incr_histogram) {} 0.8571428571428571 +# See explanation for exponential interpolation above. eval instant at 50m histogram_quantile(0.5, incr_histogram) - {} 1.5 + {} 1.414213562373095 # Per-second average rate of increase should be 1/(5*60) for count and buckets, then 2/(5*60) for sum. eval instant at 50m rate(incr_histogram[10m]) {} {{count:0.0033333333333333335 sum:0.006666666666666667 offset:1 buckets:[0.0033333333333333335]}} # Calculate the 50th percentile of observations over the last 10m. +# See explanation for exponential interpolation above. eval instant at 50m histogram_quantile(0.5, rate(incr_histogram[10m])) - {} 1.5 + {} 1.414213562373095 clear @@ -211,8 +219,9 @@ eval instant at 1m histogram_avg(negative_histogram) eval instant at 1m histogram_fraction(-2, -1, negative_histogram) {} 0.5 +# Exponential interpolation works the same as for positive buckets, just mirrored. eval instant at 1m histogram_quantile(0.5, negative_histogram) - {} -1.5 + {} -1.414213562373095 clear @@ -233,8 +242,9 @@ eval instant at 5m histogram_avg(two_samples_histogram) eval instant at 5m histogram_fraction(-2, -1, two_samples_histogram) {} 0.5 +# See explanation for exponential interpolation above. eval instant at 5m histogram_quantile(0.5, two_samples_histogram) - {} -1.5 + {} -1.414213562373095 clear @@ -392,20 +402,24 @@ eval_warn instant at 10m histogram_quantile(1.001, histogram_quantile_1) eval instant at 10m histogram_quantile(1, histogram_quantile_1) {} 16 +# The following quantiles are within a bucket. Exponential +# interpolation is applied (rather than linear, as it is done for +# classic histograms), leading to slightly different quantile values. eval instant at 10m histogram_quantile(0.99, histogram_quantile_1) - {} 15.759999999999998 + {} 15.67072476139083 eval instant at 10m histogram_quantile(0.9, histogram_quantile_1) - {} 13.600000000000001 + {} 12.99603834169977 eval instant at 10m histogram_quantile(0.6, histogram_quantile_1) - {} 4.799999999999997 + {} 4.594793419988138 eval instant at 10m histogram_quantile(0.5, histogram_quantile_1) - {} 1.6666666666666665 + {} 1.5874010519681994 +# Linear interpolation within the zero bucket after all. eval instant at 10m histogram_quantile(0.1, histogram_quantile_1) - {} 0.0006000000000000001 + {} 0.0006 eval instant at 10m histogram_quantile(0, histogram_quantile_1) {} 0 @@ -425,17 +439,20 @@ eval_warn instant at 10m histogram_quantile(1.001, histogram_quantile_2) eval instant at 10m histogram_quantile(1, histogram_quantile_2) {} 0 +# Again, the quantile values here are slightly different from what +# they would be with linear interpolation. Note that quantiles +# ending up in the zero bucket are linearly interpolated after all. eval instant at 10m histogram_quantile(0.99, histogram_quantile_2) - {} -6.000000000000048e-05 + {} -0.00006 eval instant at 10m histogram_quantile(0.9, histogram_quantile_2) - {} -0.0005999999999999996 + {} -0.0006 eval instant at 10m histogram_quantile(0.5, histogram_quantile_2) - {} -1.6666666666666667 + {} -1.5874010519681996 eval instant at 10m histogram_quantile(0.1, histogram_quantile_2) - {} -13.6 + {} -12.996038341699768 eval instant at 10m histogram_quantile(0, histogram_quantile_2) {} -16 @@ -445,7 +462,9 @@ eval_warn instant at 10m histogram_quantile(-1, histogram_quantile_2) clear -# Apply quantile function to histogram with both positive and negative buckets with zero bucket. +# Apply quantile function to histogram with both positive and negative +# buckets with zero bucket. +# First positive buckets with exponential interpolation. load 10m histogram_quantile_3 {{schema:0 count:24 sum:100 z_bucket:4 z_bucket_w:0.001 buckets:[2 3 0 1 4] n_buckets:[2 3 0 1 4]}}x1 @@ -456,31 +475,34 @@ eval instant at 10m histogram_quantile(1, histogram_quantile_3) {} 16 eval instant at 10m histogram_quantile(0.99, histogram_quantile_3) - {} 15.519999999999996 + {} 15.34822590920423 eval instant at 10m histogram_quantile(0.9, histogram_quantile_3) - {} 11.200000000000003 + {} 10.556063286183155 eval instant at 10m histogram_quantile(0.7, histogram_quantile_3) - {} 1.2666666666666657 + {} 1.2030250360821164 +# Linear interpolation in the zero bucket, symmetrically centered around +# the zero point. eval instant at 10m histogram_quantile(0.55, histogram_quantile_3) - {} 0.0006000000000000005 + {} 0.0006 eval instant at 10m histogram_quantile(0.5, histogram_quantile_3) {} 0 eval instant at 10m histogram_quantile(0.45, histogram_quantile_3) - {} -0.0005999999999999996 + {} -0.0006 +# Finally negative buckets with mirrored exponential interpolation. eval instant at 10m histogram_quantile(0.3, histogram_quantile_3) - {} -1.266666666666667 + {} -1.2030250360821169 eval instant at 10m histogram_quantile(0.1, histogram_quantile_3) - {} -11.2 + {} -10.556063286183155 eval instant at 10m histogram_quantile(0.01, histogram_quantile_3) - {} -15.52 + {} -15.34822590920423 eval instant at 10m histogram_quantile(0, histogram_quantile_3) {} -16 @@ -490,6 +512,90 @@ eval_warn instant at 10m histogram_quantile(-1, histogram_quantile_3) clear +# Try different schemas. (The interpolation logic must not depend on the schema.) +clear +load 1m + var_res_histogram{schema="-1"} {{schema:-1 sum:6 count:5 buckets:[0 5]}} + var_res_histogram{schema="0"} {{schema:0 sum:4 count:5 buckets:[0 5]}} + var_res_histogram{schema="+1"} {{schema:1 sum:4 count:5 buckets:[0 5]}} + +eval instant at 1m histogram_quantile(0.5, var_res_histogram) + {schema="-1"} 2.0 + {schema="0"} 1.4142135623730951 + {schema="+1"} 1.189207 + +eval instant at 1m histogram_fraction(0, 2, var_res_histogram{schema="-1"}) + {schema="-1"} 0.5 + +eval instant at 1m histogram_fraction(0, 1.4142135623730951, var_res_histogram{schema="0"}) + {schema="0"} 0.5 + +eval instant at 1m histogram_fraction(0, 1.189207, var_res_histogram{schema="+1"}) + {schema="+1"} 0.5 + +# The same as above, but one bucket "further to the right". +clear +load 1m + var_res_histogram{schema="-1"} {{schema:-1 sum:6 count:5 buckets:[0 0 5]}} + var_res_histogram{schema="0"} {{schema:0 sum:4 count:5 buckets:[0 0 5]}} + var_res_histogram{schema="+1"} {{schema:1 sum:4 count:5 buckets:[0 0 5]}} + +eval instant at 1m histogram_quantile(0.5, var_res_histogram) + {schema="-1"} 8.0 + {schema="0"} 2.82842712474619 + {schema="+1"} 1.6817928305074292 + +eval instant at 1m histogram_fraction(0, 8, var_res_histogram{schema="-1"}) + {schema="-1"} 0.5 + +eval instant at 1m histogram_fraction(0, 2.82842712474619, var_res_histogram{schema="0"}) + {schema="0"} 0.5 + +eval instant at 1m histogram_fraction(0, 1.6817928305074292, var_res_histogram{schema="+1"}) + {schema="+1"} 0.5 + +# And everything again but for negative buckets. +clear +load 1m + var_res_histogram{schema="-1"} {{schema:-1 sum:6 count:5 n_buckets:[0 5]}} + var_res_histogram{schema="0"} {{schema:0 sum:4 count:5 n_buckets:[0 5]}} + var_res_histogram{schema="+1"} {{schema:1 sum:4 count:5 n_buckets:[0 5]}} + +eval instant at 1m histogram_quantile(0.5, var_res_histogram) + {schema="-1"} -2.0 + {schema="0"} -1.4142135623730951 + {schema="+1"} -1.189207 + +eval instant at 1m histogram_fraction(-2, 0, var_res_histogram{schema="-1"}) + {schema="-1"} 0.5 + +eval instant at 1m histogram_fraction(-1.4142135623730951, 0, var_res_histogram{schema="0"}) + {schema="0"} 0.5 + +eval instant at 1m histogram_fraction(-1.189207, 0, var_res_histogram{schema="+1"}) + {schema="+1"} 0.5 + +clear +load 1m + var_res_histogram{schema="-1"} {{schema:-1 sum:6 count:5 n_buckets:[0 0 5]}} + var_res_histogram{schema="0"} {{schema:0 sum:4 count:5 n_buckets:[0 0 5]}} + var_res_histogram{schema="+1"} {{schema:1 sum:4 count:5 n_buckets:[0 0 5]}} + +eval instant at 1m histogram_quantile(0.5, var_res_histogram) + {schema="-1"} -8.0 + {schema="0"} -2.82842712474619 + {schema="+1"} -1.6817928305074292 + +eval instant at 1m histogram_fraction(-8, 0, var_res_histogram{schema="-1"}) + {schema="-1"} 0.5 + +eval instant at 1m histogram_fraction(-2.82842712474619, 0, var_res_histogram{schema="0"}) + {schema="0"} 0.5 + +eval instant at 1m histogram_fraction(-1.6817928305074292, 0, var_res_histogram{schema="+1"}) + {schema="+1"} 0.5 + + # Apply fraction function to empty histogram. load 10m histogram_fraction_1 {{}}x1 @@ -515,11 +621,18 @@ eval instant at 10m histogram_fraction(-0.001, 0, histogram_fraction_2) eval instant at 10m histogram_fraction(0, 0.001, histogram_fraction_2) {} 0.16666666666666666 +# Note that this result and the one above add up to 1. +eval instant at 10m histogram_fraction(0.001, inf, histogram_fraction_2) + {} 0.8333333333333334 + +# We are in the zero bucket, resulting in linear interpolation eval instant at 10m histogram_fraction(0, 0.0005, histogram_fraction_2) {} 0.08333333333333333 -eval instant at 10m histogram_fraction(0.001, inf, histogram_fraction_2) - {} 0.8333333333333334 +# Demonstrate that the inverse operation with histogram_quantile yields +# the original value with the non-trivial result above. +eval instant at 10m histogram_quantile(0.08333333333333333, histogram_fraction_2) + {} 0.0005 eval instant at 10m histogram_fraction(-inf, -0.001, histogram_fraction_2) {} 0 @@ -527,17 +640,30 @@ eval instant at 10m histogram_fraction(-inf, -0.001, histogram_fraction_2) eval instant at 10m histogram_fraction(1, 2, histogram_fraction_2) {} 0.25 +# More non-trivial results with interpolation involved below, including +# some round-trips via histogram_quantile to prove that the inverse +# operation leads to the same results. + +eval instant at 10m histogram_fraction(0, 1.5, histogram_fraction_2) + {} 0.4795739585136224 + eval instant at 10m histogram_fraction(1.5, 2, histogram_fraction_2) - {} 0.125 + {} 0.10375937481971091 eval instant at 10m histogram_fraction(1, 8, histogram_fraction_2) {} 0.3333333333333333 +eval instant at 10m histogram_fraction(0, 6, histogram_fraction_2) + {} 0.6320802083934297 + +eval instant at 10m histogram_quantile(0.6320802083934297, histogram_fraction_2) + {} 6 + eval instant at 10m histogram_fraction(1, 6, histogram_fraction_2) - {} 0.2916666666666667 + {} 0.29874687506009634 eval instant at 10m histogram_fraction(1.5, 6, histogram_fraction_2) - {} 0.16666666666666666 + {} 0.15250624987980724 eval instant at 10m histogram_fraction(-2, -1, histogram_fraction_2) {} 0 @@ -600,6 +726,12 @@ eval instant at 10m histogram_fraction(0, 0.001, histogram_fraction_3) eval instant at 10m histogram_fraction(-0.0005, 0, histogram_fraction_3) {} 0.08333333333333333 +eval instant at 10m histogram_fraction(-inf, -0.0005, histogram_fraction_3) + {} 0.9166666666666666 + +eval instant at 10m histogram_quantile(0.9166666666666666, histogram_fraction_3) + {} -0.0005 + eval instant at 10m histogram_fraction(0.001, inf, histogram_fraction_3) {} 0 @@ -625,16 +757,22 @@ eval instant at 10m histogram_fraction(-2, -1, histogram_fraction_3) {} 0.25 eval instant at 10m histogram_fraction(-2, -1.5, histogram_fraction_3) - {} 0.125 + {} 0.10375937481971091 eval instant at 10m histogram_fraction(-8, -1, histogram_fraction_3) {} 0.3333333333333333 +eval instant at 10m histogram_fraction(-inf, -6, histogram_fraction_3) + {} 0.36791979160657035 + +eval instant at 10m histogram_quantile(0.36791979160657035, histogram_fraction_3) + {} -6 + eval instant at 10m histogram_fraction(-6, -1, histogram_fraction_3) - {} 0.2916666666666667 + {} 0.29874687506009634 eval instant at 10m histogram_fraction(-6, -1.5, histogram_fraction_3) - {} 0.16666666666666666 + {} 0.15250624987980724 eval instant at 10m histogram_fraction(42, 3.1415, histogram_fraction_3) {} 0 @@ -684,6 +822,18 @@ eval instant at 10m histogram_fraction(0, 0.001, histogram_fraction_4) eval instant at 10m histogram_fraction(-0.0005, 0.0005, histogram_fraction_4) {} 0.08333333333333333 +eval instant at 10m histogram_fraction(-inf, 0.0005, histogram_fraction_4) + {} 0.5416666666666666 + +eval instant at 10m histogram_quantile(0.5416666666666666, histogram_fraction_4) + {} 0.0005 + +eval instant at 10m histogram_fraction(-inf, -0.0005, histogram_fraction_4) + {} 0.4583333333333333 + +eval instant at 10m histogram_quantile(0.4583333333333333, histogram_fraction_4) + {} -0.0005 + eval instant at 10m histogram_fraction(0.001, inf, histogram_fraction_4) {} 0.4166666666666667 @@ -694,31 +844,31 @@ eval instant at 10m histogram_fraction(1, 2, histogram_fraction_4) {} 0.125 eval instant at 10m histogram_fraction(1.5, 2, histogram_fraction_4) - {} 0.0625 + {} 0.051879687409855414 eval instant at 10m histogram_fraction(1, 8, histogram_fraction_4) {} 0.16666666666666666 eval instant at 10m histogram_fraction(1, 6, histogram_fraction_4) - {} 0.14583333333333334 + {} 0.14937343753004825 eval instant at 10m histogram_fraction(1.5, 6, histogram_fraction_4) - {} 0.08333333333333333 + {} 0.07625312493990366 eval instant at 10m histogram_fraction(-2, -1, histogram_fraction_4) {} 0.125 eval instant at 10m histogram_fraction(-2, -1.5, histogram_fraction_4) - {} 0.0625 + {} 0.051879687409855456 eval instant at 10m histogram_fraction(-8, -1, histogram_fraction_4) {} 0.16666666666666666 eval instant at 10m histogram_fraction(-6, -1, histogram_fraction_4) - {} 0.14583333333333334 + {} 0.14937343753004817 eval instant at 10m histogram_fraction(-6, -1.5, histogram_fraction_4) - {} 0.08333333333333333 + {} 0.07625312493990362 eval instant at 10m histogram_fraction(42, 3.1415, histogram_fraction_4) {} 0 diff --git a/promql/quantile.go b/promql/quantile.go index 7ddb76acb..06775d3ae 100644 --- a/promql/quantile.go +++ b/promql/quantile.go @@ -153,19 +153,31 @@ func bucketQuantile(q float64, buckets buckets) (float64, bool, bool) { // histogramQuantile calculates the quantile 'q' based on the given histogram. // -// The quantile value is interpolated assuming a linear distribution within a -// bucket. -// TODO(beorn7): Find an interpolation method that is a better fit for -// exponential buckets (and think about configurable interpolation). +// For custom buckets, the result is interpolated linearly, i.e. it is assumed +// the observations are uniformly distributed within each bucket. (This is a +// quite blunt assumption, but it is consistent with the interpolation method +// used for classic histograms so far.) +// +// For exponential buckets, the interpolation is done under the assumption that +// the samples within each bucket are distributed in a way that they would +// uniformly populate the buckets in a hypothetical histogram with higher +// resolution. For example, if the rank calculation suggests that the requested +// quantile is right in the middle of the population of the (1,2] bucket, we +// assume the quantile would be right at the bucket boundary between the two +// buckets the (1,2] bucket would be divided into if the histogram had double +// the resolution, which is 2**2**-1 = 1.4142... We call this exponential +// interpolation. +// +// However, for a quantile that ends up in the zero bucket, this method isn't +// very helpful (because there is an infinite number of buckets close to zero, +// so we would have to assume zero as the result). Therefore, we return to +// linear interpolation in the zero bucket. // // A natural lower bound of 0 is assumed if the histogram has only positive // buckets. Likewise, a natural upper bound of 0 is assumed if the histogram has // only negative buckets. -// TODO(beorn7): Come to terms if we want that. // -// There are a number of special cases (once we have a way to report errors -// happening during evaluations of AST functions, we should report those -// explicitly): +// There are a number of special cases: // // If the histogram has 0 observations, NaN is returned. // @@ -193,9 +205,9 @@ func histogramQuantile(q float64, h *histogram.FloatHistogram) float64 { rank float64 ) - // if there are NaN observations in the histogram (h.Sum is NaN), use the forward iterator - // if the q < 0.5, use the forward iterator - // if the q >= 0.5, use the reverse iterator + // If there are NaN observations in the histogram (h.Sum is NaN), use the forward iterator. + // If q < 0.5, use the forward iterator. + // If q >= 0.5, use the reverse iterator. if math.IsNaN(h.Sum) || q < 0.5 { it = h.AllBucketIterator() rank = q * h.Count @@ -260,8 +272,29 @@ func histogramQuantile(q float64, h *histogram.FloatHistogram) float64 { rank = count - rank } - // TODO(codesome): Use a better estimation than linear. - return bucket.Lower + (bucket.Upper-bucket.Lower)*(rank/bucket.Count) + // The fraction of how far we are into the current bucket. + fraction := rank / bucket.Count + + // Return linear interpolation for custom buckets and for quantiles that + // end up in the zero bucket. + if h.UsesCustomBuckets() || (bucket.Lower <= 0 && bucket.Upper >= 0) { + return bucket.Lower + (bucket.Upper-bucket.Lower)*fraction + } + + // For exponential buckets, we interpolate on a logarithmic scale. On a + // logarithmic scale, the exponential bucket boundaries (for any schema) + // become linear (every bucket has the same width). Therefore, after + // taking the logarithm of both bucket boundaries, we can use the + // calculated fraction in the same way as for linear interpolation (see + // above). Finally, we return to the normal scale by applying the + // exponential function to the result. + logLower := math.Log2(math.Abs(bucket.Lower)) + logUpper := math.Log2(math.Abs(bucket.Upper)) + if bucket.Lower > 0 { // Positive bucket. + return math.Exp2(logLower + (logUpper-logLower)*fraction) + } + // Otherwise, we are in a negative bucket and have to mirror things. + return -math.Exp2(logUpper + (logLower-logUpper)*(1-fraction)) } // histogramFraction calculates the fraction of observations between the @@ -271,8 +304,8 @@ func histogramQuantile(q float64, h *histogram.FloatHistogram) float64 { // histogramQuantile(0.9, h) returns 123.4, then histogramFraction(-Inf, 123.4, h) // returns 0.9. // -// The same notes (and TODOs) with regard to interpolation and assumptions about -// the zero bucket boundaries apply as for histogramQuantile. +// The same notes with regard to interpolation and assumptions about the zero +// bucket boundaries apply as for histogramQuantile. // // Whether either boundary is inclusive or exclusive doesn’t actually matter as // long as interpolation has to be performed anyway. In the case of a boundary @@ -310,7 +343,35 @@ func histogramFraction(lower, upper float64, h *histogram.FloatHistogram) float6 ) for it.Next() { b := it.At() - if b.Lower < 0 && b.Upper > 0 { + zeroBucket := false + + // interpolateLinearly is used for custom buckets to be + // consistent with the linear interpolation known from classic + // histograms. It is also used for the zero bucket. + interpolateLinearly := func(v float64) float64 { + return rank + b.Count*(v-b.Lower)/(b.Upper-b.Lower) + } + + // interpolateExponentially is using the same exponential + // interpolation method as above for histogramQuantile. This + // method is a better fit for exponential bucketing. + interpolateExponentially := func(v float64) float64 { + var ( + logLower = math.Log2(math.Abs(b.Lower)) + logUpper = math.Log2(math.Abs(b.Upper)) + logV = math.Log2(math.Abs(v)) + fraction float64 + ) + if v > 0 { + fraction = (logV - logLower) / (logUpper - logLower) + } else { + fraction = 1 - ((logV - logUpper) / (logLower - logUpper)) + } + return rank + b.Count*fraction + } + + if b.Lower <= 0 && b.Upper >= 0 { + zeroBucket = true switch { case len(h.NegativeBuckets) == 0 && len(h.PositiveBuckets) > 0: // This is the zero bucket and the histogram has only @@ -325,10 +386,12 @@ func histogramFraction(lower, upper float64, h *histogram.FloatHistogram) float6 } } if !lowerSet && b.Lower >= lower { + // We have hit the lower value at the lower bucket boundary. lowerRank = rank lowerSet = true } if !upperSet && b.Lower >= upper { + // We have hit the upper value at the lower bucket boundary. upperRank = rank upperSet = true } @@ -336,11 +399,21 @@ func histogramFraction(lower, upper float64, h *histogram.FloatHistogram) float6 break } if !lowerSet && b.Lower < lower && b.Upper > lower { - lowerRank = rank + b.Count*(lower-b.Lower)/(b.Upper-b.Lower) + // The lower value is in this bucket. + if h.UsesCustomBuckets() || zeroBucket { + lowerRank = interpolateLinearly(lower) + } else { + lowerRank = interpolateExponentially(lower) + } lowerSet = true } if !upperSet && b.Lower < upper && b.Upper > upper { - upperRank = rank + b.Count*(upper-b.Lower)/(b.Upper-b.Lower) + // The upper value is in this bucket. + if h.UsesCustomBuckets() || zeroBucket { + upperRank = interpolateLinearly(upper) + } else { + upperRank = interpolateExponentially(upper) + } upperSet = true } if lowerSet && upperSet {