PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases. This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`. --------- Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
2025-03-05 20:59:13 -08:00 · 2023-11-25 07:05:38 +08:00 · 2023-11-25 07:05:38 +08:00 · ccfe14d7e7
parent 2e205ee95c
commit ccfe14d7e7
7 changed files with 416 additions and 36 deletions
--- a/docs/querying/functions.md
+++ b/docs/querying/functions.md
@ -323,6 +323,24 @@ a histogram.
 You can use `histogram_quantile(1, v instant-vector)` to get the estimated maximum value stored in
 a histogram.
 Buckets of classic histograms are cumulative. Therefore, the following should always be the case:
 - The counts in the buckets are monotonically increasing (strictly non-decreasing).
 - A lack of observations between the upper limits of two consecutive buckets results in equal counts
 in those two buckets.
 However, floating point precision issues (e.g. small discrepancies introduced by computing of buckets
 with `sum(rate(...))`) or invalid data might violate these assumptions. In that case,
 `histogram_quantile` would be unable to return meaningful results. To mitigate the issue,
 `histogram_quantile` assumes that tiny relative differences between consecutive buckets are happening
 because of floating point precision errors and ignores them. (The threshold to ignore a difference
 between two buckets is a trillionth (1e-12) of the sum of both buckets.) Furthermore, if there are
 non-monotonic bucket counts even after this adjustment, they are increased to the value of the
 previous buckets to enforce monotonicity. The latter is evidence for an actual issue with the input
 data and is therefore flagged with an informational annotation reading `input to histogram_quantile
 needed to be fixed for monotonicity`. If you encounter this annotation, you should find and remove
 the source of the invalid data.
 ## `histogram_stddev()` and `histogram_stdvar()`
 _Both functions only act on native histograms, which are an experimental
--- a/promql/engine_test.go
+++ b/promql/engine_test.go
@ -3699,7 +3699,7 @@ func TestNativeHistogram_HistogramQuantile(t *testing.T) {
 						require.Len(t, vector, 1)
 						require.Nil(t, vector[0].H)
-						require.True(t, almostEqual(sc.value, vector[0].F))
+						require.True(t, almostEqual(sc.value, vector[0].F, defaultEpsilon))
 					})
 				}
 				idx++
--- a/promql/functions.go
+++ b/promql/functions.go
@ -1163,7 +1163,7 @@ func funcHistogramQuantile(vals []parser.Value, args parser.Expressions, enh *Ev
 	for _, mb := range enh.signatureToMetricWithBuckets {
 		if len(mb.buckets) > 0 {
-			res, forcedMonotonicity := bucketQuantile(q, mb.buckets)
+			res, forcedMonotonicity, _ := bucketQuantile(q, mb.buckets)
 			enh.Out = append(enh.Out, Sample{
 				Metric: mb.metric,
 				F:      res,
--- a/promql/quantile.go
+++ b/promql/quantile.go
@ -23,6 +23,25 @@ import (
 	"github.com/prometheus/prometheus/model/labels"
 )
 // smallDeltaTolerance is the threshold for relative deltas between classic
 // histogram buckets that will be ignored by the histogram_quantile function
 // because they are most likely artifacts of floating point precision issues.
 // Testing on 2 sets of real data with bugs arising from small deltas,
 // the safe ranges were from:
 // - 1e-05 to 1e-15
 // - 1e-06 to 1e-15
 // Anything to the left of that would cause non-query-sharded data to have
 // small deltas ignored (unnecessary and we should avoid this), and anything
 // to the right of that would cause query-sharded data to not have its small
 // deltas ignored (so the problem won't be fixed).
 // For context, query sharding triggers these float precision errors in Mimir.
 // To illustrate, with a relative deviation of 1e-12, we need to have 1e12
 // observations in the bucket so that the change of one observation is small
 // enough to get ignored. With the usual observation rate even of very busy
 // services, this will hardly be reached in timeframes that matters for
 // monitoring.
 const smallDeltaTolerance = 1e-12
 // Helpers to calculate quantiles.
 // excludedLabels are the labels to exclude from signature calculation for
@ -72,16 +91,19 @@ type metricWithBuckets struct {
 //
 // If q>1, +Inf is returned.
 //
-// We also return a bool to indicate if monotonicity needed to be forced.
+// We also return a bool to indicate if monotonicity needed to be forced,
-func bucketQuantile(q float64, buckets buckets) (float64, bool) {
+// and another bool to indicate if small differences between buckets (that
 // are likely artifacts of floating point precision issues) have been
 // ignored.
 func bucketQuantile(q float64, buckets buckets) (float64, bool, bool) {
 	if math.IsNaN(q) {
-		return math.NaN(), false
+		return math.NaN(), false, false
 	}
 	if q < 0 {
-		return math.Inf(-1), false
+		return math.Inf(-1), false, false
 	}
 	if q > 1 {
-		return math.Inf(+1), false
+		return math.Inf(+1), false, false
 	}
 	slices.SortFunc(buckets, func(a, b bucket) int {
 		// We don't expect the bucket boundary to be a NaN.
@ -94,27 +116,27 @@ func bucketQuantile(q float64, buckets buckets) (float64, bool) {
 		return 0
 	})
 	if !math.IsInf(buckets[len(buckets)-1].upperBound, +1) {
-		return math.NaN(), false
+		return math.NaN(), false, false
 	}
 	buckets = coalesceBuckets(buckets)
-	forcedMonotonic := ensureMonotonic(buckets)
+	forcedMonotonic, fixedPrecision := ensureMonotonicAndIgnoreSmallDeltas(buckets, smallDeltaTolerance)
 	if len(buckets) < 2 {
-		return math.NaN(), false
+		return math.NaN(), false, false
 	}
 	observations := buckets[len(buckets)-1].count
 	if observations == 0 {
-		return math.NaN(), false
+		return math.NaN(), false, false
 	}
 	rank := q * observations
 	b := sort.Search(len(buckets)-1, func(i int) bool { return buckets[i].count >= rank })
 	if b == len(buckets)-1 {
-		return buckets[len(buckets)-2].upperBound, forcedMonotonic
+		return buckets[len(buckets)-2].upperBound, forcedMonotonic, fixedPrecision
 	}
 	if b == 0 && buckets[0].upperBound <= 0 {
-		return buckets[0].upperBound, forcedMonotonic
+		return buckets[0].upperBound, forcedMonotonic, fixedPrecision
 	}
 	var (
 		bucketStart float64
@ -126,7 +148,7 @@ func bucketQuantile(q float64, buckets buckets) (float64, bool) {
 		count -= buckets[b-1].count
 		rank -= buckets[b-1].count
 	}
-	return bucketStart + (bucketEnd-bucketStart)*(rank/count), forcedMonotonic
+	return bucketStart + (bucketEnd-bucketStart)*(rank/count), forcedMonotonic, fixedPrecision
 }
 // histogramQuantile calculates the quantile 'q' based on the given histogram.
@ -348,6 +370,7 @@ func coalesceBuckets(buckets buckets) buckets {
 //   - Ingestion via the remote write receiver that Prometheus implements.
 //   - Optimisation of query execution where precision is sacrificed for other
 //     benefits, not by Prometheus but by systems built on top of it.
 //   - Circumstances where floating point precision errors accumulate.
 //
 // Monotonicity is usually guaranteed because if a bucket with upper bound
 // u1 has count c1, then any bucket with a higher upper bound u > u1 must
@ -357,22 +380,42 @@ func coalesceBuckets(buckets buckets) buckets {
 // bucket with the φ-quantile count, so breaking the monotonicity
 // guarantee causes bucketQuantile() to return undefined (nonsense) results.
 //
-// As a somewhat hacky solution, we calculate the "envelope" of the histogram
+// As a somewhat hacky solution, we first silently ignore any numerically
-// buckets, essentially removing any decreases in the count between successive
+// insignificant (relative delta below the requested tolerance and likely to
-// buckets. We return a bool to indicate if this monotonicity was forced or not.
+// be from floating point precision errors) differences between successive
-func ensureMonotonic(buckets buckets) bool {
+// buckets regardless of the direction. Then we calculate the "envelope" of
-	forced := false
+// the histogram buckets, essentially removing any decreases in the count
-	max := buckets[0].count
+// between successive buckets.
 //
 // We return a bool to indicate if this monotonicity was forced or not, and
 // another bool to indicate if small deltas were ignored or not.
 func ensureMonotonicAndIgnoreSmallDeltas(buckets buckets, tolerance float64) (bool, bool) {
 	var forcedMonotonic, fixedPrecision bool
 	prev := buckets[0].count
 	for i := 1; i < len(buckets); i++ {
-		switch {
+		curr := buckets[i].count // Assumed always positive.
-		case buckets[i].count > max:
+		if curr == prev {
-			max = buckets[i].count
+			// No correction needed if the counts are identical between buckets.
-		case buckets[i].count < max:
+			continue
 			buckets[i].count = max
 			forced = true
 		}
 		if almostEqual(prev, curr, tolerance) {
 			// Silently correct numerically insignificant differences from floating
 			// point precision errors, regardless of direction.
 			// Do not update the 'prev' value as we are ignoring the difference.
 			buckets[i].count = prev
 			fixedPrecision = true
 			continue
 		}
 		if curr < prev {
 			// Force monotonicity by removing any decreases regardless of magnitude.
 			// Do not update the 'prev' value as we are ignoring the decrease.
 			buckets[i].count = prev
 			forcedMonotonic = true
 			continue
 		}
 		prev = curr
 	}
-	return forced
+	return forcedMonotonic, fixedPrecision
 }
 // quantile calculates the given quantile of a vector of samples.
--- a/promql/quantile_test.go
+++ b/promql/quantile_test.go
@ -0,0 +1,318 @@
 // Copyright 2023 The Prometheus Authors
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 package promql
 import (
 	"math"
 	"testing"
 	"github.com/stretchr/testify/require"
 )
 func TestBucketQuantile_ForcedMonotonicity(t *testing.T) {
 	eps := 1e-12
 	for name, tc := range map[string]struct {
 		getInput       func() buckets // The buckets can be modified in-place so return a new one each time.
 		expectedForced bool
 		expectedFixed  bool
 		expectedValues map[float64]float64
 	}{
 		"simple - monotonic": {
 			getInput: func() buckets {
 				return buckets{
 					{
 						upperBound: 10,
 						count:      10,
 					}, {
 						upperBound: 15,
 						count:      15,
 					}, {
 						upperBound: 20,
 						count:      15,
 					}, {
 						upperBound: 30,
 						count:      15,
 					}, {
 						upperBound: math.Inf(1),
 						count:      15,
 					},
 				}
 			},
 			expectedForced: false,
 			expectedFixed:  false,
 			expectedValues: map[float64]float64{
 				1:    15.,
 				0.99: 14.85,
 				0.9:  13.5,
 				0.5:  7.5,
 			},
 		},
 		"simple - non-monotonic middle": {
 			getInput: func() buckets {
 				return buckets{
 					{
 						upperBound: 10,
 						count:      10,
 					}, {
 						upperBound: 15,
 						count:      15,
 					}, {
 						upperBound: 20,
 						count:      15.00000000001, // Simulate the case there's a small imprecision in float64.
 					}, {
 						upperBound: 30,
 						count:      15,
 					}, {
 						upperBound: math.Inf(1),
 						count:      15,
 					},
 				}
 			},
 			expectedForced: false,
 			expectedFixed:  true,
 			expectedValues: map[float64]float64{
 				1:    15.,
 				0.99: 14.85,
 				0.9:  13.5,
 				0.5:  7.5,
 			},
 		},
 		"real example - monotonic": {
 			getInput: func() buckets {
 				return buckets{
 					{
 						upperBound: 1,
 						count:      6454661.3014166197,
 					}, {
 						upperBound: 5,
 						count:      8339611.2001912938,
 					}, {
 						upperBound: 10,
 						count:      14118319.2444762159,
 					}, {
 						upperBound: 25,
 						count:      14130031.5272856522,
 					}, {
 						upperBound: 50,
 						count:      46001270.3030008152,
 					}, {
 						upperBound: 64,
 						count:      46008473.8585563600,
 					}, {
 						upperBound: 80,
 						count:      46008473.8585563600,
 					}, {
 						upperBound: 100,
 						count:      46008473.8585563600,
 					}, {
 						upperBound: 250,
 						count:      46008473.8585563600,
 					}, {
 						upperBound: 1000,
 						count:      46008473.8585563600,
 					}, {
 						upperBound: math.Inf(1),
 						count:      46008473.8585563600,
 					},
 				}
 			},
 			expectedForced: false,
 			expectedFixed:  false,
 			expectedValues: map[float64]float64{
 				1:    64.,
 				0.99: 49.64475715376406,
 				0.9:  46.39671690938454,
 				0.5:  31.96098248992002,
 			},
 		},
 		"real example - non-monotonic": {
 			getInput: func() buckets {
 				return buckets{
 					{
 						upperBound: 1,
 						count:      6454661.3014166225,
 					}, {
 						upperBound: 5,
 						count:      8339611.2001912957,
 					}, {
 						upperBound: 10,
 						count:      14118319.2444762159,
 					}, {
 						upperBound: 25,
 						count:      14130031.5272856504,
 					}, {
 						upperBound: 50,
 						count:      46001270.3030008227,
 					}, {
 						upperBound: 64,
 						count:      46008473.8585563824,
 					}, {
 						upperBound: 80,
 						count:      46008473.8585563898,
 					}, {
 						upperBound: 100,
 						count:      46008473.8585563824,
 					}, {
 						upperBound: 250,
 						count:      46008473.8585563824,
 					}, {
 						upperBound: 1000,
 						count:      46008473.8585563898,
 					}, {
 						upperBound: math.Inf(1),
 						count:      46008473.8585563824,
 					},
 				}
 			},
 			expectedForced: false,
 			expectedFixed:  true,
 			expectedValues: map[float64]float64{
 				1:    64.,
 				0.99: 49.64475715376406,
 				0.9:  46.39671690938454,
 				0.5:  31.96098248992002,
 			},
 		},
 		"real example 2 - monotonic": {
 			getInput: func() buckets {
 				return buckets{
 					{
 						upperBound: 0.005,
 						count:      9.6,
 					}, {
 						upperBound: 0.01,
 						count:      9.688888889,
 					}, {
 						upperBound: 0.025,
 						count:      9.755555556,
 					}, {
 						upperBound: 0.05,
 						count:      9.844444444,
 					}, {
 						upperBound: 0.1,
 						count:      9.888888889,
 					}, {
 						upperBound: 0.25,
 						count:      9.888888889,
 					}, {
 						upperBound: 0.5,
 						count:      9.888888889,
 					}, {
 						upperBound: 1,
 						count:      9.888888889,
 					}, {
 						upperBound: 2.5,
 						count:      9.888888889,
 					}, {
 						upperBound: 5,
 						count:      9.888888889,
 					}, {
 						upperBound: 10,
 						count:      9.888888889,
 					}, {
 						upperBound: 25,
 						count:      9.888888889,
 					}, {
 						upperBound: 50,
 						count:      9.888888889,
 					}, {
 						upperBound: 100,
 						count:      9.888888889,
 					}, {
 						upperBound: math.Inf(1),
 						count:      9.888888889,
 					},
 				}
 			},
 			expectedForced: false,
 			expectedFixed:  false,
 			expectedValues: map[float64]float64{
 				1:    0.1,
 				0.99: 0.03468750000281261,
 				0.9:  0.00463541666671875,
 				0.5:  0.0025752314815104174,
 			},
 		},
 		"real example 2 - non-monotonic": {
 			getInput: func() buckets {
 				return buckets{
 					{
 						upperBound: 0.005,
 						count:      9.6,
 					}, {
 						upperBound: 0.01,
 						count:      9.688888889,
 					}, {
 						upperBound: 0.025,
 						count:      9.755555556,
 					}, {
 						upperBound: 0.05,
 						count:      9.844444444,
 					}, {
 						upperBound: 0.1,
 						count:      9.888888889,
 					}, {
 						upperBound: 0.25,
 						count:      9.888888889,
 					}, {
 						upperBound: 0.5,
 						count:      9.888888889,
 					}, {
 						upperBound: 1,
 						count:      9.888888889,
 					}, {
 						upperBound: 2.5,
 						count:      9.888888889,
 					}, {
 						upperBound: 5,
 						count:      9.888888889,
 					}, {
 						upperBound: 10,
 						count:      9.888888889001, // Simulate the case there's a small imprecision in float64.
 					}, {
 						upperBound: 25,
 						count:      9.888888889,
 					}, {
 						upperBound: 50,
 						count:      9.888888888999, // Simulate the case there's a small imprecision in float64.
 					}, {
 						upperBound: 100,
 						count:      9.888888889,
 					}, {
 						upperBound: math.Inf(1),
 						count:      9.888888889,
 					},
 				}
 			},
 			expectedForced: false,
 			expectedFixed:  true,
 			expectedValues: map[float64]float64{
 				1:    0.1,
 				0.99: 0.03468750000281261,
 				0.9:  0.00463541666671875,
 				0.5:  0.0025752314815104174,
 			},
 		},
 	} {
 		t.Run(name, func(t *testing.T) {
 			for q, v := range tc.expectedValues {
 				res, forced, fixed := bucketQuantile(q, tc.getInput())
 				require.Equal(t, tc.expectedForced, forced)
 				require.Equal(t, tc.expectedFixed, fixed)
 				require.InEpsilon(t, v, res, eps)
 			}
 		})
 	}
 }
--- a/promql/test.go
+++ b/promql/test.go
@ -49,7 +49,7 @@ var (
 )
 const (
-	epsilon = 0.000001 // Relative error allowed for sample values.
+	defaultEpsilon = 0.000001 // Relative error allowed for sample values.
 )
 var testStartTime = time.Unix(0, 0).UTC()
@ -440,7 +440,7 @@ func (ev *evalCmd) compareResult(result parser.Value) error {
 			if (expH == nil) != (v.H == nil) || (expH != nil && !expH.Equals(v.H)) {
 				return fmt.Errorf("expected %v for %s but got %s", HistogramTestExpression(expH), v.Metric, HistogramTestExpression(v.H))
 			}
-			if !almostEqual(exp0.Value, v.F) {
+			if !almostEqual(exp0.Value, v.F, defaultEpsilon) {
 				return fmt.Errorf("expected %v for %s but got %v", exp0.Value, v.Metric, v.F)
 			}
@ -464,7 +464,7 @@ func (ev *evalCmd) compareResult(result parser.Value) error {
 		if exp0.Histogram != nil {
 			return fmt.Errorf("expected Histogram %v but got scalar %s", exp0.Histogram.TestExpression(), val.String())
 		}
-		if !almostEqual(exp0.Value, val.V) {
+		if !almostEqual(exp0.Value, val.V, defaultEpsilon) {
 			return fmt.Errorf("expected Scalar %v but got %v", val.V, exp0.Value)
 		}
@ -663,9 +663,9 @@ func (t *test) clear() {
 	t.context, t.cancelCtx = context.WithCancel(context.Background())
 }
-// samplesAlmostEqual returns true if the two sample lines only differ by a
+// almostEqual returns true if a and b differ by less than their sum
-// small relative error in their sample value.
+// multiplied by epsilon.
-func almostEqual(a, b float64) bool {
+func almostEqual(a, b, epsilon float64) bool {
 	// NaN has no equality but for testing we still want to know whether both values
 	// are NaN.
 	if math.IsNaN(a) && math.IsNaN(b) {
@ -677,12 +677,13 @@ func almostEqual(a, b float64) bool {
 		return true
 	}
 	absSum := math.Abs(a) + math.Abs(b)
 	diff := math.Abs(a - b)
-	if a == 0 || b == 0 || diff < minNormal {
+	if a == 0 || b == 0 || absSum < minNormal {
 		return diff < epsilon*minNormal
 	}
-	return diff/(math.Abs(a)+math.Abs(b)) < epsilon
+	return diff/math.Min(absSum, math.MaxFloat64) < epsilon
 }
 func parseNumber(s string) (float64, error) {
--- a/util/annotations/annotations.go
+++ b/util/annotations/annotations.go
@ -108,7 +108,7 @@ var (
 	MixedClassicNativeHistogramsWarning = fmt.Errorf("%w: vector contains a mix of classic and native histograms for metric name", PromQLWarning)
 	PossibleNonCounterInfo                  = fmt.Errorf("%w: metric might not be a counter, name does not end in _total/_sum/_count/_bucket:", PromQLInfo)
-	HistogramQuantileForcedMonotonicityInfo = fmt.Errorf("%w: input to histogram_quantile needed to be fixed for monotonicity (and may give inaccurate results) for metric name", PromQLInfo)
+	HistogramQuantileForcedMonotonicityInfo = fmt.Errorf("%w: input to histogram_quantile needed to be fixed for monotonicity (see https://prometheus.io/docs/prometheus/latest/querying/functions/#histogram_quantile) for metric name", PromQLInfo)
 )
 type annoErr struct {