PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
This commit is contained in:
zenador 2023-11-25 07:05:38 +08:00 committed by GitHub
parent 2e205ee95c
commit ccfe14d7e7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 416 additions and 36 deletions

View file

@ -323,6 +323,24 @@ a histogram.
You can use `histogram_quantile(1, v instant-vector)` to get the estimated maximum value stored in
a histogram.
Buckets of classic histograms are cumulative. Therefore, the following should always be the case:
- The counts in the buckets are monotonically increasing (strictly non-decreasing).
- A lack of observations between the upper limits of two consecutive buckets results in equal counts
in those two buckets.
However, floating point precision issues (e.g. small discrepancies introduced by computing of buckets
with `sum(rate(...))`) or invalid data might violate these assumptions. In that case,
`histogram_quantile` would be unable to return meaningful results. To mitigate the issue,
`histogram_quantile` assumes that tiny relative differences between consecutive buckets are happening
because of floating point precision errors and ignores them. (The threshold to ignore a difference
between two buckets is a trillionth (1e-12) of the sum of both buckets.) Furthermore, if there are
non-monotonic bucket counts even after this adjustment, they are increased to the value of the
previous buckets to enforce monotonicity. The latter is evidence for an actual issue with the input
data and is therefore flagged with an informational annotation reading `input to histogram_quantile
needed to be fixed for monotonicity`. If you encounter this annotation, you should find and remove
the source of the invalid data.
## `histogram_stddev()` and `histogram_stdvar()`
_Both functions only act on native histograms, which are an experimental

View file

@ -3699,7 +3699,7 @@ func TestNativeHistogram_HistogramQuantile(t *testing.T) {
require.Len(t, vector, 1)
require.Nil(t, vector[0].H)
require.True(t, almostEqual(sc.value, vector[0].F))
require.True(t, almostEqual(sc.value, vector[0].F, defaultEpsilon))
})
}
idx++

View file

@ -1163,7 +1163,7 @@ func funcHistogramQuantile(vals []parser.Value, args parser.Expressions, enh *Ev
for _, mb := range enh.signatureToMetricWithBuckets {
if len(mb.buckets) > 0 {
res, forcedMonotonicity := bucketQuantile(q, mb.buckets)
res, forcedMonotonicity, _ := bucketQuantile(q, mb.buckets)
enh.Out = append(enh.Out, Sample{
Metric: mb.metric,
F: res,

View file

@ -23,6 +23,25 @@ import (
"github.com/prometheus/prometheus/model/labels"
)
// smallDeltaTolerance is the threshold for relative deltas between classic
// histogram buckets that will be ignored by the histogram_quantile function
// because they are most likely artifacts of floating point precision issues.
// Testing on 2 sets of real data with bugs arising from small deltas,
// the safe ranges were from:
// - 1e-05 to 1e-15
// - 1e-06 to 1e-15
// Anything to the left of that would cause non-query-sharded data to have
// small deltas ignored (unnecessary and we should avoid this), and anything
// to the right of that would cause query-sharded data to not have its small
// deltas ignored (so the problem won't be fixed).
// For context, query sharding triggers these float precision errors in Mimir.
// To illustrate, with a relative deviation of 1e-12, we need to have 1e12
// observations in the bucket so that the change of one observation is small
// enough to get ignored. With the usual observation rate even of very busy
// services, this will hardly be reached in timeframes that matters for
// monitoring.
const smallDeltaTolerance = 1e-12
// Helpers to calculate quantiles.
// excludedLabels are the labels to exclude from signature calculation for
@ -72,16 +91,19 @@ type metricWithBuckets struct {
//
// If q>1, +Inf is returned.
//
// We also return a bool to indicate if monotonicity needed to be forced.
func bucketQuantile(q float64, buckets buckets) (float64, bool) {
// We also return a bool to indicate if monotonicity needed to be forced,
// and another bool to indicate if small differences between buckets (that
// are likely artifacts of floating point precision issues) have been
// ignored.
func bucketQuantile(q float64, buckets buckets) (float64, bool, bool) {
if math.IsNaN(q) {
return math.NaN(), false
return math.NaN(), false, false
}
if q < 0 {
return math.Inf(-1), false
return math.Inf(-1), false, false
}
if q > 1 {
return math.Inf(+1), false
return math.Inf(+1), false, false
}
slices.SortFunc(buckets, func(a, b bucket) int {
// We don't expect the bucket boundary to be a NaN.
@ -94,27 +116,27 @@ func bucketQuantile(q float64, buckets buckets) (float64, bool) {
return 0
})
if !math.IsInf(buckets[len(buckets)-1].upperBound, +1) {
return math.NaN(), false
return math.NaN(), false, false
}
buckets = coalesceBuckets(buckets)
forcedMonotonic := ensureMonotonic(buckets)
forcedMonotonic, fixedPrecision := ensureMonotonicAndIgnoreSmallDeltas(buckets, smallDeltaTolerance)
if len(buckets) < 2 {
return math.NaN(), false
return math.NaN(), false, false
}
observations := buckets[len(buckets)-1].count
if observations == 0 {
return math.NaN(), false
return math.NaN(), false, false
}
rank := q * observations
b := sort.Search(len(buckets)-1, func(i int) bool { return buckets[i].count >= rank })
if b == len(buckets)-1 {
return buckets[len(buckets)-2].upperBound, forcedMonotonic
return buckets[len(buckets)-2].upperBound, forcedMonotonic, fixedPrecision
}
if b == 0 && buckets[0].upperBound <= 0 {
return buckets[0].upperBound, forcedMonotonic
return buckets[0].upperBound, forcedMonotonic, fixedPrecision
}
var (
bucketStart float64
@ -126,7 +148,7 @@ func bucketQuantile(q float64, buckets buckets) (float64, bool) {
count -= buckets[b-1].count
rank -= buckets[b-1].count
}
return bucketStart + (bucketEnd-bucketStart)*(rank/count), forcedMonotonic
return bucketStart + (bucketEnd-bucketStart)*(rank/count), forcedMonotonic, fixedPrecision
}
// histogramQuantile calculates the quantile 'q' based on the given histogram.
@ -348,6 +370,7 @@ func coalesceBuckets(buckets buckets) buckets {
// - Ingestion via the remote write receiver that Prometheus implements.
// - Optimisation of query execution where precision is sacrificed for other
// benefits, not by Prometheus but by systems built on top of it.
// - Circumstances where floating point precision errors accumulate.
//
// Monotonicity is usually guaranteed because if a bucket with upper bound
// u1 has count c1, then any bucket with a higher upper bound u > u1 must
@ -357,22 +380,42 @@ func coalesceBuckets(buckets buckets) buckets {
// bucket with the φ-quantile count, so breaking the monotonicity
// guarantee causes bucketQuantile() to return undefined (nonsense) results.
//
// As a somewhat hacky solution, we calculate the "envelope" of the histogram
// buckets, essentially removing any decreases in the count between successive
// buckets. We return a bool to indicate if this monotonicity was forced or not.
func ensureMonotonic(buckets buckets) bool {
forced := false
max := buckets[0].count
// As a somewhat hacky solution, we first silently ignore any numerically
// insignificant (relative delta below the requested tolerance and likely to
// be from floating point precision errors) differences between successive
// buckets regardless of the direction. Then we calculate the "envelope" of
// the histogram buckets, essentially removing any decreases in the count
// between successive buckets.
//
// We return a bool to indicate if this monotonicity was forced or not, and
// another bool to indicate if small deltas were ignored or not.
func ensureMonotonicAndIgnoreSmallDeltas(buckets buckets, tolerance float64) (bool, bool) {
var forcedMonotonic, fixedPrecision bool
prev := buckets[0].count
for i := 1; i < len(buckets); i++ {
switch {
case buckets[i].count > max:
max = buckets[i].count
case buckets[i].count < max:
buckets[i].count = max
forced = true
curr := buckets[i].count // Assumed always positive.
if curr == prev {
// No correction needed if the counts are identical between buckets.
continue
}
if almostEqual(prev, curr, tolerance) {
// Silently correct numerically insignificant differences from floating
// point precision errors, regardless of direction.
// Do not update the 'prev' value as we are ignoring the difference.
buckets[i].count = prev
fixedPrecision = true
continue
}
if curr < prev {
// Force monotonicity by removing any decreases regardless of magnitude.
// Do not update the 'prev' value as we are ignoring the decrease.
buckets[i].count = prev
forcedMonotonic = true
continue
}
prev = curr
}
return forced
return forcedMonotonic, fixedPrecision
}
// quantile calculates the given quantile of a vector of samples.

318
promql/quantile_test.go Normal file
View file

@ -0,0 +1,318 @@
// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package promql
import (
"math"
"testing"
"github.com/stretchr/testify/require"
)
func TestBucketQuantile_ForcedMonotonicity(t *testing.T) {
eps := 1e-12
for name, tc := range map[string]struct {
getInput func() buckets // The buckets can be modified in-place so return a new one each time.
expectedForced bool
expectedFixed bool
expectedValues map[float64]float64
}{
"simple - monotonic": {
getInput: func() buckets {
return buckets{
{
upperBound: 10,
count: 10,
}, {
upperBound: 15,
count: 15,
}, {
upperBound: 20,
count: 15,
}, {
upperBound: 30,
count: 15,
}, {
upperBound: math.Inf(1),
count: 15,
},
}
},
expectedForced: false,
expectedFixed: false,
expectedValues: map[float64]float64{
1: 15.,
0.99: 14.85,
0.9: 13.5,
0.5: 7.5,
},
},
"simple - non-monotonic middle": {
getInput: func() buckets {
return buckets{
{
upperBound: 10,
count: 10,
}, {
upperBound: 15,
count: 15,
}, {
upperBound: 20,
count: 15.00000000001, // Simulate the case there's a small imprecision in float64.
}, {
upperBound: 30,
count: 15,
}, {
upperBound: math.Inf(1),
count: 15,
},
}
},
expectedForced: false,
expectedFixed: true,
expectedValues: map[float64]float64{
1: 15.,
0.99: 14.85,
0.9: 13.5,
0.5: 7.5,
},
},
"real example - monotonic": {
getInput: func() buckets {
return buckets{
{
upperBound: 1,
count: 6454661.3014166197,
}, {
upperBound: 5,
count: 8339611.2001912938,
}, {
upperBound: 10,
count: 14118319.2444762159,
}, {
upperBound: 25,
count: 14130031.5272856522,
}, {
upperBound: 50,
count: 46001270.3030008152,
}, {
upperBound: 64,
count: 46008473.8585563600,
}, {
upperBound: 80,
count: 46008473.8585563600,
}, {
upperBound: 100,
count: 46008473.8585563600,
}, {
upperBound: 250,
count: 46008473.8585563600,
}, {
upperBound: 1000,
count: 46008473.8585563600,
}, {
upperBound: math.Inf(1),
count: 46008473.8585563600,
},
}
},
expectedForced: false,
expectedFixed: false,
expectedValues: map[float64]float64{
1: 64.,
0.99: 49.64475715376406,
0.9: 46.39671690938454,
0.5: 31.96098248992002,
},
},
"real example - non-monotonic": {
getInput: func() buckets {
return buckets{
{
upperBound: 1,
count: 6454661.3014166225,
}, {
upperBound: 5,
count: 8339611.2001912957,
}, {
upperBound: 10,
count: 14118319.2444762159,
}, {
upperBound: 25,
count: 14130031.5272856504,
}, {
upperBound: 50,
count: 46001270.3030008227,
}, {
upperBound: 64,
count: 46008473.8585563824,
}, {
upperBound: 80,
count: 46008473.8585563898,
}, {
upperBound: 100,
count: 46008473.8585563824,
}, {
upperBound: 250,
count: 46008473.8585563824,
}, {
upperBound: 1000,
count: 46008473.8585563898,
}, {
upperBound: math.Inf(1),
count: 46008473.8585563824,
},
}
},
expectedForced: false,
expectedFixed: true,
expectedValues: map[float64]float64{
1: 64.,
0.99: 49.64475715376406,
0.9: 46.39671690938454,
0.5: 31.96098248992002,
},
},
"real example 2 - monotonic": {
getInput: func() buckets {
return buckets{
{
upperBound: 0.005,
count: 9.6,
}, {
upperBound: 0.01,
count: 9.688888889,
}, {
upperBound: 0.025,
count: 9.755555556,
}, {
upperBound: 0.05,
count: 9.844444444,
}, {
upperBound: 0.1,
count: 9.888888889,
}, {
upperBound: 0.25,
count: 9.888888889,
}, {
upperBound: 0.5,
count: 9.888888889,
}, {
upperBound: 1,
count: 9.888888889,
}, {
upperBound: 2.5,
count: 9.888888889,
}, {
upperBound: 5,
count: 9.888888889,
}, {
upperBound: 10,
count: 9.888888889,
}, {
upperBound: 25,
count: 9.888888889,
}, {
upperBound: 50,
count: 9.888888889,
}, {
upperBound: 100,
count: 9.888888889,
}, {
upperBound: math.Inf(1),
count: 9.888888889,
},
}
},
expectedForced: false,
expectedFixed: false,
expectedValues: map[float64]float64{
1: 0.1,
0.99: 0.03468750000281261,
0.9: 0.00463541666671875,
0.5: 0.0025752314815104174,
},
},
"real example 2 - non-monotonic": {
getInput: func() buckets {
return buckets{
{
upperBound: 0.005,
count: 9.6,
}, {
upperBound: 0.01,
count: 9.688888889,
}, {
upperBound: 0.025,
count: 9.755555556,
}, {
upperBound: 0.05,
count: 9.844444444,
}, {
upperBound: 0.1,
count: 9.888888889,
}, {
upperBound: 0.25,
count: 9.888888889,
}, {
upperBound: 0.5,
count: 9.888888889,
}, {
upperBound: 1,
count: 9.888888889,
}, {
upperBound: 2.5,
count: 9.888888889,
}, {
upperBound: 5,
count: 9.888888889,
}, {
upperBound: 10,
count: 9.888888889001, // Simulate the case there's a small imprecision in float64.
}, {
upperBound: 25,
count: 9.888888889,
}, {
upperBound: 50,
count: 9.888888888999, // Simulate the case there's a small imprecision in float64.
}, {
upperBound: 100,
count: 9.888888889,
}, {
upperBound: math.Inf(1),
count: 9.888888889,
},
}
},
expectedForced: false,
expectedFixed: true,
expectedValues: map[float64]float64{
1: 0.1,
0.99: 0.03468750000281261,
0.9: 0.00463541666671875,
0.5: 0.0025752314815104174,
},
},
} {
t.Run(name, func(t *testing.T) {
for q, v := range tc.expectedValues {
res, forced, fixed := bucketQuantile(q, tc.getInput())
require.Equal(t, tc.expectedForced, forced)
require.Equal(t, tc.expectedFixed, fixed)
require.InEpsilon(t, v, res, eps)
}
})
}
}

View file

@ -49,7 +49,7 @@ var (
)
const (
epsilon = 0.000001 // Relative error allowed for sample values.
defaultEpsilon = 0.000001 // Relative error allowed for sample values.
)
var testStartTime = time.Unix(0, 0).UTC()
@ -440,7 +440,7 @@ func (ev *evalCmd) compareResult(result parser.Value) error {
if (expH == nil) != (v.H == nil) || (expH != nil && !expH.Equals(v.H)) {
return fmt.Errorf("expected %v for %s but got %s", HistogramTestExpression(expH), v.Metric, HistogramTestExpression(v.H))
}
if !almostEqual(exp0.Value, v.F) {
if !almostEqual(exp0.Value, v.F, defaultEpsilon) {
return fmt.Errorf("expected %v for %s but got %v", exp0.Value, v.Metric, v.F)
}
@ -464,7 +464,7 @@ func (ev *evalCmd) compareResult(result parser.Value) error {
if exp0.Histogram != nil {
return fmt.Errorf("expected Histogram %v but got scalar %s", exp0.Histogram.TestExpression(), val.String())
}
if !almostEqual(exp0.Value, val.V) {
if !almostEqual(exp0.Value, val.V, defaultEpsilon) {
return fmt.Errorf("expected Scalar %v but got %v", val.V, exp0.Value)
}
@ -663,9 +663,9 @@ func (t *test) clear() {
t.context, t.cancelCtx = context.WithCancel(context.Background())
}
// samplesAlmostEqual returns true if the two sample lines only differ by a
// small relative error in their sample value.
func almostEqual(a, b float64) bool {
// almostEqual returns true if a and b differ by less than their sum
// multiplied by epsilon.
func almostEqual(a, b, epsilon float64) bool {
// NaN has no equality but for testing we still want to know whether both values
// are NaN.
if math.IsNaN(a) && math.IsNaN(b) {
@ -677,12 +677,13 @@ func almostEqual(a, b float64) bool {
return true
}
absSum := math.Abs(a) + math.Abs(b)
diff := math.Abs(a - b)
if a == 0 || b == 0 || diff < minNormal {
if a == 0 || b == 0 || absSum < minNormal {
return diff < epsilon*minNormal
}
return diff/(math.Abs(a)+math.Abs(b)) < epsilon
return diff/math.Min(absSum, math.MaxFloat64) < epsilon
}
func parseNumber(s string) (float64, error) {

View file

@ -108,7 +108,7 @@ var (
MixedClassicNativeHistogramsWarning = fmt.Errorf("%w: vector contains a mix of classic and native histograms for metric name", PromQLWarning)
PossibleNonCounterInfo = fmt.Errorf("%w: metric might not be a counter, name does not end in _total/_sum/_count/_bucket:", PromQLInfo)
HistogramQuantileForcedMonotonicityInfo = fmt.Errorf("%w: input to histogram_quantile needed to be fixed for monotonicity (and may give inaccurate results) for metric name", PromQLInfo)
HistogramQuantileForcedMonotonicityInfo = fmt.Errorf("%w: input to histogram_quantile needed to be fixed for monotonicity (see https://prometheus.io/docs/prometheus/latest/querying/functions/#histogram_quantile) for metric name", PromQLInfo)
)
type annoErr struct {