Merge pull request #12931 from zenador/warning-non-monotonic-classic-histogram

Add warning when monotonicity is forced in the input to histogram_quantile
This commit is contained in:
Björn Rabenstein 2023-10-08 15:59:48 +02:00 committed by GitHub
commit b787e5f190
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 41 additions and 36 deletions

View file

@ -1168,10 +1168,14 @@ func funcHistogramQuantile(vals []parser.Value, args parser.Expressions, enh *Ev
for _, mb := range enh.signatureToMetricWithBuckets {
if len(mb.buckets) > 0 {
res, forcedMonotonicity := bucketQuantile(q, mb.buckets)
enh.Out = append(enh.Out, Sample{
Metric: mb.metric,
F: bucketQuantile(q, mb.buckets),
F: res,
})
if forcedMonotonicity {
annos.Add(annotations.NewHistogramQuantileForcedMonotonicityInfo(mb.metric.Get(labels.MetricName), args[1].PositionRange()))
}
}
}

View file

@ -71,15 +71,17 @@ type metricWithBuckets struct {
// If q<0, -Inf is returned.
//
// If q>1, +Inf is returned.
func bucketQuantile(q float64, buckets buckets) float64 {
//
// We also return a bool to indicate if monotonicity needed to be forced.
func bucketQuantile(q float64, buckets buckets) (float64, bool) {
if math.IsNaN(q) {
return math.NaN()
return math.NaN(), false
}
if q < 0 {
return math.Inf(-1)
return math.Inf(-1), false
}
if q > 1 {
return math.Inf(+1)
return math.Inf(+1), false
}
slices.SortFunc(buckets, func(a, b bucket) int {
// We don't expect the bucket boundary to be a NaN.
@ -92,27 +94,27 @@ func bucketQuantile(q float64, buckets buckets) float64 {
return 0
})
if !math.IsInf(buckets[len(buckets)-1].upperBound, +1) {
return math.NaN()
return math.NaN(), false
}
buckets = coalesceBuckets(buckets)
ensureMonotonic(buckets)
forcedMonotonic := ensureMonotonic(buckets)
if len(buckets) < 2 {
return math.NaN()
return math.NaN(), false
}
observations := buckets[len(buckets)-1].count
if observations == 0 {
return math.NaN()
return math.NaN(), false
}
rank := q * observations
b := sort.Search(len(buckets)-1, func(i int) bool { return buckets[i].count >= rank })
if b == len(buckets)-1 {
return buckets[len(buckets)-2].upperBound
return buckets[len(buckets)-2].upperBound, forcedMonotonic
}
if b == 0 && buckets[0].upperBound <= 0 {
return buckets[0].upperBound
return buckets[0].upperBound, forcedMonotonic
}
var (
bucketStart float64
@ -124,7 +126,7 @@ func bucketQuantile(q float64, buckets buckets) float64 {
count -= buckets[b-1].count
rank -= buckets[b-1].count
}
return bucketStart + (bucketEnd-bucketStart)*(rank/count)
return bucketStart + (bucketEnd-bucketStart)*(rank/count), forcedMonotonic
}
// histogramQuantile calculates the quantile 'q' based on the given histogram.
@ -342,37 +344,24 @@ func coalesceBuckets(buckets buckets) buckets {
// The assumption that bucket counts increase monotonically with increasing
// upperBound may be violated during:
//
// * Recording rule evaluation of histogram_quantile, especially when rate()
// has been applied to the underlying bucket timeseries.
// * Evaluation of histogram_quantile computed over federated bucket
// timeseries, especially when rate() has been applied.
//
// This is because scraped data is not made available to rule evaluation or
// federation atomically, so some buckets are computed with data from the
// most recent scrapes, but the other buckets are missing data from the most
// recent scrape.
// - Circumstances where data is already inconsistent at the target's side.
// - Ingestion via the remote write receiver that Prometheus implements.
// - Optimisation of query execution where precision is sacrificed for other
// benefits, not by Prometheus but by systems built on top of it.
//
// Monotonicity is usually guaranteed because if a bucket with upper bound
// u1 has count c1, then any bucket with a higher upper bound u > u1 must
// have counted all c1 observations and perhaps more, so that c >= c1.
//
// Randomly interspersed partial sampling breaks that guarantee, and rate()
// exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from
// 4 samples but the bucket with le=2000 has a count of 7 from 3 samples. The
// monotonicity is broken. It is exacerbated by rate() because under normal
// operation, cumulative counting of buckets will cause the bucket counts to
// diverge such that small differences from missing samples are not a problem.
// rate() removes this divergence.)
// have counted all c1 observations and perhaps more, so that c >= c1.
//
// bucketQuantile depends on that monotonicity to do a binary search for the
// bucket with the φ-quantile count, so breaking the monotonicity
// guarantee causes bucketQuantile() to return undefined (nonsense) results.
//
// As a somewhat hacky solution until ingestion is atomic per scrape, we
// calculate the "envelope" of the histogram buckets, essentially removing
// any decreases in the count between successive buckets.
func ensureMonotonic(buckets buckets) {
// As a somewhat hacky solution, we calculate the "envelope" of the histogram
// buckets, essentially removing any decreases in the count between successive
// buckets. We return a bool to indicate if this monotonicity was forced or not.
func ensureMonotonic(buckets buckets) bool {
forced := false
max := buckets[0].count
for i := 1; i < len(buckets); i++ {
switch {
@ -380,8 +369,10 @@ func ensureMonotonic(buckets buckets) {
max = buckets[i].count
case buckets[i].count < max:
buckets[i].count = max
forced = true
}
}
return forced
}
// quantile calculates the given quantile of a vector of samples.

View file

@ -105,7 +105,8 @@ var (
MixedFloatsHistogramsWarning = fmt.Errorf("%w: encountered a mix of histograms and floats for metric name", PromQLWarning)
MixedClassicNativeHistogramsWarning = fmt.Errorf("%w: vector contains a mix of classic and native histograms for metric name", PromQLWarning)
PossibleNonCounterInfo = fmt.Errorf("%w: metric might not be a counter, name does not end in _total/_sum/_count:", PromQLInfo)
PossibleNonCounterInfo = fmt.Errorf("%w: metric might not be a counter, name does not end in _total/_sum/_count:", PromQLInfo)
HistogramQuantileForcedMonotonicityInfo = fmt.Errorf("%w: input to histogram_quantile needed to be fixed for monotonicity (and may give inaccurate results) for metric name", PromQLInfo)
)
type annoErr struct {
@ -163,3 +164,12 @@ func NewPossibleNonCounterInfo(metricName string, pos posrange.PositionRange) an
Err: fmt.Errorf("%w %q", PossibleNonCounterInfo, metricName),
}
}
// NewHistogramQuantileForcedMonotonicityInfo is used when the input (classic histograms) to
// histogram_quantile needs to be forced to be monotonic.
func NewHistogramQuantileForcedMonotonicityInfo(metricName string, pos posrange.PositionRange) annoErr {
return annoErr{
PositionRange: pos,
Err: fmt.Errorf("%w %q", HistogramQuantileForcedMonotonicityInfo, metricName),
}
}