prometheus/promql/quantile.go

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package promql

import (
	"math"
	"slices"
	"sort"

	"github.com/prometheus/prometheus/model/histogram"
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/util/almost"
)

// smallDeltaTolerance is the threshold for relative deltas between classic
// histogram buckets that will be ignored by the histogram_quantile function
// because they are most likely artifacts of floating point precision issues.
// Testing on 2 sets of real data with bugs arising from small deltas,
// the safe ranges were from:
// - 1e-05 to 1e-15
// - 1e-06 to 1e-15
// Anything to the left of that would cause non-query-sharded data to have
// small deltas ignored (unnecessary and we should avoid this), and anything
// to the right of that would cause query-sharded data to not have its small
// deltas ignored (so the problem won't be fixed).
// For context, query sharding triggers these float precision errors in Mimir.
// To illustrate, with a relative deviation of 1e-12, we need to have 1e12
// observations in the bucket so that the change of one observation is small
// enough to get ignored. With the usual observation rate even of very busy
// services, this will hardly be reached in timeframes that matters for
// monitoring.
const smallDeltaTolerance = 1e-12

// Helpers to calculate quantiles.

// excludedLabels are the labels to exclude from signature calculation for
// quantiles.
var excludedLabels = []string{
	labels.MetricName,
	labels.BucketLabel,
}

type bucket struct {
	upperBound float64
	count      float64
}

// buckets implements sort.Interface.
type buckets []bucket

type metricWithBuckets struct {
	metric  labels.Labels
	buckets buckets
}

// bucketQuantile calculates the quantile 'q' based on the given buckets. The
// buckets will be sorted by upperBound by this function (i.e. no sorting
// needed before calling this function). The quantile value is interpolated
// assuming a linear distribution within a bucket. However, if the quantile
// falls into the highest bucket, the upper bound of the 2nd highest bucket is
// returned. A natural lower bound of 0 is assumed if the upper bound of the
// lowest bucket is greater 0. In that case, interpolation in the lowest bucket
// happens linearly between 0 and the upper bound of the lowest bucket.
// However, if the lowest bucket has an upper bound less or equal 0, this upper
// bound is returned if the quantile falls into the lowest bucket.
//
// There are a number of special cases (once we have a way to report errors
// happening during evaluations of AST functions, we should report those
// explicitly):
//
// If 'buckets' has 0 observations, NaN is returned.
//
// If 'buckets' has fewer than 2 elements, NaN is returned.
//
// If the highest bucket is not +Inf, NaN is returned.
//
// If q==NaN, NaN is returned.
//
// If q<0, -Inf is returned.
//
// If q>1, +Inf is returned.
//
// We also return a bool to indicate if monotonicity needed to be forced,
// and another bool to indicate if small differences between buckets (that
// are likely artifacts of floating point precision issues) have been
// ignored.
func bucketQuantile(q float64, buckets buckets) (float64, bool, bool) {
	if math.IsNaN(q) {
		return math.NaN(), false, false
	}
	if q < 0 {
		return math.Inf(-1), false, false
	}
	if q > 1 {
		return math.Inf(+1), false, false
	}
	slices.SortFunc(buckets, func(a, b bucket) int {
		// We don't expect the bucket boundary to be a NaN.
		if a.upperBound < b.upperBound {
			return -1
		}
		if a.upperBound > b.upperBound {
			return +1
		}
		return 0
	})
	if !math.IsInf(buckets[len(buckets)-1].upperBound, +1) {
		return math.NaN(), false, false
	}

	buckets = coalesceBuckets(buckets)
	forcedMonotonic, fixedPrecision := ensureMonotonicAndIgnoreSmallDeltas(buckets, smallDeltaTolerance)

	if len(buckets) < 2 {
		return math.NaN(), false, false
	}
	observations := buckets[len(buckets)-1].count
	if observations == 0 {
		return math.NaN(), false, false
	}
	rank := q * observations
	b := sort.Search(len(buckets)-1, func(i int) bool { return buckets[i].count >= rank })

	if b == len(buckets)-1 {
		return buckets[len(buckets)-2].upperBound, forcedMonotonic, fixedPrecision
	}
	if b == 0 && buckets[0].upperBound <= 0 {
		return buckets[0].upperBound, forcedMonotonic, fixedPrecision
	}
	var (
		bucketStart float64
		bucketEnd   = buckets[b].upperBound
		count       = buckets[b].count
	)
	if b > 0 {
		bucketStart = buckets[b-1].upperBound
		count -= buckets[b-1].count
		rank -= buckets[b-1].count
	}
	return bucketStart + (bucketEnd-bucketStart)*(rank/count), forcedMonotonic, fixedPrecision
}

// histogramQuantile calculates the quantile 'q' based on the given histogram.
//
// For custom buckets, the result is interpolated linearly, i.e. it is assumed
// the observations are uniformly distributed within each bucket. (This is a
// quite blunt assumption, but it is consistent with the interpolation method
// used for classic histograms so far.)
//
// For exponential buckets, the interpolation is done under the assumption that
// the samples within each bucket are distributed in a way that they would
// uniformly populate the buckets in a hypothetical histogram with higher
// resolution. For example, if the rank calculation suggests that the requested
// quantile is right in the middle of the population of the (1,2] bucket, we
// assume the quantile would be right at the bucket boundary between the two
// buckets the (1,2] bucket would be divided into if the histogram had double
// the resolution, which is 2**2**-1 = 1.4142... We call this exponential
// interpolation.
//
// However, for a quantile that ends up in the zero bucket, this method isn't
// very helpful (because there is an infinite number of buckets close to zero,
// so we would have to assume zero as the result). Therefore, we return to
// linear interpolation in the zero bucket.
//
// A natural lower bound of 0 is assumed if the histogram has only positive
// buckets. Likewise, a natural upper bound of 0 is assumed if the histogram has
// only negative buckets.
//
// There are a number of special cases:
//
// If the histogram has 0 observations, NaN is returned.
//
// If q<0, -Inf is returned.
//
// If q>1, +Inf is returned.
//
// If q is NaN, NaN is returned.
func histogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
	if q < 0 {
		return math.Inf(-1)
	}
	if q > 1 {
		return math.Inf(+1)
	}

	if h.Count == 0 || math.IsNaN(q) {
		return math.NaN()
	}

	var (
		bucket histogram.Bucket[float64]
		count  float64
		it     histogram.BucketIterator[float64]
		rank   float64
	)

	// If there are NaN observations in the histogram (h.Sum is NaN), use the forward iterator.
	// If q < 0.5, use the forward iterator.
	// If q >= 0.5, use the reverse iterator.
	if math.IsNaN(h.Sum) || q < 0.5 {
		it = h.AllBucketIterator()
		rank = q * h.Count
	} else {
		it = h.AllReverseBucketIterator()
		rank = (1 - q) * h.Count
	}

	for it.Next() {
		bucket = it.At()
		if bucket.Count == 0 {
			continue
		}
		count += bucket.Count
		if count >= rank {
			break
		}
	}
	if !h.UsesCustomBuckets() && bucket.Lower < 0 && bucket.Upper > 0 {
		switch {
		case len(h.NegativeBuckets) == 0 && len(h.PositiveBuckets) > 0:
			// The result is in the zero bucket and the histogram has only
			// positive buckets. So we consider 0 to be the lower bound.
			bucket.Lower = 0
		case len(h.PositiveBuckets) == 0 && len(h.NegativeBuckets) > 0:
			// The result is in the zero bucket and the histogram has only
			// negative buckets. So we consider 0 to be the upper bound.
			bucket.Upper = 0
		}
	} else if h.UsesCustomBuckets() {
		if bucket.Lower == math.Inf(-1) {
			// first bucket, with lower bound -Inf
			if bucket.Upper <= 0 {
				return bucket.Upper
			}
			bucket.Lower = 0
		} else if bucket.Upper == math.Inf(1) {
			// last bucket, with upper bound +Inf
			return bucket.Lower
		}
	}
	// Due to numerical inaccuracies, we could end up with a higher count
	// than h.Count. Thus, make sure count is never higher than h.Count.
	if count > h.Count {
		count = h.Count
	}
	// We could have hit the highest bucket without even reaching the rank
	// (this should only happen if the histogram contains observations of
	// the value NaN), in which case we simply return the upper limit of the
	// highest explicit bucket.
	if count < rank {
		return bucket.Upper
	}

	// NaN observations increase h.Count but not the total number of
	// observations in the buckets. Therefore, we have to use the forward
	// iterator to find percentiles. We recognize histograms containing NaN
	// observations by checking if their h.Sum is NaN.
	if math.IsNaN(h.Sum) || q < 0.5 {
		rank -= count - bucket.Count
	} else {
		rank = count - rank
	}

	// The fraction of how far we are into the current bucket.
	fraction := rank / bucket.Count

	// Return linear interpolation for custom buckets and for quantiles that
	// end up in the zero bucket.
	if h.UsesCustomBuckets() || (bucket.Lower <= 0 && bucket.Upper >= 0) {
		return bucket.Lower + (bucket.Upper-bucket.Lower)*fraction
	}

	// For exponential buckets, we interpolate on a logarithmic scale. On a
	// logarithmic scale, the exponential bucket boundaries (for any schema)
	// become linear (every bucket has the same width). Therefore, after
	// taking the logarithm of both bucket boundaries, we can use the
	// calculated fraction in the same way as for linear interpolation (see
	// above). Finally, we return to the normal scale by applying the
	// exponential function to the result.
	logLower := math.Log2(math.Abs(bucket.Lower))
	logUpper := math.Log2(math.Abs(bucket.Upper))
	if bucket.Lower > 0 { // Positive bucket.
		return math.Exp2(logLower + (logUpper-logLower)*fraction)
	}
	// Otherwise, we are in a negative bucket and have to mirror things.
	return -math.Exp2(logUpper + (logLower-logUpper)*(1-fraction))
}

// histogramFraction calculates the fraction of observations between the
// provided lower and upper bounds, based on the provided histogram.
//
// histogramFraction is in a certain way the inverse of histogramQuantile.  If
// histogramQuantile(0.9, h) returns 123.4, then histogramFraction(-Inf, 123.4, h)
// returns 0.9.
//
// The same notes with regard to interpolation and assumptions about the zero
// bucket boundaries apply as for histogramQuantile.
//
// Whether either boundary is inclusive or exclusive doesn’t actually matter as
// long as interpolation has to be performed anyway. In the case of a boundary
// coinciding with a bucket boundary, the inclusive or exclusive nature of the
// boundary determines the exact behavior of the threshold. With the current
// implementation, that means that lower is exclusive for positive values and
// inclusive for negative values, while upper is inclusive for positive values
// and exclusive for negative values.
//
// Special cases:
//
// If the histogram has 0 observations, NaN is returned.
//
// Use a lower bound of -Inf to get the fraction of all observations below the
// upper bound.
//
// Use an upper bound of +Inf to get the fraction of all observations above the
// lower bound.
//
// If lower or upper is NaN, NaN is returned.
//
// If lower >= upper and the histogram has at least 1 observation, zero is returned.
func histogramFraction(lower, upper float64, h *histogram.FloatHistogram) float64 {
	if h.Count == 0 || math.IsNaN(lower) || math.IsNaN(upper) {
		return math.NaN()
	}
	if lower >= upper {
		return 0
	}

	var (
		rank, lowerRank, upperRank float64
		lowerSet, upperSet         bool
		it                         = h.AllBucketIterator()
	)
	for it.Next() {
		b := it.At()
		zeroBucket := false

		// interpolateLinearly is used for custom buckets to be
		// consistent with the linear interpolation known from classic
		// histograms. It is also used for the zero bucket.
		interpolateLinearly := func(v float64) float64 {
			return rank + b.Count*(v-b.Lower)/(b.Upper-b.Lower)
		}

		// interpolateExponentially is using the same exponential
		// interpolation method as above for histogramQuantile. This
		// method is a better fit for exponential bucketing.
		interpolateExponentially := func(v float64) float64 {
			var (
				logLower = math.Log2(math.Abs(b.Lower))
				logUpper = math.Log2(math.Abs(b.Upper))
				logV     = math.Log2(math.Abs(v))
				fraction float64
			)
			if v > 0 {
				fraction = (logV - logLower) / (logUpper - logLower)
			} else {
				fraction = 1 - ((logV - logUpper) / (logLower - logUpper))
			}
			return rank + b.Count*fraction
		}

		if b.Lower <= 0 && b.Upper >= 0 {
			zeroBucket = true
			switch {
			case len(h.NegativeBuckets) == 0 && len(h.PositiveBuckets) > 0:
				// This is the zero bucket and the histogram has only
				// positive buckets. So we consider 0 to be the lower
				// bound.
				b.Lower = 0
			case len(h.PositiveBuckets) == 0 && len(h.NegativeBuckets) > 0:
				// This is in the zero bucket and the histogram has only
				// negative buckets. So we consider 0 to be the upper
				// bound.
				b.Upper = 0
			}
		}
		if !lowerSet && b.Lower >= lower {
			// We have hit the lower value at the lower bucket boundary.
			lowerRank = rank
			lowerSet = true
		}
		if !upperSet && b.Lower >= upper {
			// We have hit the upper value at the lower bucket boundary.
			upperRank = rank
			upperSet = true
		}
		if lowerSet && upperSet {
			break
		}
		if !lowerSet && b.Lower < lower && b.Upper > lower {
			// The lower value is in this bucket.
			if h.UsesCustomBuckets() || zeroBucket {
				lowerRank = interpolateLinearly(lower)
			} else {
				lowerRank = interpolateExponentially(lower)
			}
			lowerSet = true
		}
		if !upperSet && b.Lower < upper && b.Upper > upper {
			// The upper value is in this bucket.
			if h.UsesCustomBuckets() || zeroBucket {
				upperRank = interpolateLinearly(upper)
			} else {
				upperRank = interpolateExponentially(upper)
			}
			upperSet = true
		}
		if lowerSet && upperSet {
			break
		}
		rank += b.Count
	}
	if !lowerSet || lowerRank > h.Count {
		lowerRank = h.Count
	}
	if !upperSet || upperRank > h.Count {
		upperRank = h.Count
	}

	return (upperRank - lowerRank) / h.Count
}

// coalesceBuckets merges buckets with the same upper bound.
//
// The input buckets must be sorted.
func coalesceBuckets(buckets buckets) buckets {
	last := buckets[0]
	i := 0
	for _, b := range buckets[1:] {
		if b.upperBound == last.upperBound {
			last.count += b.count
		} else {
			buckets[i] = last
			last = b
			i++
		}
	}
	buckets[i] = last
	return buckets[:i+1]
}

// The assumption that bucket counts increase monotonically with increasing
// upperBound may be violated during:
//
//   - Circumstances where data is already inconsistent at the target's side.
//   - Ingestion via the remote write receiver that Prometheus implements.
//   - Optimisation of query execution where precision is sacrificed for other
//     benefits, not by Prometheus but by systems built on top of it.
//   - Circumstances where floating point precision errors accumulate.
//
// Monotonicity is usually guaranteed because if a bucket with upper bound
// u1 has count c1, then any bucket with a higher upper bound u > u1 must
// have counted all c1 observations and perhaps more, so that c >= c1.
//
// bucketQuantile depends on that monotonicity to do a binary search for the
// bucket with the φ-quantile count, so breaking the monotonicity
// guarantee causes bucketQuantile() to return undefined (nonsense) results.
//
// As a somewhat hacky solution, we first silently ignore any numerically
// insignificant (relative delta below the requested tolerance and likely to
// be from floating point precision errors) differences between successive
// buckets regardless of the direction. Then we calculate the "envelope" of
// the histogram buckets, essentially removing any decreases in the count
// between successive buckets.
//
// We return a bool to indicate if this monotonicity was forced or not, and
// another bool to indicate if small deltas were ignored or not.
func ensureMonotonicAndIgnoreSmallDeltas(buckets buckets, tolerance float64) (bool, bool) {
	var forcedMonotonic, fixedPrecision bool
	prev := buckets[0].count
	for i := 1; i < len(buckets); i++ {
		curr := buckets[i].count // Assumed always positive.
		if curr == prev {
			// No correction needed if the counts are identical between buckets.
			continue
		}
		if almost.Equal(prev, curr, tolerance) {
			// Silently correct numerically insignificant differences from floating
			// point precision errors, regardless of direction.
			// Do not update the 'prev' value as we are ignoring the difference.
			buckets[i].count = prev
			fixedPrecision = true
			continue
		}
		if curr < prev {
			// Force monotonicity by removing any decreases regardless of magnitude.
			// Do not update the 'prev' value as we are ignoring the decrease.
			buckets[i].count = prev
			forcedMonotonic = true
			continue
		}
		prev = curr
	}
	return forcedMonotonic, fixedPrecision
}

// quantile calculates the given quantile of a vector of samples.
//
// The Vector will be sorted.
// If 'values' has zero elements, NaN is returned.
// If q==NaN, NaN is returned.
// If q<0, -Inf is returned.
// If q>1, +Inf is returned.
func quantile(q float64, values vectorByValueHeap) float64 {
	if len(values) == 0 || math.IsNaN(q) {
		return math.NaN()
	}
	if q < 0 {
		return math.Inf(-1)
	}
	if q > 1 {
		return math.Inf(+1)
	}
	sort.Sort(values)

	n := float64(len(values))
	// When the quantile lies between two samples,
	// we use a weighted average of the two samples.
	rank := q * (n - 1)

	lowerIndex := math.Max(0, math.Floor(rank))
	upperIndex := math.Min(n-1, lowerIndex+1)

	weight := rank - math.Floor(rank)
	return values[int(lowerIndex)].F*(1-weight) + values[int(upperIndex)].F*weight
}
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+								// Copyright 2015 The Prometheus Authors
 								// Licensed under the Apache License, Version 2.0 (the "License");
 								// you may not use this file except in compliance with the License.
 								// You may obtain a copy of the License at
 								//
 								// http://www.apache.org/licenses/LICENSE-2.0
 								//
 								// Unless required by applicable law or agreed to in writing, software
 								// distributed under the License is distributed on an "AS IS" BASIS,
 								// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								// See the License for the specific language governing permissions and
 								// limitations under the License.
 								package promql
 								import (
 									"math"
-												Move from golang.org/x/exp/slices into slices now that we only support Go >= 1.21

Prevent adding back golang.org/x/exp/slices.

Signed-off-by: machine424 <ayoubmrini424@gmail.com>

											
										
										
											2024-01-15 08:24:46 -08:00
+									"slices"
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+									"sort"
-												`histogram_quantile` for sparse histograms (#9935)

* MergeFloatBucketIterator for []FloatBucketIterator

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* histogram_quantile for histograms

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix histogram_quantile

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Unit test and enhancements

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Iterators to iterate buckets in reverse and all buckets together including zero bucket

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Consider all buckets for histogram_quantile and fix the implementation

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unneeded code

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-12-06 05:47:22 -08:00
+									"github.com/prometheus/prometheus/model/histogram"
-												Move packages out of deprecated pkg directory

This creates a new `model` directory and moves all data-model related
packages over there:
  exemplar labels relabel rulefmt textparse timestamp value

All the others are more or less utilities and have been moved to `util`:
  gate logging modetimevfs pool runtime

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2021-11-08 06:23:17 -08:00
+									"github.com/prometheus/prometheus/model/labels"
-												refactor: extract almost.Equal() to new package

To avoid a circular reference between promql and promqltest.

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>

											
										
										
											2024-04-29 04:37:32 -07:00
+									"github.com/prometheus/prometheus/util/almost"
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+								)
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+								// smallDeltaTolerance is the threshold for relative deltas between classic
 								// histogram buckets that will be ignored by the histogram_quantile function
 								// because they are most likely artifacts of floating point precision issues.
 								// Testing on 2 sets of real data with bugs arising from small deltas,
 								// the safe ranges were from:
 								// - 1e-05 to 1e-15
 								// - 1e-06 to 1e-15
 								// Anything to the left of that would cause non-query-sharded data to have
 								// small deltas ignored (unnecessary and we should avoid this), and anything
 								// to the right of that would cause query-sharded data to not have its small
 								// deltas ignored (so the problem won't be fixed).
 								// For context, query sharding triggers these float precision errors in Mimir.
 								// To illustrate, with a relative deviation of 1e-12, we need to have 1e12
 								// observations in the bucket so that the change of one observation is small
 								// enough to get ignored. With the usual observation rate even of very busy
 								// services, this will hardly be reached in timeframes that matters for
 								// monitoring.
 								const smallDeltaTolerance = 1e-12
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+								// Helpers to calculate quantiles.
 								// excludedLabels are the labels to exclude from signature calculation for
 								// quantiles.
-												promql: migrate model types, use tsdb interfaces

											
										
										
											2016-12-23 04:51:59 -08:00
+								var excludedLabels = []string{
-												promql: use labels.Builder to modify labels

											
										
										
											2016-12-24 05:35:24 -08:00
+									labels.MetricName,
 									labels.BucketLabel,
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+								}
 								type bucket struct {
 									upperBound float64
-												promql: migrate model types, use tsdb interfaces

											
										
										
											2016-12-23 04:51:59 -08:00
+									count      float64
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+								}
 								// buckets implements sort.Interface.
 								type buckets []bucket
 								type metricWithBuckets struct {
-												promql: migrate model types, use tsdb interfaces

											
										
										
											2016-12-23 04:51:59 -08:00
+									metric  labels.Labels
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+									buckets buckets
 								}
-												Factor out quantile fucntion.

											
										
										
											2016-07-08 05:33:20 -07:00
+								// bucketQuantile calculates the quantile 'q' based on the given buckets. The
 								// buckets will be sorted by upperBound by this function (i.e. no sorting
 								// needed before calling this function). The quantile value is interpolated
 								// assuming a linear distribution within a bucket. However, if the quantile
 								// falls into the highest bucket, the upper bound of the 2nd highest bucket is
 								// returned. A natural lower bound of 0 is assumed if the upper bound of the
 								// lowest bucket is greater 0. In that case, interpolation in the lowest bucket
 								// happens linearly between 0 and the upper bound of the lowest bucket.
 								// However, if the lowest bucket has an upper bound less or equal 0, this upper
 								// bound is returned if the quantile falls into the lowest bucket.
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+								//
 								// There are a number of special cases (once we have a way to report errors
 								// happening during evaluations of AST functions, we should report those
 								// explicitly):
 								//
-												Return NaN for histogram_quantile when buckets have 0 observations (#7318)

Signed-off-by: jberny <f.bernardi89@gmail.com>
											
										
										
											2020-06-01 01:40:39 -07:00
+								// If 'buckets' has 0 observations, NaN is returned.
 								//
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+								// If 'buckets' has fewer than 2 elements, NaN is returned.
 								//
 								// If the highest bucket is not +Inf, NaN is returned.
 								//
-												add doc

Signed-off-by: jyz0309 <45495947@qq.com>

											
										
										
											2022-02-13 05:59:03 -08:00
+								// If q==NaN, NaN is returned.
 								//
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+								// If q<0, -Inf is returned.
 								//
 								// If q>1, +Inf is returned.
-												Add warning when monotonicity is forced in the input to histogram_quantile

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>

											
										
										
											2023-10-04 03:53:55 -07:00
+								//
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+								// We also return a bool to indicate if monotonicity needed to be forced,
 								// and another bool to indicate if small differences between buckets (that
 								// are likely artifacts of floating point precision issues) have been
 								// ignored.
 								func bucketQuantile(q float64, buckets buckets) (float64, bool, bool) {
-												add doc

Signed-off-by: jyz0309 <45495947@qq.com>

											
										
										
											2022-02-13 05:59:03 -08:00
+									if math.IsNaN(q) {
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+										return math.NaN(), false, false
-												add NaN case

Signed-off-by: jyz0309 <45495947@qq.com>

											
										
										
											2022-02-13 05:41:28 -08:00
+									}
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+									if q < 0 {
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+										return math.Inf(-1), false, false
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+									}
 									if q > 1 {
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+										return math.Inf(+1), false, false
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+									}
-												Update exp package (#12650)


											
										
										
											2023-09-21 13:53:51 -07:00
+									slices.SortFunc(buckets, func(a, b bucket) int {
 										// We don't expect the bucket boundary to be a NaN.
 										if a.upperBound < b.upperBound {
 											return -1
 										}
 										if a.upperBound > b.upperBound {
 											return +1
 										}
 										return 0
-												Replace sort.Sort with faster slices.SortFunc

The generic version is more efficient.

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>

											
										
										
											2023-07-08 05:45:56 -07:00
+									})
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+									if !math.IsInf(buckets[len(buckets)-1].upperBound, +1) {
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+										return math.NaN(), false, false
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+									}
-												In histogram_quantile merge buckets with equivalent le values (#5158)

This makes things generally more resilient, and will
help with OpenMetrics transitions (and inconsistencies).

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-02-01 02:22:44 -08:00
+									buckets = coalesceBuckets(buckets)
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+									forcedMonotonic, fixedPrecision := ensureMonotonicAndIgnoreSmallDeltas(buckets, smallDeltaTolerance)
-												Force buckets in a histogram to be monotonic for quantile estimation (#2610)

* Force buckets in a histogram to be monotonic for quantile estimation

The assumption that bucket counts increase monotonically with increasing
upperBound may be violated during:

  * Recording rule evaluation of histogram_quantile, especially when rate()
     has been applied to the underlying bucket timeseries.
  * Evaluation of histogram_quantile computed over federated bucket
     timeseries, especially when rate() has been applied

This is because scraped data is not made available to RR evalution or
federation atomically, so some buckets are computed with data from the N
most recent scrapes, but the other buckets are missing the most recent
observations.

Monotonicity is usually guaranteed because if a bucket with upper bound
u1 has count c1, then any bucket with a higher upper bound u > u1 must
have counted all c1 observations and perhaps more, so that c  >= c1.

Randomly interspersed partial sampling breaks that guarantee, and rate()
exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from
4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The
monotonicity is broken. It is exacerbated by rate() because under normal
operation, cumulative counting of buckets will cause the bucket counts to
diverge such that small differences from missing samples are not a problem.
rate() removes this divergence.)

bucketQuantile depends on that monotonicity to do a binary search for the
bucket with the qth percentile count, so breaking the monotonicity
guarantee causes bucketQuantile() to return undefined (nonsense) results.

As a somewhat hacky solution until the Prometheus project is ready to
accept the changes required to make scrapes atomic, we calculate the
"envelope" of the histogram buckets, essentially removing any decreases
in the count between successive buckets.

* Fix up comment docs for ensureMonotonic

* ensureMonotonic: Use switch statement

Use switch statement rather than if/else for better readability.
Process the most frequent cases first.

											
										
										
											2017-04-14 07:21:49 -07:00
-												In histogram_quantile merge buckets with equivalent le values (#5158)

This makes things generally more resilient, and will
help with OpenMetrics transitions (and inconsistencies).

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-02-01 02:22:44 -08:00
+									if len(buckets) < 2 {
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+										return math.NaN(), false, false
-												In histogram_quantile merge buckets with equivalent le values (#5158)

This makes things generally more resilient, and will
help with OpenMetrics transitions (and inconsistencies).

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-02-01 02:22:44 -08:00
+									}
-												Return NaN for histogram_quantile when buckets have 0 observations (#7318)

Signed-off-by: jberny <f.bernardi89@gmail.com>
											
										
										
											2020-06-01 01:40:39 -07:00
+									observations := buckets[len(buckets)-1].count
 									if observations == 0 {
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+										return math.NaN(), false, false
-												Return NaN for histogram_quantile when buckets have 0 observations (#7318)

Signed-off-by: jberny <f.bernardi89@gmail.com>
											
										
										
											2020-06-01 01:40:39 -07:00
+									}
 									rank := q * observations
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+									b := sort.Search(len(buckets)-1, func(i int) bool { return buckets[i].count >= rank })
 									if b == len(buckets)-1 {
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+										return buckets[len(buckets)-2].upperBound, forcedMonotonic, fixedPrecision
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+									}
 									if b == 0 && buckets[0].upperBound <= 0 {
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+										return buckets[0].upperBound, forcedMonotonic, fixedPrecision
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+									}
 									var (
 										bucketStart float64
 										bucketEnd   = buckets[b].upperBound
 										count       = buckets[b].count
 									)
 									if b > 0 {
 										bucketStart = buckets[b-1].upperBound
 										count -= buckets[b-1].count
 										rank -= buckets[b-1].count
 									}
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+									return bucketStart + (bucketEnd-bucketStart)*(rank/count), forcedMonotonic, fixedPrecision
-												Refactor query evaluation.

This copies the evaluation logic from the current rules/ package.
The new engine handles the execution process from query string to final result.
It provides query timeout and cancellation and general flexibility for
future changes.

functions.go: Add evaluation implementation. Slight changes to in/out data but
	not to the processing logic.
quantile.go: No changes.
analyzer.go: No changes.
engine.go: Actually new part. Mainly consists of evaluation methods
	which were not changed.
setup_test.go: Copy of rules/helpers_test.go to setup test storage.
promql_test.go: Copy of rules/rules_test.go.

											
										
										
											2015-03-30 10:13:36 -07:00
+								}
-												Factor out quantile fucntion.

											
										
										
											2016-07-08 05:33:20 -07:00
-												`histogram_quantile` for sparse histograms (#9935)

* MergeFloatBucketIterator for []FloatBucketIterator

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* histogram_quantile for histograms

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix histogram_quantile

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Unit test and enhancements

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Iterators to iterate buckets in reverse and all buckets together including zero bucket

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Consider all buckets for histogram_quantile and fix the implementation

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unneeded code

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-12-06 05:47:22 -08:00
+								// histogramQuantile calculates the quantile 'q' based on the given histogram.
-												histograms: Doc comment and naming improvements

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2021-12-15 07:50:37 -08:00
+								//
-												promql(native histograms): Introduce exponential interpolation

The linear interpolation (assuming that observations are uniformly
distributed within a bucket) is a solid and simple assumption in lack
of any other information. However, the exponential bucketing used by
standard schemas of native histograms has been chosen to cover the
whole range of observations in a way that bucket populations are
spread out over buckets in a reasonably way for typical distributions
encountered in real-world scenarios.

This is the origin of the idea implemented here: If we divide a given
bucket into two (or more) smaller exponential buckets, we "most
naturally" expect that the samples in the original buckets will split
among those smaller buckets in a more or less uniform fashion. With
this assumption, we end up with an "exponential interpolation", which
therefore appears to be a better match for histograms with exponential
bucketing.

This commit leaves the linear interpolation in place for NHCB, but
changes the interpolation for exponential native histograms to
exponential. This affects `histogram_quantile` and
`histogram_fraction` (because the latter is more or less the inverse
of the former).

The zero bucket has to be treated specially because the assumption
above would lead to an "interpolation to zero" (the bucket density
approaches infinity around zero, and with the postulated uniform usage
of buckets, we would end up with an estimate of zero for all quantiles
ending up in the zero bucket). We simply fall back to linear
interpolation within the zero bucket.

At the same time, this commit makes the call to stick with the
assumption that the zero bucket only contains positive observations
for native histograms without negative buckets (and vice versa). (This
is an assumption relevant for interpolation. It is a mostly academic
point, as the zero bucket is supposed to be very small anyway.
However, in cases where it _is_ relevantly broad, the assumption helps
a lot in practice.)

This commit also updates and completes the documentation to match both
details about interpolation.

As a more high level note: The approach here attempts to strike a
balance between a more simplistic approach without any assumption, and
a more involved approach with more sophisticated assumptions. I will
shortly describe both for reference:

The "zero assumption" approach would be to not interpolate at all, but
_always_ return the harmonic mean of the bucket boundaries of the
bucket the quantile ends up in. This has the advantage of minimizing
the maximum possible relative error of the quantile estimation.
(Depending on the exact definition of the relative error of an
estimation, there is also an argument to return the arithmetic mean of
the bucket boundaries.) While limiting the maximum possible relative
error is a good property, this approach would throw away the
information if a quantile is closer to the upper or lower end of the
population within a bucket. This can be valuable trending information
in a dashboard. With any kind of interpolation, the maximum possible
error of a quantile estimation increases to the full width of a bucket
(i.e. it more than doubles for the harmonic mean approach, and
precisely doubles for the arithmetic mean approach). However, in
return the _expectation value_ of the error decreases. The increase of
the theoretical maximum only has practical relevance for pathologic
distributions. For example, if there are thousand observations within
a bucket, they could _all_ be at the upper bound of the bucket. If the
quantile calculation picks the 1st observation in the bucket as the
relevant one, an interpolation will yield a value close to the lower
bucket boundary, while the true quantile value is close to the upper
boundary.

The "fancy interpolation" approach would be one that analyses the
_actual_ distribution of samples in the histogram. A lot of statistics
could be applied based on the information we have available in the
histogram. This would include the population of neighboring (or even
all) buckets in the histogram. In general, the resolution of a native
histogram should be quite high, and therefore, those "fancy"
approaches would increase the computational cost quite a bit with very
little practical benefits (i.e. just tiny corrections of the estimated
quantile value). The results are also much harder to reason with.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2024-08-15 05:19:16 -07:00
+								// For custom buckets, the result is interpolated linearly, i.e. it is assumed
 								// the observations are uniformly distributed within each bucket. (This is a
 								// quite blunt assumption, but it is consistent with the interpolation method
 								// used for classic histograms so far.)
 								//
 								// For exponential buckets, the interpolation is done under the assumption that
 								// the samples within each bucket are distributed in a way that they would
 								// uniformly populate the buckets in a hypothetical histogram with higher
 								// resolution. For example, if the rank calculation suggests that the requested
 								// quantile is right in the middle of the population of the (1,2] bucket, we
 								// assume the quantile would be right at the bucket boundary between the two
 								// buckets the (1,2] bucket would be divided into if the histogram had double
 								// the resolution, which is 2**2**-1 = 1.4142... We call this exponential
 								// interpolation.
 								//
 								// However, for a quantile that ends up in the zero bucket, this method isn't
 								// very helpful (because there is an infinite number of buckets close to zero,
 								// so we would have to assume zero as the result). Therefore, we return to
 								// linear interpolation in the zero bucket.
-												histograms: Doc comment and naming improvements

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2021-12-15 07:50:37 -08:00
+								//
-												promql: Refine zero bucket treatment in histogramQuantile

Essentially, this mirrors the existing behavior for negative buckets:
If a histogram has only negative buckets, the upper bound of the zero
bucket is assumed to be zero.

Furthermore, it makes sure that the zero bucket boundaries are not
modified if a histogram that has no buckets at all but samples in the
zero bucket.

Also, add an TODO to vet if we really want this behavior.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2022-06-16 09:54:28 -07:00
+								// A natural lower bound of 0 is assumed if the histogram has only positive
 								// buckets. Likewise, a natural upper bound of 0 is assumed if the histogram has
 								// only negative buckets.
-												`histogram_quantile` for sparse histograms (#9935)

* MergeFloatBucketIterator for []FloatBucketIterator

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* histogram_quantile for histograms

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix histogram_quantile

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Unit test and enhancements

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Iterators to iterate buckets in reverse and all buckets together including zero bucket

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Consider all buckets for histogram_quantile and fix the implementation

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unneeded code

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-12-06 05:47:22 -08:00
+								//
-												promql(native histograms): Introduce exponential interpolation

The linear interpolation (assuming that observations are uniformly
distributed within a bucket) is a solid and simple assumption in lack
of any other information. However, the exponential bucketing used by
standard schemas of native histograms has been chosen to cover the
whole range of observations in a way that bucket populations are
spread out over buckets in a reasonably way for typical distributions
encountered in real-world scenarios.

This is the origin of the idea implemented here: If we divide a given
bucket into two (or more) smaller exponential buckets, we "most
naturally" expect that the samples in the original buckets will split
among those smaller buckets in a more or less uniform fashion. With
this assumption, we end up with an "exponential interpolation", which
therefore appears to be a better match for histograms with exponential
bucketing.

This commit leaves the linear interpolation in place for NHCB, but
changes the interpolation for exponential native histograms to
exponential. This affects `histogram_quantile` and
`histogram_fraction` (because the latter is more or less the inverse
of the former).

The zero bucket has to be treated specially because the assumption
above would lead to an "interpolation to zero" (the bucket density
approaches infinity around zero, and with the postulated uniform usage
of buckets, we would end up with an estimate of zero for all quantiles
ending up in the zero bucket). We simply fall back to linear
interpolation within the zero bucket.

At the same time, this commit makes the call to stick with the
assumption that the zero bucket only contains positive observations
for native histograms without negative buckets (and vice versa). (This
is an assumption relevant for interpolation. It is a mostly academic
point, as the zero bucket is supposed to be very small anyway.
However, in cases where it _is_ relevantly broad, the assumption helps
a lot in practice.)

This commit also updates and completes the documentation to match both
details about interpolation.

As a more high level note: The approach here attempts to strike a
balance between a more simplistic approach without any assumption, and
a more involved approach with more sophisticated assumptions. I will
shortly describe both for reference:

The "zero assumption" approach would be to not interpolate at all, but
_always_ return the harmonic mean of the bucket boundaries of the
bucket the quantile ends up in. This has the advantage of minimizing
the maximum possible relative error of the quantile estimation.
(Depending on the exact definition of the relative error of an
estimation, there is also an argument to return the arithmetic mean of
the bucket boundaries.) While limiting the maximum possible relative
error is a good property, this approach would throw away the
information if a quantile is closer to the upper or lower end of the
population within a bucket. This can be valuable trending information
in a dashboard. With any kind of interpolation, the maximum possible
error of a quantile estimation increases to the full width of a bucket
(i.e. it more than doubles for the harmonic mean approach, and
precisely doubles for the arithmetic mean approach). However, in
return the _expectation value_ of the error decreases. The increase of
the theoretical maximum only has practical relevance for pathologic
distributions. For example, if there are thousand observations within
a bucket, they could _all_ be at the upper bound of the bucket. If the
quantile calculation picks the 1st observation in the bucket as the
relevant one, an interpolation will yield a value close to the lower
bucket boundary, while the true quantile value is close to the upper
boundary.

The "fancy interpolation" approach would be one that analyses the
_actual_ distribution of samples in the histogram. A lot of statistics
could be applied based on the information we have available in the
histogram. This would include the population of neighboring (or even
all) buckets in the histogram. In general, the resolution of a native
histogram should be quite high, and therefore, those "fancy"
approaches would increase the computational cost quite a bit with very
little practical benefits (i.e. just tiny corrections of the estimated
quantile value). The results are also much harder to reason with.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2024-08-15 05:19:16 -07:00
+								// There are a number of special cases:
-												`histogram_quantile` for sparse histograms (#9935)

* MergeFloatBucketIterator for []FloatBucketIterator

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* histogram_quantile for histograms

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix histogram_quantile

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Unit test and enhancements

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Iterators to iterate buckets in reverse and all buckets together including zero bucket

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Consider all buckets for histogram_quantile and fix the implementation

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unneeded code

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-12-06 05:47:22 -08:00
+								//
-												histograms: Doc comment and naming improvements

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2021-12-15 07:50:37 -08:00
+								// If the histogram has 0 observations, NaN is returned.
-												`histogram_quantile` for sparse histograms (#9935)

* MergeFloatBucketIterator for []FloatBucketIterator

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* histogram_quantile for histograms

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix histogram_quantile

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Unit test and enhancements

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Iterators to iterate buckets in reverse and all buckets together including zero bucket

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Consider all buckets for histogram_quantile and fix the implementation

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unneeded code

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-12-06 05:47:22 -08:00
+								//
 								// If q<0, -Inf is returned.
 								//
-												histograms: Doc comment and naming improvements

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2021-12-15 07:50:37 -08:00
+								// If q>1, +Inf is returned.
-												promql: Add histogram_fraction function

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2022-06-16 11:44:12 -07:00
+								//
 								// If q is NaN, NaN is returned.
-												`histogram_quantile` for sparse histograms (#9935)

* MergeFloatBucketIterator for []FloatBucketIterator

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* histogram_quantile for histograms

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix histogram_quantile

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Unit test and enhancements

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Iterators to iterate buckets in reverse and all buckets together including zero bucket

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Consider all buckets for histogram_quantile and fix the implementation

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unneeded code

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-12-06 05:47:22 -08:00
+								func histogramQuantile(q float64, h *histogram.FloatHistogram) float64 {
 									if q < 0 {
 										return math.Inf(-1)
 									}
 									if q > 1 {
 										return math.Inf(+1)
 									}
-												promql: Add histogram_fraction function

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2022-06-16 11:44:12 -07:00
+									if h.Count == 0 || math.IsNaN(q) {
-												`histogram_quantile` for sparse histograms (#9935)

* MergeFloatBucketIterator for []FloatBucketIterator

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* histogram_quantile for histograms

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix histogram_quantile

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Unit test and enhancements

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Iterators to iterate buckets in reverse and all buckets together including zero bucket

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Consider all buckets for histogram_quantile and fix the implementation

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unneeded code

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-12-06 05:47:22 -08:00
+										return math.NaN()
 									}
 									var (
-												histogram: Remove code replication via generics (#11361)

* histogram: Simplify iterators

We don't really need currLower and currUpper and can calculate it when
needed (as already done for the floatBucketIterator). The calculation
is cheap, while keeping those extra variables around costs RAM
(potentially a lot with many iterators).

* histogram: Convert Bucket/FloatBucket to one generic type

* histogram: Move some bucket iterator code into generic base iterator

* histogram: Remove cumulative iterator for FloatHistogram

We added it in the past for completeness (Histogram has one), but it
has never been used. Plus, even the cumulative iterator for Histogram
is only there for test reasons.

We can always add it back, and then maybe even using generics.

Signed-off-by: beorn7 <beorn@grafana.com>
											
										
										
											2022-10-03 04:15:27 -07:00
+										bucket histogram.Bucket[float64]
-												`histogram_quantile` for sparse histograms (#9935)

* MergeFloatBucketIterator for []FloatBucketIterator

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* histogram_quantile for histograms

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix histogram_quantile

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Unit test and enhancements

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Iterators to iterate buckets in reverse and all buckets together including zero bucket

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Consider all buckets for histogram_quantile and fix the implementation

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unneeded code

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-12-06 05:47:22 -08:00
+										count  float64
-												enhance histogram_quantile to get min/max value

Signed-off-by: Ziqi Zhao <zhaoziqi9146@gmail.com>

											
										
										
											2023-07-05 04:05:53 -07:00
+										it     histogram.BucketIterator[float64]
 										rank   float64
-												`histogram_quantile` for sparse histograms (#9935)

* MergeFloatBucketIterator for []FloatBucketIterator

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* histogram_quantile for histograms

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix histogram_quantile

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Unit test and enhancements

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Iterators to iterate buckets in reverse and all buckets together including zero bucket

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Consider all buckets for histogram_quantile and fix the implementation

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unneeded code

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-12-06 05:47:22 -08:00
+									)
-												enhance histogram_quantile to get min/max value

Signed-off-by: Ziqi Zhao <zhaoziqi9146@gmail.com>

											
										
										
											2023-07-05 04:05:53 -07:00
-												promql(native histograms): Introduce exponential interpolation

The linear interpolation (assuming that observations are uniformly
distributed within a bucket) is a solid and simple assumption in lack
of any other information. However, the exponential bucketing used by
standard schemas of native histograms has been chosen to cover the
whole range of observations in a way that bucket populations are
spread out over buckets in a reasonably way for typical distributions
encountered in real-world scenarios.

This is the origin of the idea implemented here: If we divide a given
bucket into two (or more) smaller exponential buckets, we "most
naturally" expect that the samples in the original buckets will split
among those smaller buckets in a more or less uniform fashion. With
this assumption, we end up with an "exponential interpolation", which
therefore appears to be a better match for histograms with exponential
bucketing.

This commit leaves the linear interpolation in place for NHCB, but
changes the interpolation for exponential native histograms to
exponential. This affects `histogram_quantile` and
`histogram_fraction` (because the latter is more or less the inverse
of the former).

The zero bucket has to be treated specially because the assumption
above would lead to an "interpolation to zero" (the bucket density
approaches infinity around zero, and with the postulated uniform usage
of buckets, we would end up with an estimate of zero for all quantiles
ending up in the zero bucket). We simply fall back to linear
interpolation within the zero bucket.

At the same time, this commit makes the call to stick with the
assumption that the zero bucket only contains positive observations
for native histograms without negative buckets (and vice versa). (This
is an assumption relevant for interpolation. It is a mostly academic
point, as the zero bucket is supposed to be very small anyway.
However, in cases where it _is_ relevantly broad, the assumption helps
a lot in practice.)

This commit also updates and completes the documentation to match both
details about interpolation.

As a more high level note: The approach here attempts to strike a
balance between a more simplistic approach without any assumption, and
a more involved approach with more sophisticated assumptions. I will
shortly describe both for reference:

The "zero assumption" approach would be to not interpolate at all, but
_always_ return the harmonic mean of the bucket boundaries of the
bucket the quantile ends up in. This has the advantage of minimizing
the maximum possible relative error of the quantile estimation.
(Depending on the exact definition of the relative error of an
estimation, there is also an argument to return the arithmetic mean of
the bucket boundaries.) While limiting the maximum possible relative
error is a good property, this approach would throw away the
information if a quantile is closer to the upper or lower end of the
population within a bucket. This can be valuable trending information
in a dashboard. With any kind of interpolation, the maximum possible
error of a quantile estimation increases to the full width of a bucket
(i.e. it more than doubles for the harmonic mean approach, and
precisely doubles for the arithmetic mean approach). However, in
return the _expectation value_ of the error decreases. The increase of
the theoretical maximum only has practical relevance for pathologic
distributions. For example, if there are thousand observations within
a bucket, they could _all_ be at the upper bound of the bucket. If the
quantile calculation picks the 1st observation in the bucket as the
relevant one, an interpolation will yield a value close to the lower
bucket boundary, while the true quantile value is close to the upper
boundary.

The "fancy interpolation" approach would be one that analyses the
_actual_ distribution of samples in the histogram. A lot of statistics
could be applied based on the information we have available in the
histogram. This would include the population of neighboring (or even
all) buckets in the histogram. In general, the resolution of a native
histogram should be quite high, and therefore, those "fancy"
approaches would increase the computational cost quite a bit with very
little practical benefits (i.e. just tiny corrections of the estimated
quantile value). The results are also much harder to reason with.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2024-08-15 05:19:16 -07:00
+									// If there are NaN observations in the histogram (h.Sum is NaN), use the forward iterator.
 									// If q < 0.5, use the forward iterator.
 									// If q >= 0.5, use the reverse iterator.
-												enhance histogram_quantile to get min/max value

Signed-off-by: Ziqi Zhao <zhaoziqi9146@gmail.com>

											
										
										
											2023-07-05 04:05:53 -07:00
+									if math.IsNaN(h.Sum) || q < 0.5 {
 										it = h.AllBucketIterator()
 										rank = q * h.Count
 									} else {
 										it = h.AllReverseBucketIterator()
 										rank = (1 - q) * h.Count
 									}
-												`histogram_quantile` for sparse histograms (#9935)

* MergeFloatBucketIterator for []FloatBucketIterator

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* histogram_quantile for histograms

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix histogram_quantile

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Unit test and enhancements

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Iterators to iterate buckets in reverse and all buckets together including zero bucket

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Consider all buckets for histogram_quantile and fix the implementation

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unneeded code

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-12-06 05:47:22 -08:00
+									for it.Next() {
-												promql: Tweak histogramQuantile

- Simplify the code a bit.

- Cover more corner cases.

- Remove TODO for negative buckets. (I think they are handled. Tests
  will reveal if not.)

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2021-12-15 08:39:32 -08:00
+										bucket = it.At()
-												[nhcb branch] Add basic unit tests for native histograms with custom buckets converted from classic histograms (#13794)

* modify unit test framework to automatically generate native histograms with custom buckets from classic histogram series
* add very basic tests for classic histogram converted into native histogram with custom bounds
* fix histogram_quantile for native histograms with custom buckets
* make loading with nhcb explicit
* evaluate native histograms with custom buckets on queries with explicit keyword
* use regex replacer
* use temp histogram struct for automatically loading converted nhcb

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
Signed-off-by: George Krajcsovits <krajorama@users.noreply.github.com>

											
										
										
											2024-04-24 00:36:05 -07:00
+										if bucket.Count == 0 {
 											continue
 										}
-												promql: Tweak histogramQuantile

- Simplify the code a bit.

- Cover more corner cases.

- Remove TODO for negative buckets. (I think they are handled. Tests
  will reveal if not.)

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2021-12-15 08:39:32 -08:00
+										count += bucket.Count
-												`histogram_quantile` for sparse histograms (#9935)

* MergeFloatBucketIterator for []FloatBucketIterator

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* histogram_quantile for histograms

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix histogram_quantile

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Unit test and enhancements

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Iterators to iterate buckets in reverse and all buckets together including zero bucket

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Consider all buckets for histogram_quantile and fix the implementation

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unneeded code

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-12-06 05:47:22 -08:00
+										if count >= rank {
 											break
 										}
 									}
-												[nhcb branch] Add basic unit tests for native histograms with custom buckets converted from classic histograms (#13794)

* modify unit test framework to automatically generate native histograms with custom buckets from classic histogram series
* add very basic tests for classic histogram converted into native histogram with custom bounds
* fix histogram_quantile for native histograms with custom buckets
* make loading with nhcb explicit
* evaluate native histograms with custom buckets on queries with explicit keyword
* use regex replacer
* use temp histogram struct for automatically loading converted nhcb

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
Signed-off-by: George Krajcsovits <krajorama@users.noreply.github.com>

											
										
										
											2024-04-24 00:36:05 -07:00
+									if !h.UsesCustomBuckets() && bucket.Lower < 0 && bucket.Upper > 0 {
-												style: Replace `else if` cascades with `switch`

Wiser coders than myself have come to the conclusion that a `switch`
statement is almost always superior to a statement that includes any
`else if`.

The exceptions that I have found in our codebase are just these two:

* The `if else` is followed by an additional statement before the next
  condition (separated by a `;`).
* The whole thing is within a `for` loop and `break` statements are
  used. In this case, using `switch` would require tagging the `for`
  loop, which probably tips the balance.

Why are `switch` statements more readable?

For one, fewer curly braces. But more importantly, the conditions all
have the same alignment, so the whole thing follows the natural flow
of going down a list of conditions. With `else if`, in contrast, all
conditions but the first are "hidden" behind `} else if `, harder to
spot and (for no good reason) presented differently from the first
condition.

I'm sure the aforemention wise coders can list even more reasons.

In any case, I like it so much that I have found myself recommending
it in code reviews. I would like to make it a habit in our code base,
without making it a hard requirement that we would test on the CI. But
for that, there has to be a role model, so this commit eliminates all
`if else` occurrences, unless it is autogenerated code or fits one of
the exceptions above.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2023-04-12 07:14:31 -07:00
+										switch {
 										case len(h.NegativeBuckets) == 0 && len(h.PositiveBuckets) > 0:
-												promql: Refine zero bucket treatment in histogramQuantile

Essentially, this mirrors the existing behavior for negative buckets:
If a histogram has only negative buckets, the upper bound of the zero
bucket is assumed to be zero.

Furthermore, it makes sure that the zero bucket boundaries are not
modified if a histogram that has no buckets at all but samples in the
zero bucket.

Also, add an TODO to vet if we really want this behavior.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2022-06-16 09:54:28 -07:00
+											// The result is in the zero bucket and the histogram has only
 											// positive buckets. So we consider 0 to be the lower bound.
 											bucket.Lower = 0
-												style: Replace `else if` cascades with `switch`

Wiser coders than myself have come to the conclusion that a `switch`
statement is almost always superior to a statement that includes any
`else if`.

The exceptions that I have found in our codebase are just these two:

* The `if else` is followed by an additional statement before the next
  condition (separated by a `;`).
* The whole thing is within a `for` loop and `break` statements are
  used. In this case, using `switch` would require tagging the `for`
  loop, which probably tips the balance.

Why are `switch` statements more readable?

For one, fewer curly braces. But more importantly, the conditions all
have the same alignment, so the whole thing follows the natural flow
of going down a list of conditions. With `else if`, in contrast, all
conditions but the first are "hidden" behind `} else if `, harder to
spot and (for no good reason) presented differently from the first
condition.

I'm sure the aforemention wise coders can list even more reasons.

In any case, I like it so much that I have found myself recommending
it in code reviews. I would like to make it a habit in our code base,
without making it a hard requirement that we would test on the CI. But
for that, there has to be a role model, so this commit eliminates all
`if else` occurrences, unless it is autogenerated code or fits one of
the exceptions above.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2023-04-12 07:14:31 -07:00
+										case len(h.PositiveBuckets) == 0 && len(h.NegativeBuckets) > 0:
-												promql: Refine zero bucket treatment in histogramQuantile

Essentially, this mirrors the existing behavior for negative buckets:
If a histogram has only negative buckets, the upper bound of the zero
bucket is assumed to be zero.

Furthermore, it makes sure that the zero bucket boundaries are not
modified if a histogram that has no buckets at all but samples in the
zero bucket.

Also, add an TODO to vet if we really want this behavior.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2022-06-16 09:54:28 -07:00
+											// The result is in the zero bucket and the histogram has only
 											// negative buckets. So we consider 0 to be the upper bound.
 											bucket.Upper = 0
 										}
-												[nhcb branch] Add basic unit tests for native histograms with custom buckets converted from classic histograms (#13794)

* modify unit test framework to automatically generate native histograms with custom buckets from classic histogram series
* add very basic tests for classic histogram converted into native histogram with custom bounds
* fix histogram_quantile for native histograms with custom buckets
* make loading with nhcb explicit
* evaluate native histograms with custom buckets on queries with explicit keyword
* use regex replacer
* use temp histogram struct for automatically loading converted nhcb

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
Signed-off-by: George Krajcsovits <krajorama@users.noreply.github.com>

											
										
										
											2024-04-24 00:36:05 -07:00
+									} else if h.UsesCustomBuckets() {
 										if bucket.Lower == math.Inf(-1) {
 											// first bucket, with lower bound -Inf
 											if bucket.Upper <= 0 {
 												return bucket.Upper
 											}
 											bucket.Lower = 0
 										} else if bucket.Upper == math.Inf(1) {
 											// last bucket, with upper bound +Inf
 											return bucket.Lower
 										}
-												`histogram_quantile` for sparse histograms (#9935)

* MergeFloatBucketIterator for []FloatBucketIterator

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* histogram_quantile for histograms

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix histogram_quantile

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Unit test and enhancements

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Iterators to iterate buckets in reverse and all buckets together including zero bucket

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Consider all buckets for histogram_quantile and fix the implementation

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unneeded code

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-12-06 05:47:22 -08:00
+									}
-												promql: Tweak histogramQuantile

- Simplify the code a bit.

- Cover more corner cases.

- Remove TODO for negative buckets. (I think they are handled. Tests
  will reveal if not.)

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2021-12-15 08:39:32 -08:00
+									// Due to numerical inaccuracies, we could end up with a higher count
 									// than h.Count. Thus, make sure count is never higher than h.Count.
 									if count > h.Count {
 										count = h.Count
 									}
 									// We could have hit the highest bucket without even reaching the rank
-												histogram: Modify getBound to deal properly with infinity

The bucket receiving math.MaxFloat64 observations now has
math.MaxFloat64 as upper bound, while the bucket after it (the last
possible bucket) has +Inf.

This also adds a test for getBound and moves the getBound code to
generic.go (where it should have been in the first place).

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2022-10-05 06:34:47 -07:00
+									// (this should only happen if the histogram contains observations of
 									// the value NaN), in which case we simply return the upper limit of the
 									// highest explicit bucket.
-												promql: Tweak histogramQuantile

- Simplify the code a bit.

- Cover more corner cases.

- Remove TODO for negative buckets. (I think they are handled. Tests
  will reveal if not.)

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2021-12-15 08:39:32 -08:00
+									if count < rank {
 										return bucket.Upper
 									}
-												`histogram_quantile` for sparse histograms (#9935)

* MergeFloatBucketIterator for []FloatBucketIterator

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* histogram_quantile for histograms

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix histogram_quantile

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Unit test and enhancements

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Iterators to iterate buckets in reverse and all buckets together including zero bucket

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Consider all buckets for histogram_quantile and fix the implementation

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unneeded code

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-12-06 05:47:22 -08:00
-												histograms: Improve comment

Oversight during review of #12525.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2023-07-12 05:52:49 -07:00
+									// NaN observations increase h.Count but not the total number of
 									// observations in the buckets. Therefore, we have to use the forward
 									// iterator to find percentiles. We recognize histograms containing NaN
 									// observations by checking if their h.Sum is NaN.
-												enhance histogram_quantile to get min/max value

Signed-off-by: Ziqi Zhao <zhaoziqi9146@gmail.com>

											
										
										
											2023-07-05 04:05:53 -07:00
+									if math.IsNaN(h.Sum) || q < 0.5 {
 										rank -= count - bucket.Count
 									} else {
 										rank = count - rank
 									}
-												promql(native histograms): Introduce exponential interpolation

The linear interpolation (assuming that observations are uniformly
distributed within a bucket) is a solid and simple assumption in lack
of any other information. However, the exponential bucketing used by
standard schemas of native histograms has been chosen to cover the
whole range of observations in a way that bucket populations are
spread out over buckets in a reasonably way for typical distributions
encountered in real-world scenarios.

This is the origin of the idea implemented here: If we divide a given
bucket into two (or more) smaller exponential buckets, we "most
naturally" expect that the samples in the original buckets will split
among those smaller buckets in a more or less uniform fashion. With
this assumption, we end up with an "exponential interpolation", which
therefore appears to be a better match for histograms with exponential
bucketing.

This commit leaves the linear interpolation in place for NHCB, but
changes the interpolation for exponential native histograms to
exponential. This affects `histogram_quantile` and
`histogram_fraction` (because the latter is more or less the inverse
of the former).

The zero bucket has to be treated specially because the assumption
above would lead to an "interpolation to zero" (the bucket density
approaches infinity around zero, and with the postulated uniform usage
of buckets, we would end up with an estimate of zero for all quantiles
ending up in the zero bucket). We simply fall back to linear
interpolation within the zero bucket.

At the same time, this commit makes the call to stick with the
assumption that the zero bucket only contains positive observations
for native histograms without negative buckets (and vice versa). (This
is an assumption relevant for interpolation. It is a mostly academic
point, as the zero bucket is supposed to be very small anyway.
However, in cases where it _is_ relevantly broad, the assumption helps
a lot in practice.)

This commit also updates and completes the documentation to match both
details about interpolation.

As a more high level note: The approach here attempts to strike a
balance between a more simplistic approach without any assumption, and
a more involved approach with more sophisticated assumptions. I will
shortly describe both for reference:

The "zero assumption" approach would be to not interpolate at all, but
_always_ return the harmonic mean of the bucket boundaries of the
bucket the quantile ends up in. This has the advantage of minimizing
the maximum possible relative error of the quantile estimation.
(Depending on the exact definition of the relative error of an
estimation, there is also an argument to return the arithmetic mean of
the bucket boundaries.) While limiting the maximum possible relative
error is a good property, this approach would throw away the
information if a quantile is closer to the upper or lower end of the
population within a bucket. This can be valuable trending information
in a dashboard. With any kind of interpolation, the maximum possible
error of a quantile estimation increases to the full width of a bucket
(i.e. it more than doubles for the harmonic mean approach, and
precisely doubles for the arithmetic mean approach). However, in
return the _expectation value_ of the error decreases. The increase of
the theoretical maximum only has practical relevance for pathologic
distributions. For example, if there are thousand observations within
a bucket, they could _all_ be at the upper bound of the bucket. If the
quantile calculation picks the 1st observation in the bucket as the
relevant one, an interpolation will yield a value close to the lower
bucket boundary, while the true quantile value is close to the upper
boundary.

The "fancy interpolation" approach would be one that analyses the
_actual_ distribution of samples in the histogram. A lot of statistics
could be applied based on the information we have available in the
histogram. This would include the population of neighboring (or even
all) buckets in the histogram. In general, the resolution of a native
histogram should be quite high, and therefore, those "fancy"
approaches would increase the computational cost quite a bit with very
little practical benefits (i.e. just tiny corrections of the estimated
quantile value). The results are also much harder to reason with.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2024-08-15 05:19:16 -07:00
+									// The fraction of how far we are into the current bucket.
 									fraction := rank / bucket.Count
 									// Return linear interpolation for custom buckets and for quantiles that
 									// end up in the zero bucket.
 									if h.UsesCustomBuckets() || (bucket.Lower <= 0 && bucket.Upper >= 0) {
 										return bucket.Lower + (bucket.Upper-bucket.Lower)*fraction
 									}
 									// For exponential buckets, we interpolate on a logarithmic scale. On a
 									// logarithmic scale, the exponential bucket boundaries (for any schema)
 									// become linear (every bucket has the same width). Therefore, after
 									// taking the logarithm of both bucket boundaries, we can use the
 									// calculated fraction in the same way as for linear interpolation (see
 									// above). Finally, we return to the normal scale by applying the
 									// exponential function to the result.
 									logLower := math.Log2(math.Abs(bucket.Lower))
 									logUpper := math.Log2(math.Abs(bucket.Upper))
 									if bucket.Lower > 0 { // Positive bucket.
 										return math.Exp2(logLower + (logUpper-logLower)*fraction)
 									}
 									// Otherwise, we are in a negative bucket and have to mirror things.
 									return -math.Exp2(logUpper + (logLower-logUpper)*(1-fraction))
-												`histogram_quantile` for sparse histograms (#9935)

* MergeFloatBucketIterator for []FloatBucketIterator

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* histogram_quantile for histograms

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix histogram_quantile

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Unit test and enhancements

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Iterators to iterate buckets in reverse and all buckets together including zero bucket

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Consider all buckets for histogram_quantile and fix the implementation

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unneeded code

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-12-06 05:47:22 -08:00
+								}
-												promql: Add histogram_fraction function

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2022-06-16 11:44:12 -07:00
+								// histogramFraction calculates the fraction of observations between the
 								// provided lower and upper bounds, based on the provided histogram.
 								//
 								// histogramFraction is in a certain way the inverse of histogramQuantile.  If
 								// histogramQuantile(0.9, h) returns 123.4, then histogramFraction(-Inf, 123.4, h)
 								// returns 0.9.
 								//
-												promql(native histograms): Introduce exponential interpolation

The linear interpolation (assuming that observations are uniformly
distributed within a bucket) is a solid and simple assumption in lack
of any other information. However, the exponential bucketing used by
standard schemas of native histograms has been chosen to cover the
whole range of observations in a way that bucket populations are
spread out over buckets in a reasonably way for typical distributions
encountered in real-world scenarios.

This is the origin of the idea implemented here: If we divide a given
bucket into two (or more) smaller exponential buckets, we "most
naturally" expect that the samples in the original buckets will split
among those smaller buckets in a more or less uniform fashion. With
this assumption, we end up with an "exponential interpolation", which
therefore appears to be a better match for histograms with exponential
bucketing.

This commit leaves the linear interpolation in place for NHCB, but
changes the interpolation for exponential native histograms to
exponential. This affects `histogram_quantile` and
`histogram_fraction` (because the latter is more or less the inverse
of the former).

The zero bucket has to be treated specially because the assumption
above would lead to an "interpolation to zero" (the bucket density
approaches infinity around zero, and with the postulated uniform usage
of buckets, we would end up with an estimate of zero for all quantiles
ending up in the zero bucket). We simply fall back to linear
interpolation within the zero bucket.

At the same time, this commit makes the call to stick with the
assumption that the zero bucket only contains positive observations
for native histograms without negative buckets (and vice versa). (This
is an assumption relevant for interpolation. It is a mostly academic
point, as the zero bucket is supposed to be very small anyway.
However, in cases where it _is_ relevantly broad, the assumption helps
a lot in practice.)

This commit also updates and completes the documentation to match both
details about interpolation.

As a more high level note: The approach here attempts to strike a
balance between a more simplistic approach without any assumption, and
a more involved approach with more sophisticated assumptions. I will
shortly describe both for reference:

The "zero assumption" approach would be to not interpolate at all, but
_always_ return the harmonic mean of the bucket boundaries of the
bucket the quantile ends up in. This has the advantage of minimizing
the maximum possible relative error of the quantile estimation.
(Depending on the exact definition of the relative error of an
estimation, there is also an argument to return the arithmetic mean of
the bucket boundaries.) While limiting the maximum possible relative
error is a good property, this approach would throw away the
information if a quantile is closer to the upper or lower end of the
population within a bucket. This can be valuable trending information
in a dashboard. With any kind of interpolation, the maximum possible
error of a quantile estimation increases to the full width of a bucket
(i.e. it more than doubles for the harmonic mean approach, and
precisely doubles for the arithmetic mean approach). However, in
return the _expectation value_ of the error decreases. The increase of
the theoretical maximum only has practical relevance for pathologic
distributions. For example, if there are thousand observations within
a bucket, they could _all_ be at the upper bound of the bucket. If the
quantile calculation picks the 1st observation in the bucket as the
relevant one, an interpolation will yield a value close to the lower
bucket boundary, while the true quantile value is close to the upper
boundary.

The "fancy interpolation" approach would be one that analyses the
_actual_ distribution of samples in the histogram. A lot of statistics
could be applied based on the information we have available in the
histogram. This would include the population of neighboring (or even
all) buckets in the histogram. In general, the resolution of a native
histogram should be quite high, and therefore, those "fancy"
approaches would increase the computational cost quite a bit with very
little practical benefits (i.e. just tiny corrections of the estimated
quantile value). The results are also much harder to reason with.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2024-08-15 05:19:16 -07:00
+								// The same notes with regard to interpolation and assumptions about the zero
 								// bucket boundaries apply as for histogramQuantile.
-												promql: Add histogram_fraction function

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2022-06-16 11:44:12 -07:00
+								//
 								// Whether either boundary is inclusive or exclusive doesn’t actually matter as
 								// long as interpolation has to be performed anyway. In the case of a boundary
 								// coinciding with a bucket boundary, the inclusive or exclusive nature of the
 								// boundary determines the exact behavior of the threshold. With the current
 								// implementation, that means that lower is exclusive for positive values and
 								// inclusive for negative values, while upper is inclusive for positive values
 								// and exclusive for negative values.
 								//
 								// Special cases:
 								//
 								// If the histogram has 0 observations, NaN is returned.
 								//
 								// Use a lower bound of -Inf to get the fraction of all observations below the
 								// upper bound.
 								//
 								// Use an upper bound of +Inf to get the fraction of all observations above the
 								// lower bound.
 								//
 								// If lower or upper is NaN, NaN is returned.
 								//
 								// If lower >= upper and the histogram has at least 1 observation, zero is returned.
 								func histogramFraction(lower, upper float64, h *histogram.FloatHistogram) float64 {
 									if h.Count == 0 || math.IsNaN(lower) || math.IsNaN(upper) {
 										return math.NaN()
 									}
 									if lower >= upper {
 										return 0
 									}
 									var (
 										rank, lowerRank, upperRank float64
 										lowerSet, upperSet         bool
 										it                         = h.AllBucketIterator()
 									)
 									for it.Next() {
 										b := it.At()
-												promql(native histograms): Introduce exponential interpolation

The linear interpolation (assuming that observations are uniformly
distributed within a bucket) is a solid and simple assumption in lack
of any other information. However, the exponential bucketing used by
standard schemas of native histograms has been chosen to cover the
whole range of observations in a way that bucket populations are
spread out over buckets in a reasonably way for typical distributions
encountered in real-world scenarios.

This is the origin of the idea implemented here: If we divide a given
bucket into two (or more) smaller exponential buckets, we "most
naturally" expect that the samples in the original buckets will split
among those smaller buckets in a more or less uniform fashion. With
this assumption, we end up with an "exponential interpolation", which
therefore appears to be a better match for histograms with exponential
bucketing.

This commit leaves the linear interpolation in place for NHCB, but
changes the interpolation for exponential native histograms to
exponential. This affects `histogram_quantile` and
`histogram_fraction` (because the latter is more or less the inverse
of the former).

The zero bucket has to be treated specially because the assumption
above would lead to an "interpolation to zero" (the bucket density
approaches infinity around zero, and with the postulated uniform usage
of buckets, we would end up with an estimate of zero for all quantiles
ending up in the zero bucket). We simply fall back to linear
interpolation within the zero bucket.

At the same time, this commit makes the call to stick with the
assumption that the zero bucket only contains positive observations
for native histograms without negative buckets (and vice versa). (This
is an assumption relevant for interpolation. It is a mostly academic
point, as the zero bucket is supposed to be very small anyway.
However, in cases where it _is_ relevantly broad, the assumption helps
a lot in practice.)

This commit also updates and completes the documentation to match both
details about interpolation.

As a more high level note: The approach here attempts to strike a
balance between a more simplistic approach without any assumption, and
a more involved approach with more sophisticated assumptions. I will
shortly describe both for reference:

The "zero assumption" approach would be to not interpolate at all, but
_always_ return the harmonic mean of the bucket boundaries of the
bucket the quantile ends up in. This has the advantage of minimizing
the maximum possible relative error of the quantile estimation.
(Depending on the exact definition of the relative error of an
estimation, there is also an argument to return the arithmetic mean of
the bucket boundaries.) While limiting the maximum possible relative
error is a good property, this approach would throw away the
information if a quantile is closer to the upper or lower end of the
population within a bucket. This can be valuable trending information
in a dashboard. With any kind of interpolation, the maximum possible
error of a quantile estimation increases to the full width of a bucket
(i.e. it more than doubles for the harmonic mean approach, and
precisely doubles for the arithmetic mean approach). However, in
return the _expectation value_ of the error decreases. The increase of
the theoretical maximum only has practical relevance for pathologic
distributions. For example, if there are thousand observations within
a bucket, they could _all_ be at the upper bound of the bucket. If the
quantile calculation picks the 1st observation in the bucket as the
relevant one, an interpolation will yield a value close to the lower
bucket boundary, while the true quantile value is close to the upper
boundary.

The "fancy interpolation" approach would be one that analyses the
_actual_ distribution of samples in the histogram. A lot of statistics
could be applied based on the information we have available in the
histogram. This would include the population of neighboring (or even
all) buckets in the histogram. In general, the resolution of a native
histogram should be quite high, and therefore, those "fancy"
approaches would increase the computational cost quite a bit with very
little practical benefits (i.e. just tiny corrections of the estimated
quantile value). The results are also much harder to reason with.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2024-08-15 05:19:16 -07:00
+										zeroBucket := false
 										// interpolateLinearly is used for custom buckets to be
 										// consistent with the linear interpolation known from classic
 										// histograms. It is also used for the zero bucket.
 										interpolateLinearly := func(v float64) float64 {
 											return rank + b.Count*(v-b.Lower)/(b.Upper-b.Lower)
 										}
 										// interpolateExponentially is using the same exponential
 										// interpolation method as above for histogramQuantile. This
 										// method is a better fit for exponential bucketing.
 										interpolateExponentially := func(v float64) float64 {
 											var (
 												logLower = math.Log2(math.Abs(b.Lower))
 												logUpper = math.Log2(math.Abs(b.Upper))
 												logV     = math.Log2(math.Abs(v))
 												fraction float64
 											)
 											if v > 0 {
 												fraction = (logV - logLower) / (logUpper - logLower)
 											} else {
 												fraction = 1 - ((logV - logUpper) / (logLower - logUpper))
 											}
 											return rank + b.Count*fraction
 										}
 										if b.Lower <= 0 && b.Upper >= 0 {
 											zeroBucket = true
-												style: Replace `else if` cascades with `switch`

Wiser coders than myself have come to the conclusion that a `switch`
statement is almost always superior to a statement that includes any
`else if`.

The exceptions that I have found in our codebase are just these two:

* The `if else` is followed by an additional statement before the next
  condition (separated by a `;`).
* The whole thing is within a `for` loop and `break` statements are
  used. In this case, using `switch` would require tagging the `for`
  loop, which probably tips the balance.

Why are `switch` statements more readable?

For one, fewer curly braces. But more importantly, the conditions all
have the same alignment, so the whole thing follows the natural flow
of going down a list of conditions. With `else if`, in contrast, all
conditions but the first are "hidden" behind `} else if `, harder to
spot and (for no good reason) presented differently from the first
condition.

I'm sure the aforemention wise coders can list even more reasons.

In any case, I like it so much that I have found myself recommending
it in code reviews. I would like to make it a habit in our code base,
without making it a hard requirement that we would test on the CI. But
for that, there has to be a role model, so this commit eliminates all
`if else` occurrences, unless it is autogenerated code or fits one of
the exceptions above.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2023-04-12 07:14:31 -07:00
+											switch {
 											case len(h.NegativeBuckets) == 0 && len(h.PositiveBuckets) > 0:
-												promql: Add histogram_fraction function

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2022-06-16 11:44:12 -07:00
+												// This is the zero bucket and the histogram has only
 												// positive buckets. So we consider 0 to be the lower
 												// bound.
 												b.Lower = 0
-												style: Replace `else if` cascades with `switch`

Wiser coders than myself have come to the conclusion that a `switch`
statement is almost always superior to a statement that includes any
`else if`.

The exceptions that I have found in our codebase are just these two:

* The `if else` is followed by an additional statement before the next
  condition (separated by a `;`).
* The whole thing is within a `for` loop and `break` statements are
  used. In this case, using `switch` would require tagging the `for`
  loop, which probably tips the balance.

Why are `switch` statements more readable?

For one, fewer curly braces. But more importantly, the conditions all
have the same alignment, so the whole thing follows the natural flow
of going down a list of conditions. With `else if`, in contrast, all
conditions but the first are "hidden" behind `} else if `, harder to
spot and (for no good reason) presented differently from the first
condition.

I'm sure the aforemention wise coders can list even more reasons.

In any case, I like it so much that I have found myself recommending
it in code reviews. I would like to make it a habit in our code base,
without making it a hard requirement that we would test on the CI. But
for that, there has to be a role model, so this commit eliminates all
`if else` occurrences, unless it is autogenerated code or fits one of
the exceptions above.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2023-04-12 07:14:31 -07:00
+											case len(h.PositiveBuckets) == 0 && len(h.NegativeBuckets) > 0:
-												promql: Add histogram_fraction function

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2022-06-16 11:44:12 -07:00
+												// This is in the zero bucket and the histogram has only
 												// negative buckets. So we consider 0 to be the upper
 												// bound.
 												b.Upper = 0
 											}
 										}
 										if !lowerSet && b.Lower >= lower {
-												promql(native histograms): Introduce exponential interpolation

The linear interpolation (assuming that observations are uniformly
distributed within a bucket) is a solid and simple assumption in lack
of any other information. However, the exponential bucketing used by
standard schemas of native histograms has been chosen to cover the
whole range of observations in a way that bucket populations are
spread out over buckets in a reasonably way for typical distributions
encountered in real-world scenarios.

This is the origin of the idea implemented here: If we divide a given
bucket into two (or more) smaller exponential buckets, we "most
naturally" expect that the samples in the original buckets will split
among those smaller buckets in a more or less uniform fashion. With
this assumption, we end up with an "exponential interpolation", which
therefore appears to be a better match for histograms with exponential
bucketing.

This commit leaves the linear interpolation in place for NHCB, but
changes the interpolation for exponential native histograms to
exponential. This affects `histogram_quantile` and
`histogram_fraction` (because the latter is more or less the inverse
of the former).

The zero bucket has to be treated specially because the assumption
above would lead to an "interpolation to zero" (the bucket density
approaches infinity around zero, and with the postulated uniform usage
of buckets, we would end up with an estimate of zero for all quantiles
ending up in the zero bucket). We simply fall back to linear
interpolation within the zero bucket.

At the same time, this commit makes the call to stick with the
assumption that the zero bucket only contains positive observations
for native histograms without negative buckets (and vice versa). (This
is an assumption relevant for interpolation. It is a mostly academic
point, as the zero bucket is supposed to be very small anyway.
However, in cases where it _is_ relevantly broad, the assumption helps
a lot in practice.)

This commit also updates and completes the documentation to match both
details about interpolation.

As a more high level note: The approach here attempts to strike a
balance between a more simplistic approach without any assumption, and
a more involved approach with more sophisticated assumptions. I will
shortly describe both for reference:

The "zero assumption" approach would be to not interpolate at all, but
_always_ return the harmonic mean of the bucket boundaries of the
bucket the quantile ends up in. This has the advantage of minimizing
the maximum possible relative error of the quantile estimation.
(Depending on the exact definition of the relative error of an
estimation, there is also an argument to return the arithmetic mean of
the bucket boundaries.) While limiting the maximum possible relative
error is a good property, this approach would throw away the
information if a quantile is closer to the upper or lower end of the
population within a bucket. This can be valuable trending information
in a dashboard. With any kind of interpolation, the maximum possible
error of a quantile estimation increases to the full width of a bucket
(i.e. it more than doubles for the harmonic mean approach, and
precisely doubles for the arithmetic mean approach). However, in
return the _expectation value_ of the error decreases. The increase of
the theoretical maximum only has practical relevance for pathologic
distributions. For example, if there are thousand observations within
a bucket, they could _all_ be at the upper bound of the bucket. If the
quantile calculation picks the 1st observation in the bucket as the
relevant one, an interpolation will yield a value close to the lower
bucket boundary, while the true quantile value is close to the upper
boundary.

The "fancy interpolation" approach would be one that analyses the
_actual_ distribution of samples in the histogram. A lot of statistics
could be applied based on the information we have available in the
histogram. This would include the population of neighboring (or even
all) buckets in the histogram. In general, the resolution of a native
histogram should be quite high, and therefore, those "fancy"
approaches would increase the computational cost quite a bit with very
little practical benefits (i.e. just tiny corrections of the estimated
quantile value). The results are also much harder to reason with.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2024-08-15 05:19:16 -07:00
+											// We have hit the lower value at the lower bucket boundary.
-												promql: Add histogram_fraction function

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2022-06-16 11:44:12 -07:00
+											lowerRank = rank
 											lowerSet = true
 										}
 										if !upperSet && b.Lower >= upper {
-												promql(native histograms): Introduce exponential interpolation

The linear interpolation (assuming that observations are uniformly
distributed within a bucket) is a solid and simple assumption in lack
of any other information. However, the exponential bucketing used by
standard schemas of native histograms has been chosen to cover the
whole range of observations in a way that bucket populations are
spread out over buckets in a reasonably way for typical distributions
encountered in real-world scenarios.

This is the origin of the idea implemented here: If we divide a given
bucket into two (or more) smaller exponential buckets, we "most
naturally" expect that the samples in the original buckets will split
among those smaller buckets in a more or less uniform fashion. With
this assumption, we end up with an "exponential interpolation", which
therefore appears to be a better match for histograms with exponential
bucketing.

This commit leaves the linear interpolation in place for NHCB, but
changes the interpolation for exponential native histograms to
exponential. This affects `histogram_quantile` and
`histogram_fraction` (because the latter is more or less the inverse
of the former).

The zero bucket has to be treated specially because the assumption
above would lead to an "interpolation to zero" (the bucket density
approaches infinity around zero, and with the postulated uniform usage
of buckets, we would end up with an estimate of zero for all quantiles
ending up in the zero bucket). We simply fall back to linear
interpolation within the zero bucket.

At the same time, this commit makes the call to stick with the
assumption that the zero bucket only contains positive observations
for native histograms without negative buckets (and vice versa). (This
is an assumption relevant for interpolation. It is a mostly academic
point, as the zero bucket is supposed to be very small anyway.
However, in cases where it _is_ relevantly broad, the assumption helps
a lot in practice.)

This commit also updates and completes the documentation to match both
details about interpolation.

As a more high level note: The approach here attempts to strike a
balance between a more simplistic approach without any assumption, and
a more involved approach with more sophisticated assumptions. I will
shortly describe both for reference:

The "zero assumption" approach would be to not interpolate at all, but
_always_ return the harmonic mean of the bucket boundaries of the
bucket the quantile ends up in. This has the advantage of minimizing
the maximum possible relative error of the quantile estimation.
(Depending on the exact definition of the relative error of an
estimation, there is also an argument to return the arithmetic mean of
the bucket boundaries.) While limiting the maximum possible relative
error is a good property, this approach would throw away the
information if a quantile is closer to the upper or lower end of the
population within a bucket. This can be valuable trending information
in a dashboard. With any kind of interpolation, the maximum possible
error of a quantile estimation increases to the full width of a bucket
(i.e. it more than doubles for the harmonic mean approach, and
precisely doubles for the arithmetic mean approach). However, in
return the _expectation value_ of the error decreases. The increase of
the theoretical maximum only has practical relevance for pathologic
distributions. For example, if there are thousand observations within
a bucket, they could _all_ be at the upper bound of the bucket. If the
quantile calculation picks the 1st observation in the bucket as the
relevant one, an interpolation will yield a value close to the lower
bucket boundary, while the true quantile value is close to the upper
boundary.

The "fancy interpolation" approach would be one that analyses the
_actual_ distribution of samples in the histogram. A lot of statistics
could be applied based on the information we have available in the
histogram. This would include the population of neighboring (or even
all) buckets in the histogram. In general, the resolution of a native
histogram should be quite high, and therefore, those "fancy"
approaches would increase the computational cost quite a bit with very
little practical benefits (i.e. just tiny corrections of the estimated
quantile value). The results are also much harder to reason with.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2024-08-15 05:19:16 -07:00
+											// We have hit the upper value at the lower bucket boundary.
-												promql: Add histogram_fraction function

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2022-06-16 11:44:12 -07:00
+											upperRank = rank
 											upperSet = true
 										}
 										if lowerSet && upperSet {
 											break
 										}
 										if !lowerSet && b.Lower < lower && b.Upper > lower {
-												promql(native histograms): Introduce exponential interpolation

The linear interpolation (assuming that observations are uniformly
distributed within a bucket) is a solid and simple assumption in lack
of any other information. However, the exponential bucketing used by
standard schemas of native histograms has been chosen to cover the
whole range of observations in a way that bucket populations are
spread out over buckets in a reasonably way for typical distributions
encountered in real-world scenarios.

This is the origin of the idea implemented here: If we divide a given
bucket into two (or more) smaller exponential buckets, we "most
naturally" expect that the samples in the original buckets will split
among those smaller buckets in a more or less uniform fashion. With
this assumption, we end up with an "exponential interpolation", which
therefore appears to be a better match for histograms with exponential
bucketing.

This commit leaves the linear interpolation in place for NHCB, but
changes the interpolation for exponential native histograms to
exponential. This affects `histogram_quantile` and
`histogram_fraction` (because the latter is more or less the inverse
of the former).

The zero bucket has to be treated specially because the assumption
above would lead to an "interpolation to zero" (the bucket density
approaches infinity around zero, and with the postulated uniform usage
of buckets, we would end up with an estimate of zero for all quantiles
ending up in the zero bucket). We simply fall back to linear
interpolation within the zero bucket.

At the same time, this commit makes the call to stick with the
assumption that the zero bucket only contains positive observations
for native histograms without negative buckets (and vice versa). (This
is an assumption relevant for interpolation. It is a mostly academic
point, as the zero bucket is supposed to be very small anyway.
However, in cases where it _is_ relevantly broad, the assumption helps
a lot in practice.)

This commit also updates and completes the documentation to match both
details about interpolation.

As a more high level note: The approach here attempts to strike a
balance between a more simplistic approach without any assumption, and
a more involved approach with more sophisticated assumptions. I will
shortly describe both for reference:

The "zero assumption" approach would be to not interpolate at all, but
_always_ return the harmonic mean of the bucket boundaries of the
bucket the quantile ends up in. This has the advantage of minimizing
the maximum possible relative error of the quantile estimation.
(Depending on the exact definition of the relative error of an
estimation, there is also an argument to return the arithmetic mean of
the bucket boundaries.) While limiting the maximum possible relative
error is a good property, this approach would throw away the
information if a quantile is closer to the upper or lower end of the
population within a bucket. This can be valuable trending information
in a dashboard. With any kind of interpolation, the maximum possible
error of a quantile estimation increases to the full width of a bucket
(i.e. it more than doubles for the harmonic mean approach, and
precisely doubles for the arithmetic mean approach). However, in
return the _expectation value_ of the error decreases. The increase of
the theoretical maximum only has practical relevance for pathologic
distributions. For example, if there are thousand observations within
a bucket, they could _all_ be at the upper bound of the bucket. If the
quantile calculation picks the 1st observation in the bucket as the
relevant one, an interpolation will yield a value close to the lower
bucket boundary, while the true quantile value is close to the upper
boundary.

The "fancy interpolation" approach would be one that analyses the
_actual_ distribution of samples in the histogram. A lot of statistics
could be applied based on the information we have available in the
histogram. This would include the population of neighboring (or even
all) buckets in the histogram. In general, the resolution of a native
histogram should be quite high, and therefore, those "fancy"
approaches would increase the computational cost quite a bit with very
little practical benefits (i.e. just tiny corrections of the estimated
quantile value). The results are also much harder to reason with.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2024-08-15 05:19:16 -07:00
+											// The lower value is in this bucket.
 											if h.UsesCustomBuckets() || zeroBucket {
 												lowerRank = interpolateLinearly(lower)
 											} else {
 												lowerRank = interpolateExponentially(lower)
 											}
-												promql: Add histogram_fraction function

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2022-06-16 11:44:12 -07:00
+											lowerSet = true
 										}
 										if !upperSet && b.Lower < upper && b.Upper > upper {
-												promql(native histograms): Introduce exponential interpolation

The linear interpolation (assuming that observations are uniformly
distributed within a bucket) is a solid and simple assumption in lack
of any other information. However, the exponential bucketing used by
standard schemas of native histograms has been chosen to cover the
whole range of observations in a way that bucket populations are
spread out over buckets in a reasonably way for typical distributions
encountered in real-world scenarios.

This is the origin of the idea implemented here: If we divide a given
bucket into two (or more) smaller exponential buckets, we "most
naturally" expect that the samples in the original buckets will split
among those smaller buckets in a more or less uniform fashion. With
this assumption, we end up with an "exponential interpolation", which
therefore appears to be a better match for histograms with exponential
bucketing.

This commit leaves the linear interpolation in place for NHCB, but
changes the interpolation for exponential native histograms to
exponential. This affects `histogram_quantile` and
`histogram_fraction` (because the latter is more or less the inverse
of the former).

The zero bucket has to be treated specially because the assumption
above would lead to an "interpolation to zero" (the bucket density
approaches infinity around zero, and with the postulated uniform usage
of buckets, we would end up with an estimate of zero for all quantiles
ending up in the zero bucket). We simply fall back to linear
interpolation within the zero bucket.

At the same time, this commit makes the call to stick with the
assumption that the zero bucket only contains positive observations
for native histograms without negative buckets (and vice versa). (This
is an assumption relevant for interpolation. It is a mostly academic
point, as the zero bucket is supposed to be very small anyway.
However, in cases where it _is_ relevantly broad, the assumption helps
a lot in practice.)

This commit also updates and completes the documentation to match both
details about interpolation.

As a more high level note: The approach here attempts to strike a
balance between a more simplistic approach without any assumption, and
a more involved approach with more sophisticated assumptions. I will
shortly describe both for reference:

The "zero assumption" approach would be to not interpolate at all, but
_always_ return the harmonic mean of the bucket boundaries of the
bucket the quantile ends up in. This has the advantage of minimizing
the maximum possible relative error of the quantile estimation.
(Depending on the exact definition of the relative error of an
estimation, there is also an argument to return the arithmetic mean of
the bucket boundaries.) While limiting the maximum possible relative
error is a good property, this approach would throw away the
information if a quantile is closer to the upper or lower end of the
population within a bucket. This can be valuable trending information
in a dashboard. With any kind of interpolation, the maximum possible
error of a quantile estimation increases to the full width of a bucket
(i.e. it more than doubles for the harmonic mean approach, and
precisely doubles for the arithmetic mean approach). However, in
return the _expectation value_ of the error decreases. The increase of
the theoretical maximum only has practical relevance for pathologic
distributions. For example, if there are thousand observations within
a bucket, they could _all_ be at the upper bound of the bucket. If the
quantile calculation picks the 1st observation in the bucket as the
relevant one, an interpolation will yield a value close to the lower
bucket boundary, while the true quantile value is close to the upper
boundary.

The "fancy interpolation" approach would be one that analyses the
_actual_ distribution of samples in the histogram. A lot of statistics
could be applied based on the information we have available in the
histogram. This would include the population of neighboring (or even
all) buckets in the histogram. In general, the resolution of a native
histogram should be quite high, and therefore, those "fancy"
approaches would increase the computational cost quite a bit with very
little practical benefits (i.e. just tiny corrections of the estimated
quantile value). The results are also much harder to reason with.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2024-08-15 05:19:16 -07:00
+											// The upper value is in this bucket.
 											if h.UsesCustomBuckets() || zeroBucket {
 												upperRank = interpolateLinearly(upper)
 											} else {
 												upperRank = interpolateExponentially(upper)
 											}
-												promql: Add histogram_fraction function

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2022-06-16 11:44:12 -07:00
+											upperSet = true
 										}
 										if lowerSet && upperSet {
 											break
 										}
 										rank += b.Count
 									}
 									if !lowerSet || lowerRank > h.Count {
 										lowerRank = h.Count
 									}
 									if !upperSet || upperRank > h.Count {
 										upperRank = h.Count
 									}
 									return (upperRank - lowerRank) / h.Count
 								}
-												In histogram_quantile merge buckets with equivalent le values (#5158)

This makes things generally more resilient, and will
help with OpenMetrics transitions (and inconsistencies).

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-02-01 02:22:44 -08:00
+								// coalesceBuckets merges buckets with the same upper bound.
 								//
 								// The input buckets must be sorted.
 								func coalesceBuckets(buckets buckets) buckets {
 									last := buckets[0]
 									i := 0
 									for _, b := range buckets[1:] {
 										if b.upperBound == last.upperBound {
 											last.count += b.count
 										} else {
 											buckets[i] = last
 											last = b
 											i++
 										}
 									}
 									buckets[i] = last
 									return buckets[:i+1]
 								}
-												Force buckets in a histogram to be monotonic for quantile estimation (#2610)

* Force buckets in a histogram to be monotonic for quantile estimation

The assumption that bucket counts increase monotonically with increasing
upperBound may be violated during:

  * Recording rule evaluation of histogram_quantile, especially when rate()
     has been applied to the underlying bucket timeseries.
  * Evaluation of histogram_quantile computed over federated bucket
     timeseries, especially when rate() has been applied

This is because scraped data is not made available to RR evalution or
federation atomically, so some buckets are computed with data from the N
most recent scrapes, but the other buckets are missing the most recent
observations.

Monotonicity is usually guaranteed because if a bucket with upper bound
u1 has count c1, then any bucket with a higher upper bound u > u1 must
have counted all c1 observations and perhaps more, so that c  >= c1.

Randomly interspersed partial sampling breaks that guarantee, and rate()
exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from
4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The
monotonicity is broken. It is exacerbated by rate() because under normal
operation, cumulative counting of buckets will cause the bucket counts to
diverge such that small differences from missing samples are not a problem.
rate() removes this divergence.)

bucketQuantile depends on that monotonicity to do a binary search for the
bucket with the qth percentile count, so breaking the monotonicity
guarantee causes bucketQuantile() to return undefined (nonsense) results.

As a somewhat hacky solution until the Prometheus project is ready to
accept the changes required to make scrapes atomic, we calculate the
"envelope" of the histogram buckets, essentially removing any decreases
in the count between successive buckets.

* Fix up comment docs for ensureMonotonic

* ensureMonotonic: Use switch statement

Use switch statement rather than if/else for better readability.
Process the most frequent cases first.

											
										
										
											2017-04-14 07:21:49 -07:00
+								// The assumption that bucket counts increase monotonically with increasing
 								// upperBound may be violated during:
 								//
-												Revise according to code review

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>

											
										
										
											2023-10-06 04:09:32 -07:00
+								//   - Circumstances where data is already inconsistent at the target's side.
 								//   - Ingestion via the remote write receiver that Prometheus implements.
 								//   - Optimisation of query execution where precision is sacrificed for other
 								//     benefits, not by Prometheus but by systems built on top of it.
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+								//   - Circumstances where floating point precision errors accumulate.
-												Force buckets in a histogram to be monotonic for quantile estimation (#2610)

* Force buckets in a histogram to be monotonic for quantile estimation

The assumption that bucket counts increase monotonically with increasing
upperBound may be violated during:

  * Recording rule evaluation of histogram_quantile, especially when rate()
     has been applied to the underlying bucket timeseries.
  * Evaluation of histogram_quantile computed over federated bucket
     timeseries, especially when rate() has been applied

This is because scraped data is not made available to RR evalution or
federation atomically, so some buckets are computed with data from the N
most recent scrapes, but the other buckets are missing the most recent
observations.

Monotonicity is usually guaranteed because if a bucket with upper bound
u1 has count c1, then any bucket with a higher upper bound u > u1 must
have counted all c1 observations and perhaps more, so that c  >= c1.

Randomly interspersed partial sampling breaks that guarantee, and rate()
exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from
4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The
monotonicity is broken. It is exacerbated by rate() because under normal
operation, cumulative counting of buckets will cause the bucket counts to
diverge such that small differences from missing samples are not a problem.
rate() removes this divergence.)

bucketQuantile depends on that monotonicity to do a binary search for the
bucket with the qth percentile count, so breaking the monotonicity
guarantee causes bucketQuantile() to return undefined (nonsense) results.

As a somewhat hacky solution until the Prometheus project is ready to
accept the changes required to make scrapes atomic, we calculate the
"envelope" of the histogram buckets, essentially removing any decreases
in the count between successive buckets.

* Fix up comment docs for ensureMonotonic

* ensureMonotonic: Use switch statement

Use switch statement rather than if/else for better readability.
Process the most frequent cases first.

											
										
										
											2017-04-14 07:21:49 -07:00
+								//
 								// Monotonicity is usually guaranteed because if a bucket with upper bound
 								// u1 has count c1, then any bucket with a higher upper bound u > u1 must
-												Revise according to code review

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>

											
										
										
											2023-10-06 04:09:32 -07:00
+								// have counted all c1 observations and perhaps more, so that c >= c1.
-												Force buckets in a histogram to be monotonic for quantile estimation (#2610)

* Force buckets in a histogram to be monotonic for quantile estimation

The assumption that bucket counts increase monotonically with increasing
upperBound may be violated during:

  * Recording rule evaluation of histogram_quantile, especially when rate()
     has been applied to the underlying bucket timeseries.
  * Evaluation of histogram_quantile computed over federated bucket
     timeseries, especially when rate() has been applied

This is because scraped data is not made available to RR evalution or
federation atomically, so some buckets are computed with data from the N
most recent scrapes, but the other buckets are missing the most recent
observations.

Monotonicity is usually guaranteed because if a bucket with upper bound
u1 has count c1, then any bucket with a higher upper bound u > u1 must
have counted all c1 observations and perhaps more, so that c  >= c1.

Randomly interspersed partial sampling breaks that guarantee, and rate()
exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from
4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The
monotonicity is broken. It is exacerbated by rate() because under normal
operation, cumulative counting of buckets will cause the bucket counts to
diverge such that small differences from missing samples are not a problem.
rate() removes this divergence.)

bucketQuantile depends on that monotonicity to do a binary search for the
bucket with the qth percentile count, so breaking the monotonicity
guarantee causes bucketQuantile() to return undefined (nonsense) results.

As a somewhat hacky solution until the Prometheus project is ready to
accept the changes required to make scrapes atomic, we calculate the
"envelope" of the histogram buckets, essentially removing any decreases
in the count between successive buckets.

* Fix up comment docs for ensureMonotonic

* ensureMonotonic: Use switch statement

Use switch statement rather than if/else for better readability.
Process the most frequent cases first.

											
										
										
											2017-04-14 07:21:49 -07:00
+								//
 								// bucketQuantile depends on that monotonicity to do a binary search for the
 								// bucket with the φ-quantile count, so breaking the monotonicity
 								// guarantee causes bucketQuantile() to return undefined (nonsense) results.
 								//
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+								// As a somewhat hacky solution, we first silently ignore any numerically
 								// insignificant (relative delta below the requested tolerance and likely to
 								// be from floating point precision errors) differences between successive
 								// buckets regardless of the direction. Then we calculate the "envelope" of
 								// the histogram buckets, essentially removing any decreases in the count
 								// between successive buckets.
 								//
 								// We return a bool to indicate if this monotonicity was forced or not, and
 								// another bool to indicate if small deltas were ignored or not.
 								func ensureMonotonicAndIgnoreSmallDeltas(buckets buckets, tolerance float64) (bool, bool) {
 									var forcedMonotonic, fixedPrecision bool
 									prev := buckets[0].count
-												Fix off-by-one error in funcHistogramQuantile / ensureMonotonic (#7393)

* Fix off-by-one error in funcHistogramQuantile / ensureMonotonic
* Additional coverage for nonmonotonic histogram buckets

Signed-off-by: Linas Medziunas <linas.medziunas@gmail.com>
											
										
										
											2020-06-15 03:32:10 -07:00
+									for i := 1; i < len(buckets); i++ {
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+										curr := buckets[i].count // Assumed always positive.
 										if curr == prev {
 											// No correction needed if the counts are identical between buckets.
 											continue
 										}
-												refactor: extract almost.Equal() to new package

To avoid a circular reference between promql and promqltest.

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>

											
										
										
											2024-04-29 04:37:32 -07:00
+										if almost.Equal(prev, curr, tolerance) {
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+											// Silently correct numerically insignificant differences from floating
 											// point precision errors, regardless of direction.
 											// Do not update the 'prev' value as we are ignoring the difference.
 											buckets[i].count = prev
 											fixedPrecision = true
 											continue
 										}
 										if curr < prev {
 											// Force monotonicity by removing any decreases regardless of magnitude.
 											// Do not update the 'prev' value as we are ignoring the decrease.
 											buckets[i].count = prev
 											forcedMonotonic = true
 											continue
-												Force buckets in a histogram to be monotonic for quantile estimation (#2610)

* Force buckets in a histogram to be monotonic for quantile estimation

The assumption that bucket counts increase monotonically with increasing
upperBound may be violated during:

  * Recording rule evaluation of histogram_quantile, especially when rate()
     has been applied to the underlying bucket timeseries.
  * Evaluation of histogram_quantile computed over federated bucket
     timeseries, especially when rate() has been applied

This is because scraped data is not made available to RR evalution or
federation atomically, so some buckets are computed with data from the N
most recent scrapes, but the other buckets are missing the most recent
observations.

Monotonicity is usually guaranteed because if a bucket with upper bound
u1 has count c1, then any bucket with a higher upper bound u > u1 must
have counted all c1 observations and perhaps more, so that c  >= c1.

Randomly interspersed partial sampling breaks that guarantee, and rate()
exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from
4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The
monotonicity is broken. It is exacerbated by rate() because under normal
operation, cumulative counting of buckets will cause the bucket counts to
diverge such that small differences from missing samples are not a problem.
rate() removes this divergence.)

bucketQuantile depends on that monotonicity to do a binary search for the
bucket with the qth percentile count, so breaking the monotonicity
guarantee causes bucketQuantile() to return undefined (nonsense) results.

As a somewhat hacky solution until the Prometheus project is ready to
accept the changes required to make scrapes atomic, we calculate the
"envelope" of the histogram buckets, essentially removing any decreases
in the count between successive buckets.

* Fix up comment docs for ensureMonotonic

* ensureMonotonic: Use switch statement

Use switch statement rather than if/else for better readability.
Process the most frequent cases first.

											
										
										
											2017-04-14 07:21:49 -07:00
+										}
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+										prev = curr
-												Force buckets in a histogram to be monotonic for quantile estimation (#2610)

* Force buckets in a histogram to be monotonic for quantile estimation

The assumption that bucket counts increase monotonically with increasing
upperBound may be violated during:

  * Recording rule evaluation of histogram_quantile, especially when rate()
     has been applied to the underlying bucket timeseries.
  * Evaluation of histogram_quantile computed over federated bucket
     timeseries, especially when rate() has been applied

This is because scraped data is not made available to RR evalution or
federation atomically, so some buckets are computed with data from the N
most recent scrapes, but the other buckets are missing the most recent
observations.

Monotonicity is usually guaranteed because if a bucket with upper bound
u1 has count c1, then any bucket with a higher upper bound u > u1 must
have counted all c1 observations and perhaps more, so that c  >= c1.

Randomly interspersed partial sampling breaks that guarantee, and rate()
exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from
4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The
monotonicity is broken. It is exacerbated by rate() because under normal
operation, cumulative counting of buckets will cause the bucket counts to
diverge such that small differences from missing samples are not a problem.
rate() removes this divergence.)

bucketQuantile depends on that monotonicity to do a binary search for the
bucket with the qth percentile count, so breaking the monotonicity
guarantee causes bucketQuantile() to return undefined (nonsense) results.

As a somewhat hacky solution until the Prometheus project is ready to
accept the changes required to make scrapes atomic, we calculate the
"envelope" of the histogram buckets, essentially removing any decreases
in the count between successive buckets.

* Fix up comment docs for ensureMonotonic

* ensureMonotonic: Use switch statement

Use switch statement rather than if/else for better readability.
Process the most frequent cases first.

											
										
										
											2017-04-14 07:21:49 -07:00
+									}
-												PromQL: ignore small errors for bucketQuantile (#13153)

promql: Improve histogram_quantile calculation for classic buckets

Tiny differences between classic buckets are most likely caused by floating point precision issues. With this commit, relative changes below a certain threshold are ignored. This makes the result of histogram_quantile more meaningful, and also avoids triggering the _input to histogram_quantile needed to be fixed for monotonicity_ annotations in unactionable cases.

This commit also adds explanation of the new adjustment and of the monotonicity annotation to the documentation of `histogram_quantile`.

---------

Signed-off-by: Jeanette Tan <jeanette.tan@grafana.com>
											
										
										
											2023-11-24 15:05:38 -08:00
+									return forcedMonotonic, fixedPrecision
-												Force buckets in a histogram to be monotonic for quantile estimation (#2610)

* Force buckets in a histogram to be monotonic for quantile estimation

The assumption that bucket counts increase monotonically with increasing
upperBound may be violated during:

  * Recording rule evaluation of histogram_quantile, especially when rate()
     has been applied to the underlying bucket timeseries.
  * Evaluation of histogram_quantile computed over federated bucket
     timeseries, especially when rate() has been applied

This is because scraped data is not made available to RR evalution or
federation atomically, so some buckets are computed with data from the N
most recent scrapes, but the other buckets are missing the most recent
observations.

Monotonicity is usually guaranteed because if a bucket with upper bound
u1 has count c1, then any bucket with a higher upper bound u > u1 must
have counted all c1 observations and perhaps more, so that c  >= c1.

Randomly interspersed partial sampling breaks that guarantee, and rate()
exacerbates it. Specifically, suppose bucket le=1000 has a count of 10 from
4 samples but the bucket with le=2000 has a count of 7, from 3 samples. The
monotonicity is broken. It is exacerbated by rate() because under normal
operation, cumulative counting of buckets will cause the bucket counts to
diverge such that small differences from missing samples are not a problem.
rate() removes this divergence.)

bucketQuantile depends on that monotonicity to do a binary search for the
bucket with the qth percentile count, so breaking the monotonicity
guarantee causes bucketQuantile() to return undefined (nonsense) results.

As a somewhat hacky solution until the Prometheus project is ready to
accept the changes required to make scrapes atomic, we calculate the
"envelope" of the histogram buckets, essentially removing any decreases
in the count between successive buckets.

* Fix up comment docs for ensureMonotonic

* ensureMonotonic: Use switch statement

Use switch statement rather than if/else for better readability.
Process the most frequent cases first.

											
										
										
											2017-04-14 07:21:49 -07:00
+								}
-												func name ref correct "qauntile" -> "quantile" (#5834)

Signed-off-by: ethan <guangming.wang@daocloud.io>
											
										
										
											2019-08-05 22:11:16 -07:00
+								// quantile calculates the given quantile of a vector of samples.
-												Add quantile aggregator.

											
										
										
											2016-07-08 05:48:48 -07:00
+								//
-												promql: rename vector

											
										
										
											2016-12-24 01:40:09 -08:00
+								// The Vector will be sorted.
-												Add quantile aggregator.

											
										
										
											2016-07-08 05:48:48 -07:00
+								// If 'values' has zero elements, NaN is returned.
-												add doc

Signed-off-by: jyz0309 <45495947@qq.com>

											
										
										
											2022-02-13 05:59:03 -08:00
+								// If q==NaN, NaN is returned.
-												Add quantile aggregator.

											
										
										
											2016-07-08 05:48:48 -07:00
+								// If q<0, -Inf is returned.
 								// If q>1, +Inf is returned.
-												promql: undo accidental exports

											
										
										
											2016-12-24 02:37:16 -08:00
+								func quantile(q float64, values vectorByValueHeap) float64 {
-												add doc

Signed-off-by: jyz0309 <45495947@qq.com>

											
										
										
											2022-02-13 05:59:03 -08:00
+									if len(values) == 0 || math.IsNaN(q) {
-												Add quantile aggregator.

											
										
										
											2016-07-08 05:48:48 -07:00
+										return math.NaN()
 									}
-												Factor out quantile fucntion.

											
										
										
											2016-07-08 05:33:20 -07:00
+									if q < 0 {
 										return math.Inf(-1)
 									}
 									if q > 1 {
 										return math.Inf(+1)
 									}
-												Add quantile aggregator.

											
										
										
											2016-07-08 05:48:48 -07:00
+									sort.Sort(values)
-												Factor out quantile fucntion.

											
										
										
											2016-07-08 05:33:20 -07:00
 									n := float64(len(values))
 									// When the quantile lies between two samples,
 									// we use a weighted average of the two samples.
 									rank := q * (n - 1)
 									lowerIndex := math.Max(0, math.Floor(rank))
 									upperIndex := math.Min(n-1, lowerIndex+1)
 									weight := rank - math.Floor(rank)
-												promql: Separate `Point` into `FPoint` and `HPoint`

In other words: Instead of having a “polymorphous” `Point` that can
either contain a float value or a histogram value, use an `FPoint` for
floats and an `HPoint` for histograms.

This seemingly small change has a _lot_ of repercussions throughout
the codebase.

The idea here is to avoid the increase in size of `Point` arrays that
happened after native histograms had been added.

The higher-level data structures (`Sample`, `Series`, etc.) are still
“polymorphous”. The same idea could be applied to them, but at each
step the trade-offs needed to be evaluated.

The idea with this change is to do the minimum necessary to get back
to pre-histogram performance for functions that do not touch
histograms. Here are comparisons for the `changes` function. The test
data doesn't include histograms yet. Ideally, there would be no change
in the benchmark result at all.

First runtime v2.39 compared to directly prior to this commit:

```
name                                                  old time/op    new time/op    delta
RangeQuery/expr=changes(a_one[1d]),steps=1-16            391µs ± 2%     542µs ± 1%  +38.58%  (p=0.000 n=9+8)
RangeQuery/expr=changes(a_one[1d]),steps=10-16           452µs ± 2%     617µs ± 2%  +36.48%  (p=0.000 n=10+10)
RangeQuery/expr=changes(a_one[1d]),steps=100-16         1.12ms ± 1%    1.36ms ± 2%  +21.58%  (p=0.000 n=8+10)
RangeQuery/expr=changes(a_one[1d]),steps=1000-16        7.83ms ± 1%    8.94ms ± 1%  +14.21%  (p=0.000 n=10+10)
RangeQuery/expr=changes(a_ten[1d]),steps=1-16           2.98ms ± 0%    3.30ms ± 1%  +10.67%  (p=0.000 n=9+10)
RangeQuery/expr=changes(a_ten[1d]),steps=10-16          3.66ms ± 1%    4.10ms ± 1%  +11.82%  (p=0.000 n=10+10)
RangeQuery/expr=changes(a_ten[1d]),steps=100-16         10.5ms ± 0%    11.8ms ± 1%  +12.50%  (p=0.000 n=8+10)
RangeQuery/expr=changes(a_ten[1d]),steps=1000-16        77.6ms ± 1%    87.4ms ± 1%  +12.63%  (p=0.000 n=9+9)
RangeQuery/expr=changes(a_hundred[1d]),steps=1-16       30.4ms ± 2%    32.8ms ± 1%   +8.01%  (p=0.000 n=10+10)
RangeQuery/expr=changes(a_hundred[1d]),steps=10-16      37.1ms ± 2%    40.6ms ± 2%   +9.64%  (p=0.000 n=10+10)
RangeQuery/expr=changes(a_hundred[1d]),steps=100-16      105ms ± 1%     117ms ± 1%  +11.69%  (p=0.000 n=10+10)
RangeQuery/expr=changes(a_hundred[1d]),steps=1000-16     783ms ± 3%     876ms ± 1%  +11.83%  (p=0.000 n=9+10)
```

And then runtime v2.39 compared to after this commit:

```
name                                                  old time/op    new time/op    delta
RangeQuery/expr=changes(a_one[1d]),steps=1-16            391µs ± 2%     547µs ± 1%  +39.84%  (p=0.000 n=9+8)
RangeQuery/expr=changes(a_one[1d]),steps=10-16           452µs ± 2%     616µs ± 2%  +36.15%  (p=0.000 n=10+10)
RangeQuery/expr=changes(a_one[1d]),steps=100-16         1.12ms ± 1%    1.26ms ± 1%  +12.20%  (p=0.000 n=8+10)
RangeQuery/expr=changes(a_one[1d]),steps=1000-16        7.83ms ± 1%    7.95ms ± 1%   +1.59%  (p=0.000 n=10+8)
RangeQuery/expr=changes(a_ten[1d]),steps=1-16           2.98ms ± 0%    3.38ms ± 2%  +13.49%  (p=0.000 n=9+10)
RangeQuery/expr=changes(a_ten[1d]),steps=10-16          3.66ms ± 1%    4.02ms ± 1%   +9.80%  (p=0.000 n=10+9)
RangeQuery/expr=changes(a_ten[1d]),steps=100-16         10.5ms ± 0%    10.8ms ± 1%   +3.08%  (p=0.000 n=8+10)
RangeQuery/expr=changes(a_ten[1d]),steps=1000-16        77.6ms ± 1%    78.1ms ± 1%   +0.58%  (p=0.035 n=9+10)
RangeQuery/expr=changes(a_hundred[1d]),steps=1-16       30.4ms ± 2%    33.5ms ± 4%  +10.18%  (p=0.000 n=10+10)
RangeQuery/expr=changes(a_hundred[1d]),steps=10-16      37.1ms ± 2%    40.0ms ± 1%   +7.98%  (p=0.000 n=10+10)
RangeQuery/expr=changes(a_hundred[1d]),steps=100-16      105ms ± 1%     107ms ± 1%   +1.92%  (p=0.000 n=10+10)
RangeQuery/expr=changes(a_hundred[1d]),steps=1000-16     783ms ± 3%     775ms ± 1%   -1.02%  (p=0.019 n=9+9)
```

In summary, the runtime doesn't really improve with this change for
queries with just a few steps. For queries with many steps, this
commit essentially reinstates the old performance. This is good
because the many-step queries are the one that matter most (longest
absolute runtime).

In terms of allocations, though, this commit doesn't make a dent at
all (numbers not shown). The reason is that most of the allocations
happen in the sampleRingIterator (in the storage package), which has
to be addressed in a separate commit.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2022-10-28 07:58:40 -07:00
+									return values[int(lowerIndex)].F*(1-weight) + values[int(upperIndex)].F*weight
-												Factor out quantile fucntion.

											
										
										
											2016-07-08 05:33:20 -07:00
+								}