promql: fix rate calculation with a counter reset after the 1st histogram

If a rate (or increase) is calculated on native histograms, and there is a counter reset between the 1st and 2nd histogram, we never have to touch the 1st histogram, so it doesn't even matter if it has an incompatible bucket layout. So we should not error out in that case. This commit simply nulls out the 1st histogram whenever such a counter reset is detected.

Signed-off-by: beorn7 <beorn@grafana.com>
2025-03-05 20:59:13 -08:00 · 2025-01-29 14:47:44 +01:00 · 2025-01-29 14:47:44 +01:00 · 2581c7d057
parent a8235d5dfd
commit 2581c7d057
2 changed files with 77 additions and 29 deletions
--- a/promql/functions.go
+++ b/promql/functions.go
@ -187,35 +187,48 @@ func extrapolatedRate(vals []parser.Value, args parser.Expressions, enh *EvalNod
 // not a histogram, and a warning wrapped in an annotation in that case.
 // Otherwise, it returns the calculated histogram and an empty annotation.
 func histogramRate(points []HPoint, isCounter bool, metricName string, pos posrange.PositionRange) (*histogram.FloatHistogram, annotations.Annotations) {
-	prev := points[0].H
-	usingCustomBuckets := prev.UsesCustomBuckets()
-	last := points[len(points)-1].H
+	var (
+		prev               = points[0].H
+		usingCustomBuckets = prev.UsesCustomBuckets()
+		last               = points[len(points)-1].H
+		annos              annotations.Annotations
+	)
+
 	if last == nil {
-		return nil, annotations.New().Add(annotations.NewMixedFloatsHistogramsWarning(metricName, pos))
+		return nil, annos.Add(annotations.NewMixedFloatsHistogramsWarning(metricName, pos))
 	}

-	minSchema := prev.Schema
-	if last.Schema < minSchema {
-		minSchema = last.Schema
+	// We check for gauge type histograms in the loop below, but the loop
+	// below does not run on the first and last point, so check the first
+	// and last point now.
+	if isCounter && (prev.CounterResetHint == histogram.GaugeType || last.CounterResetHint == histogram.GaugeType) {
+		annos.Add(annotations.NewNativeHistogramNotCounterWarning(metricName, pos))
+	}
+
+	// Null out the 1st sample if there is a counter reset between the 1st
+	// and 2nd. In this case, we want to ignore any incompatibility in the
+	// bucket layout of the 1st sample because we do not need to look at it.
+	if isCounter && len(points) > 1 {
+		second := points[1].H
+		if second != nil && second.DetectReset(prev) {
+			prev = &histogram.FloatHistogram{}
+			prev.Schema = second.Schema
+			prev.CustomValues = second.CustomValues
+			usingCustomBuckets = second.UsesCustomBuckets()
+		}
 	}

 	if last.UsesCustomBuckets() != usingCustomBuckets {
-		return nil, annotations.New().Add(annotations.NewMixedExponentialCustomHistogramsWarning(metricName, pos))
-	}
-
-	var annos annotations.Annotations
-
-	// We check for gauge type histograms in the loop below, but the loop below does not run on the first and last point,
-	// so check the first and last point now.
-	if isCounter && (prev.CounterResetHint == histogram.GaugeType || last.CounterResetHint == histogram.GaugeType) {
-		annos.Add(annotations.NewNativeHistogramNotCounterWarning(metricName, pos))
+		return nil, annos.Add(annotations.NewMixedExponentialCustomHistogramsWarning(metricName, pos))
 	}

 	// First iteration to find out two things:
 	// - What's the smallest relevant schema?
 	// - Are all data points histograms?
-	//   TODO(beorn7): Find a way to check that earlier, e.g. by handing in a
-	//   []FloatPoint and a []HistogramPoint separately.
+	minSchema := prev.Schema
+	if last.Schema < minSchema {
+		minSchema = last.Schema
+	}
 	for _, currPoint := range points[1 : len(points)-1] {
 		curr := currPoint.H
 		if curr == nil {
--- a/promql/promqltest/testdata/native_histograms.test
+++ b/promql/promqltest/testdata/native_histograms.test
@ -1013,7 +1013,7 @@ eval instant at 5m sum(custom_buckets_histogram)

 clear

-# Test 'this native histogram metric is not a gauge' warning for rate
+# Test 'this native histogram metric is not a counter' warning for rate
 load 30s
    some_metric {{schema:0 sum:1 count:1 buckets:[1] counter_reset_hint:gauge}} {{schema:0 sum:2 count:2 buckets:[2] counter_reset_hint:gauge}} {{schema:0 sum:3 count:3 buckets:[3] counter_reset_hint:gauge}}

@ -1022,7 +1022,7 @@ eval_warn instant at 30s rate(some_metric[1m])
    {} {{count:0.03333333333333333 sum:0.03333333333333333 buckets:[0.03333333333333333]}}

 # Test the case where we have more than two points for rate
-eval_warn instant at 1m rate(some_metric[1m])
+eval_warn instant at 1m rate(some_metric[1m30s])
    {} {{count:0.03333333333333333 sum:0.03333333333333333 buckets:[0.03333333333333333]}}

 clear
@ -1032,20 +1032,20 @@ load 30s
    some_metric {{schema:0 sum:1 count:1 buckets:[1]}} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} {{schema:0 sum:5 count:4 buckets:[1 2 1]}} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}}

 # Start and end with exponential, with custom in the middle.
-eval_warn instant at 1m rate(some_metric[1m])
+eval_warn instant at 1m rate(some_metric[1m30s])
    # Should produce no results.

 # Start and end with custom, with exponential in the middle.
-eval_warn instant at 1m30s rate(some_metric[1m])
+eval_warn instant at 1m30s rate(some_metric[1m30s])
    # Should produce no results.

-# Start with custom, end with exponential.
-eval_warn instant at 1m rate(some_metric[1m])
-    # Should produce no results.
+# Start with custom, end with exponential. Return the exponential histogram divided by 30.
+eval instant at 1m rate(some_metric[1m])
+    {} {{schema:0 sum:0.16666666666666666 count:0.13333333333333333 buckets:[0.03333333333333333 0.06666666666666667 0.03333333333333333]}}

-# Start with exponential, end with custom.
-eval_warn instant at 30s rate(some_metric[1m])
-    # Should produce no results.
+# Start with exponential, end with custom. Return the custom buckets histogram divided by 30.
+eval instant at 30s rate(some_metric[1m])
+    {} {{schema:-53 sum:0.03333333333333333 count:0.03333333333333333 custom_values:[5 10] buckets:[0.03333333333333333]}}

 clear

@ -1179,7 +1179,10 @@ eval_info range from 0 to 6m step 6m metric2 > metric2
 clear

 load 6m
-  nhcb_metric {{schema:-53 sum:1 count:1 custom_values:[2] buckets:[1]}} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}}
+  nhcb_metric {{schema:-53 sum:1 count:1 custom_values:[2] buckets:[1]}} {{schema:-53 sum:1 count:1 custom_values:[2] buckets:[1]}} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}}
+
+# If evaluating at 12m, the first two NHCBs have the same custom values
+# while the 3rd one has different ones.

 eval_warn instant at 12m sum_over_time(nhcb_metric[13m])

@ -1206,6 +1209,38 @@ eval_warn instant at 12m rate(nhcb_metric[13m])
 eval instant at 12m resets(nhcb_metric[13m])
 {} 1

+# Now doing the same again, but at 18m, where the first NHCB has
+# different custom_values compared to the other two. increase() and
+# rate() now work without a warning, because the 1st NHCB is ignored
+# after the counter reset. The other functions behave as before.
+
+eval_warn instant at 18m sum_over_time(nhcb_metric[13m])
+
+eval_warn instant at 18m avg_over_time(nhcb_metric[13m])
+
+eval instant at 18m last_over_time(nhcb_metric[13m])
+nhcb_metric{} {{schema:-53 sum:1 count:1 custom_values:[5 10] buckets:[1]}}
+
+eval instant at 18m count_over_time(nhcb_metric[13m])
+{} 3
+
+eval instant at 18m present_over_time(nhcb_metric[13m])
+{} 1
+
+eval instant at 18m changes(nhcb_metric[13m])
+{} 1
+
+eval_warn instant at 18m delta(nhcb_metric[13m])
+
+eval instant at 18m increase(nhcb_metric[13m])
+{} {{schema:-53 count:1.0833333333333333 sum:1.0833333333333333 custom_values:[5 10] buckets:[1.0833333333333333]}}
+
+eval instant at 18m rate(nhcb_metric[13m])
+{} {{schema:-53 count:0.0013888888888888887 sum:0.0013888888888888887 custom_values:[5 10] buckets:[0.0013888888888888887]}}
+
+eval instant at 18m resets(nhcb_metric[13m])
+{} 1
+
 clear

 load 1m