tsdb: Fix histogram validation

So far, `ValidateHistogram` would not detect if the count did not include the count in the zero bucket. This commit fixes the problem and updates all the tests that have been undetected offenders so far. Note that this problem would only ever create false negatives, so we never falsely rejected to store a histogram because of it. On the other hand, `ValidateFloatHistogram` has been to strict with the count being at least as large as the sum of the counts in all the buckets. Float precision issues could create false positives here, see products of PromQL evaluations, it's actually quite hard to put an upper limit no the floating point imprecision. Users could produce the weirdest expressions, maxing out float precision problems. Therefore, this commit simply removes that particular check from `ValidateFloatHistogram`. Signed-off-by: beorn7 <beorn@grafana.com>
2025-03-05 20:59:13 -08:00 · 2023-08-22 21:51:56 +02:00 · 2023-08-22 21:51:56 +02:00 · aa82fe198f
parent b6e8d6dfae
commit aa82fe198f
9 changed files with 59 additions and 50 deletions
--- a/promql/engine_test.go
+++ b/promql/engine_test.go
@ -3182,7 +3182,7 @@ func TestNativeHistogramRate(t *testing.T) {
 		Schema:           1,
 		ZeroThreshold:    0.001,
 		ZeroCount:        1. / 15.,
-		Count:            8. / 15.,
+		Count:            9. / 15.,
 		Sum:              1.226666666666667,
 		PositiveSpans:    []histogram.Span{{Offset: 0, Length: 2}, {Offset: 1, Length: 2}},
 		PositiveBuckets:  []float64{1. / 15., 1. / 15., 1. / 15., 1. / 15.},
@ -3223,7 +3223,7 @@ func TestNativeFloatHistogramRate(t *testing.T) {
 		Schema:           1,
 		ZeroThreshold:    0.001,
 		ZeroCount:        1. / 15.,
-		Count:            8. / 15.,
+		Count:            9. / 15.,
 		Sum:              1.226666666666667,
 		PositiveSpans:    []histogram.Span{{Offset: 0, Length: 2}, {Offset: 1, Length: 2}},
 		PositiveBuckets:  []float64{1. / 15., 1. / 15., 1. / 15., 1. / 15.},
@ -3996,7 +3996,7 @@ func TestNativeHistogram_Sum_Count_Add_AvgOperator(t *testing.T) {
 				{
 					CounterResetHint: histogram.GaugeType,
 					Schema:           0,
-					Count:            21,
+					Count:            25,
 					Sum:              1234.5,
 					ZeroThreshold:    0.001,
 					ZeroCount:        4,
@ -4014,7 +4014,7 @@ func TestNativeHistogram_Sum_Count_Add_AvgOperator(t *testing.T) {
 				{
 					CounterResetHint: histogram.GaugeType,
 					Schema:           0,
-					Count:            36,
+					Count:            41,
 					Sum:              2345.6,
 					ZeroThreshold:    0.001,
 					ZeroCount:        5,
@ -4034,7 +4034,7 @@ func TestNativeHistogram_Sum_Count_Add_AvgOperator(t *testing.T) {
 				{
 					CounterResetHint: histogram.GaugeType,
 					Schema:           0,
-					Count:            36,
+					Count:            41,
 					Sum:              1111.1,
 					ZeroThreshold:    0.001,
 					ZeroCount:        5,
@ -4061,7 +4061,7 @@ func TestNativeHistogram_Sum_Count_Add_AvgOperator(t *testing.T) {
 				Schema:           0,
 				ZeroThreshold:    0.001,
 				ZeroCount:        14,
-				Count:            93,
+				Count:            107,
 				Sum:              4691.2,
 				PositiveSpans: []histogram.Span{
 					{Offset: 0, Length: 7},
@ -4078,7 +4078,7 @@ func TestNativeHistogram_Sum_Count_Add_AvgOperator(t *testing.T) {
 				Schema:           0,
 				ZeroThreshold:    0.001,
 				ZeroCount:        3.5,
-				Count:            23.25,
+				Count:            26.75,
 				Sum:              1172.8,
 				PositiveSpans: []histogram.Span{
 					{Offset: 0, Length: 7},
@ -4189,7 +4189,7 @@ func TestNativeHistogram_SubOperator(t *testing.T) {
 			histograms: []histogram.Histogram{
 				{
 					Schema:        0,
-					Count:         36,
+					Count:         41,
 					Sum:           2345.6,
 					ZeroThreshold: 0.001,
 					ZeroCount:     5,
@ -4224,7 +4224,7 @@ func TestNativeHistogram_SubOperator(t *testing.T) {
 			},
 			expected: histogram.FloatHistogram{
 				Schema:        0,
-				Count:         25,
+				Count:         30,
 				Sum:           1111.1,
 				ZeroThreshold: 0.001,
 				ZeroCount:     2,
@ -4245,7 +4245,7 @@ func TestNativeHistogram_SubOperator(t *testing.T) {
 			histograms: []histogram.Histogram{
 				{
 					Schema:        0,
-					Count:         36,
+					Count:         41,
 					Sum:           2345.6,
 					ZeroThreshold: 0.001,
 					ZeroCount:     5,
@ -4280,7 +4280,7 @@ func TestNativeHistogram_SubOperator(t *testing.T) {
 			},
 			expected: histogram.FloatHistogram{
 				Schema:        0,
-				Count:         25,
+				Count:         30,
 				Sum:           1111.1,
 				ZeroThreshold: 0.001,
 				ZeroCount:     2,
@ -4315,7 +4315,7 @@ func TestNativeHistogram_SubOperator(t *testing.T) {
 				},
 				{
 					Schema:        0,
-					Count:         36,
+					Count:         41,
 					Sum:           2345.6,
 					ZeroThreshold: 0.001,
 					ZeroCount:     5,
@ -4335,7 +4335,7 @@ func TestNativeHistogram_SubOperator(t *testing.T) {
 			},
 			expected: histogram.FloatHistogram{
 				Schema:        0,
-				Count:         -25,
+				Count:         -30,
 				Sum:           -1111.1,
 				ZeroThreshold: 0.001,
 				ZeroCount:     -2,
--- a/storage/remote/read_handler_test.go
+++ b/storage/remote/read_handler_test.go
--- a/tsdb/block_test.go
+++ b/tsdb/block_test.go
@ -621,7 +621,7 @@ func genSeries(totalSeries, labelCount int, mint, maxt int64) []storage.Series {
 func genHistogramSeries(totalSeries, labelCount int, mint, maxt, step int64, floatHistogram bool) []storage.Series {
 	return genSeriesFromSampleGenerator(totalSeries, labelCount, mint, maxt, step, func(ts int64) tsdbutil.Sample {
 		h := &histogram.Histogram{
-			Count:         5 + uint64(ts*4),
+			Count:         7 + uint64(ts*5),
 			ZeroCount:     2 + uint64(ts),
 			ZeroThreshold: 0.001,
 			Sum:           18.4 * rand.Float64(),
@ -660,7 +660,7 @@ func genHistogramAndFloatSeries(totalSeries, labelCount int, mint, maxt, step in
 			s = sample{t: ts, f: rand.Float64()}
 		} else {
 			h := &histogram.Histogram{
-				Count:         5 + uint64(ts*4),
+				Count:         7 + uint64(ts*5),
 				ZeroCount:     2 + uint64(ts),
 				ZeroThreshold: 0.001,
 				Sum:           18.4 * rand.Float64(),
--- a/tsdb/compact_test.go
+++ b/tsdb/compact_test.go
@ -1364,7 +1364,7 @@ func TestHeadCompactionWithHistograms(t *testing.T) {
 				exp1, exp2, exp3, exp4 []tsdbutil.Sample
 			)
 			h := &histogram.Histogram{
-				Count:         11,
+				Count:         15,
 				ZeroCount:     4,
 				ZeroThreshold: 0.001,
 				Sum:           35.5,
--- a/tsdb/db_test.go
+++ b/tsdb/db_test.go
@ -6133,7 +6133,7 @@ func testHistogramAppendAndQueryHelper(t *testing.T, floatHistogram bool) {
 	}

 	baseH := &histogram.Histogram{
-		Count:         11,
+		Count:         15,
 		ZeroCount:     4,
 		ZeroThreshold: 0.001,
 		Sum:           35.5,
@ -6506,7 +6506,7 @@ func TestNativeHistogramFlag(t *testing.T) {
 		require.NoError(t, db.Close())
 	})
 	h := &histogram.Histogram{
-		Count:         6,
+		Count:         10,
 		ZeroCount:     4,
 		ZeroThreshold: 0.001,
 		Sum:           35.5,
--- a/tsdb/head_append.go
+++ b/tsdb/head_append.go
@ -659,7 +659,7 @@ func ValidateHistogram(h *histogram.Histogram) error {
 		return errors.Wrap(err, "positive side")
 	}

-	if c := nCount + pCount; c > h.Count {
+	if c := nCount + pCount + h.ZeroCount; c > h.Count {
 		return errors.Wrap(
 			storage.ErrHistogramCountNotBigEnough,
 			fmt.Sprintf("%d observations found in buckets, but the Count field is %d", c, h.Count),
@ -686,12 +686,9 @@ func ValidateFloatHistogram(h *histogram.FloatHistogram) error {
 		return errors.Wrap(err, "positive side")
 	}

-	if c := nCount + pCount; c > h.Count {
-		return errors.Wrap(
-			storage.ErrHistogramCountNotBigEnough,
-			fmt.Sprintf("%f observations found in buckets, but the Count field is %f", c, h.Count),
-		)
-	}
+	// We do not check for h.Count being at least as large as the sum of the
+	// counts in the buckets because floating point precision issues can
+	// create false positives here.

 	return nil
 }
--- a/tsdb/head_test.go
+++ b/tsdb/head_test.go
@ -4710,42 +4710,42 @@ func TestReplayAfterMmapReplayError(t *testing.T) {

 func TestHistogramValidation(t *testing.T) {
 	tests := map[string]struct {
-		h           *histogram.Histogram
-		errMsg      string
-		errMsgFloat string // To be considered for float histogram only if it is non-empty.
+		h         *histogram.Histogram
+		errMsg    string
+		skipFloat bool
 	}{
 		"valid histogram": {
 			h: tsdbutil.GenerateTestHistograms(1)[0],
 		},
-		"rejects histogram who has too few negative buckets": {
+		"rejects histogram that has too few negative buckets": {
 			h: &histogram.Histogram{
 				NegativeSpans:   []histogram.Span{{Offset: 0, Length: 1}},
 				NegativeBuckets: []int64{},
 			},
 			errMsg: `negative side: spans need 1 buckets, have 0 buckets`,
 		},
-		"rejects histogram who has too few positive buckets": {
+		"rejects histogram that has too few positive buckets": {
 			h: &histogram.Histogram{
 				PositiveSpans:   []histogram.Span{{Offset: 0, Length: 1}},
 				PositiveBuckets: []int64{},
 			},
 			errMsg: `positive side: spans need 1 buckets, have 0 buckets`,
 		},
-		"rejects histogram who has too many negative buckets": {
+		"rejects histogram that has too many negative buckets": {
 			h: &histogram.Histogram{
 				NegativeSpans:   []histogram.Span{{Offset: 0, Length: 1}},
 				NegativeBuckets: []int64{1, 2},
 			},
 			errMsg: `negative side: spans need 1 buckets, have 2 buckets`,
 		},
-		"rejects histogram who has too many positive buckets": {
+		"rejects histogram that has too many positive buckets": {
 			h: &histogram.Histogram{
 				PositiveSpans:   []histogram.Span{{Offset: 0, Length: 1}},
 				PositiveBuckets: []int64{1, 2},
 			},
 			errMsg: `positive side: spans need 1 buckets, have 2 buckets`,
 		},
-		"rejects a histogram which has a negative span with a negative offset": {
+		"rejects a histogram that has a negative span with a negative offset": {
 			h: &histogram.Histogram{
 				NegativeSpans:   []histogram.Span{{Offset: -1, Length: 1}, {Offset: -1, Length: 1}},
 				NegativeBuckets: []int64{1, 2},
@ -4759,21 +4759,21 @@ func TestHistogramValidation(t *testing.T) {
 			},
 			errMsg: `positive side: span number 2 with offset -1`,
 		},
-		"rejects a histogram which has a negative bucket with a negative count": {
+		"rejects a histogram that has a negative bucket with a negative count": {
 			h: &histogram.Histogram{
 				NegativeSpans:   []histogram.Span{{Offset: -1, Length: 1}},
 				NegativeBuckets: []int64{-1},
 			},
 			errMsg: `negative side: bucket number 1 has observation count of -1`,
 		},
-		"rejects a histogram which has a positive bucket with a negative count": {
+		"rejects a histogram that has a positive bucket with a negative count": {
 			h: &histogram.Histogram{
 				PositiveSpans:   []histogram.Span{{Offset: -1, Length: 1}},
 				PositiveBuckets: []int64{-1},
 			},
 			errMsg: `positive side: bucket number 1 has observation count of -1`,
 		},
-		"rejects a histogram which which has a lower count than count in buckets": {
+		"rejects a histogram that has a lower count than count in buckets": {
 			h: &histogram.Histogram{
 				Count:           0,
 				NegativeSpans:   []histogram.Span{{Offset: -1, Length: 1}},
@ -4781,25 +4781,36 @@ func TestHistogramValidation(t *testing.T) {
 				NegativeBuckets: []int64{1},
 				PositiveBuckets: []int64{1},
 			},
-			errMsg:      `2 observations found in buckets, but the Count field is 0`,
-			errMsgFloat: `2.000000 observations found in buckets, but the Count field is 0.000000`,
+			errMsg:    `2 observations found in buckets, but the Count field is 0`,
+			skipFloat: true,
+		},
+		"rejects a histogram that doesn't count the zero bucket in its count": {
+			h: &histogram.Histogram{
+				Count:           2,
+				ZeroCount:       1,
+				NegativeSpans:   []histogram.Span{{Offset: -1, Length: 1}},
+				PositiveSpans:   []histogram.Span{{Offset: -1, Length: 1}},
+				NegativeBuckets: []int64{1},
+				PositiveBuckets: []int64{1},
+			},
+			errMsg:    `3 observations found in buckets, but the Count field is 2`,
+			skipFloat: true,
 		},
 	}

 	for testName, tc := range tests {
 		t.Run(testName, func(t *testing.T) {
-			switch err := ValidateHistogram(tc.h); {
-			case tc.errMsg != "":
+			if err := ValidateHistogram(tc.h); tc.errMsg != "" {
 				require.ErrorContains(t, err, tc.errMsg)
-			default:
+			} else {
 				require.NoError(t, err)
 			}
-			switch err := ValidateFloatHistogram(tc.h.ToFloat()); {
-			case tc.errMsgFloat != "":
-				require.ErrorContains(t, err, tc.errMsgFloat)
-			case tc.errMsg != "":
+			if tc.skipFloat {
+				return
+			}
+			if err := ValidateFloatHistogram(tc.h.ToFloat()); tc.errMsg != "" {
 				require.ErrorContains(t, err, tc.errMsg)
-			default:
+			} else {
 				require.NoError(t, err)
 			}
 		})
--- a/tsdb/tsdbutil/histogram.go
+++ b/tsdb/tsdbutil/histogram.go
@ -33,7 +33,7 @@ func GenerateTestHistograms(n int) (r []*histogram.Histogram) {
 // GenerateTestHistogram but it is up to the user to set any known counter reset hint.
 func GenerateTestHistogram(i int) *histogram.Histogram {
 	return &histogram.Histogram{
-		Count:         10 + uint64(i*8),
+		Count:         12 + uint64(i*9),
 		ZeroCount:     2 + uint64(i),
 		ZeroThreshold: 0.001,
 		Sum:           18.4 * float64(i+1),
@ -78,7 +78,7 @@ func GenerateTestFloatHistograms(n int) (r []*histogram.FloatHistogram) {
 // GenerateTestFloatHistogram but it is up to the user to set any known counter reset hint.
 func GenerateTestFloatHistogram(i int) *histogram.FloatHistogram {
 	return &histogram.FloatHistogram{
-		Count:         10 + float64(i*8),
+		Count:         12 + float64(i*9),
 		ZeroCount:     2 + float64(i),
 		ZeroThreshold: 0.001,
 		Sum:           18.4 * float64(i+1),
--- a/web/federate_test.go
+++ b/web/federate_test.go
@ -306,7 +306,7 @@ func TestFederationWithNativeHistograms(t *testing.T) {

 	db := storage.DB
 	hist := &histogram.Histogram{
-		Count:         10,
+		Count:         12,
 		ZeroCount:     2,
 		ZeroThreshold: 0.001,
 		Sum:           39.4,
@ -359,6 +359,7 @@ func TestFederationWithNativeHistograms(t *testing.T) {
 			})
 		default:
 			hist.ZeroCount++
+			hist.Count++
 			_, err = app.AppendHistogram(0, l, 100*60*1000, hist.Copy(), nil)
 			expVec = append(expVec, promql.Sample{
 				T:      100 * 60 * 1000,