Mirror of https://github.com/prometheus/prometheus.git
Simplify trickier estimation cases by not estimating at all.
commit 31235c351f
parent f71a97b460
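Instead of estimating how many chunks merging or re-encoding will produce, build the chunk iterator and count what it actually emits. This drops the heap-based estimateCompactedChunkCount and the hand-rolled estimate in seriesToChunkEncoder.EstimatedChunkCount, both of which could drift from the real count around duplicate samples, encoding switches, histogram counter resets, and stale markers. Counting is more expensive, but these methods are not expected to be called on the ingester query path in Mimir; they are kept so that things don't break if that assumption ever changes.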
storage/merge.go | 105
@@ -659,98 +659,39 @@ func NewCompactingChunkSeriesMerger(mergeFunc VerticalSeriesMergeFunc) VerticalChunkSeriesMergeFunc
 			return nil
 		}
 
+		chunkIteratorFn := func(chunks.Iterator) chunks.Iterator {
+			iterators := make([]chunks.Iterator, 0, len(series))
+			for _, s := range series {
+				iterators = append(iterators, s.Iterator(nil))
+			}
+			return &compactChunkIterator{
+				mergeFunc: mergeFunc,
+				iterators: iterators,
+			}
+		}
+
 		return &ChunkSeriesEntry{
 			Lset: series[0].Labels(),
-			ChunkIteratorFn: func(chunks.Iterator) chunks.Iterator {
-				iterators := make([]chunks.Iterator, 0, len(series))
-				for _, s := range series {
-					iterators = append(iterators, s.Iterator(nil))
-				}
-				return &compactChunkIterator{
-					mergeFunc: mergeFunc,
-					iterators: iterators,
-				}
-			},
+			ChunkIteratorFn: chunkIteratorFn,
 			ChunkCountFn: func() int {
-				return estimateCompactedChunkCount(series)
+				// This method is expensive, but we don't expect to ever actually use this on the ingester query path in Mimir -
+				// it's just here to ensure things don't break if this assumption ever changes.
+				// Ingesters return uncompacted chunks to queriers, so this method is never called.
+				return countChunks(chunkIteratorFn)
 			},
 		}
 	}
 }
 
-// estimateCompactedChunkCount computes an estimate of the resulting number of chunks
-// after compacting series.
-//
-// The estimate is imperfect in a few ways, due to the fact it does not examine individual samples:
-// - it does not check for duplicate samples in overlapping chunks, and so may overestimate
-//   the resulting number of chunks if duplicate samples are present, or if an entire chunk is
-//   duplicated
-// - it does not check if overlapping chunks have samples that swap back and forth between
-//   different encodings over time, and so may underestimate the resulting number of chunks
-//   (we don't expect this to happen often though, as switching from float samples to histograms
-//   involves changing the instrumentation)
-func estimateCompactedChunkCount(series []ChunkSeries) int {
-	h := make(chunkIteratorHeap, 0, len(series))
-
-	for _, s := range series {
-		iter := s.Iterator(nil)
-		if iter.Next() {
-			heap.Push(&h, iter)
-		}
-	}
-
-	totalChunkCount := 0
-
-	for len(h) > 0 {
-		iter := heap.Pop(&h).(chunks.Iterator)
-		prev := iter.At()
-		if iter.Next() {
-			heap.Push(&h, iter)
-		}
-
-		chunkCountForThisTimePeriod := 0
-		sampleCount := prev.Chunk.NumSamples()
-		maxTime := prev.MaxTime
-
-		// Find all overlapping chunks and estimate the number of resulting chunks.
-		for len(h) > 0 && h[0].At().MinTime <= maxTime {
-			iter := heap.Pop(&h).(chunks.Iterator)
-			next := iter.At()
-			if iter.Next() {
-				heap.Push(&h, iter)
-			}
-
-			if next.MaxTime > maxTime {
-				maxTime = next.MaxTime
-			}
-
-			if prev.Chunk.Encoding() != next.Chunk.Encoding() {
-				// If we have more than seriesToChunkEncoderSplit samples, account for the additional chunks we'll create.
-				chunkCountForThisTimePeriod += sampleCount / seriesToChunkEncoderSplit
-				if sampleCount%seriesToChunkEncoderSplit > 0 {
-					chunkCountForThisTimePeriod++
-				}
-
-				sampleCount = 0
-			}
-
-			sampleCount += next.Chunk.NumSamples()
-			prev = next
-		}
-
-		// If we have more than seriesToChunkEncoderSplit samples, account for the additional chunks we'll create.
-		chunkCountForThisTimePeriod += sampleCount / seriesToChunkEncoderSplit
-		if sampleCount%seriesToChunkEncoderSplit > 0 {
-			chunkCountForThisTimePeriod++
-		}
-		if chunkCountForThisTimePeriod == 0 {
-			chunkCountForThisTimePeriod = 1 // We'll always create at least one chunk.
-		}
-
-		totalChunkCount += chunkCountForThisTimePeriod
-	}
-
-	return totalChunkCount
+func countChunks(chunkIteratorFn func(chunks.Iterator) chunks.Iterator) int {
+	chunkCount := 0
+	it := chunkIteratorFn(nil)
+
+	for it.Next() {
+		chunkCount++
+	}
+
+	return chunkCount
 }
 
 // compactChunkIterator is responsible to compact chunks from different iterators of the same time series into single chainSeries.
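The new ChunkCountFn delegates to countChunks, which simply drains an iterator. Below is a minimal, self-contained sketch of that counting pattern; the chunkIterator interface and fixedIterator type are illustrative stand-ins for the real chunks.Iterator and compactChunkIterator, not part of the patch:

```go
package main

import "fmt"

// chunkIterator stands in for chunks.Iterator, reduced to the single
// method the counting loop needs (the real interface also exposes
// At() and Err()).
type chunkIterator interface {
	Next() bool
}

// fixedIterator yields a fixed number of chunks, standing in for the
// compactChunkIterator that chunkIteratorFn builds in the patch.
type fixedIterator struct {
	remaining int
}

func (it *fixedIterator) Next() bool {
	if it.remaining == 0 {
		return false
	}
	it.remaining--
	return true
}

// countChunks mirrors the helper added by this commit: instead of
// estimating, materialise the iterator and count every chunk it emits.
// Exact by construction, but linear in the number of chunks produced.
func countChunks(chunkIteratorFn func() chunkIterator) int {
	chunkCount := 0
	it := chunkIteratorFn()

	for it.Next() {
		chunkCount++
	}

	return chunkCount
}

func main() {
	fn := func() chunkIterator { return &fixedIterator{remaining: 3} }
	fmt.Println(countChunks(fn)) // prints 3
}
```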
@@ -394,10 +394,9 @@ func TestCompactingChunkSeriesMerger(t *testing.T) {
 	}
 
 	for _, tc := range []struct {
 		name     string
 		input    []ChunkSeries
 		expected ChunkSeries
-		expectedChunksEstimate int
 	}{
 		{
 			name: "single empty series",
@@ -492,7 +491,6 @@ func TestCompactingChunkSeriesMerger(t *testing.T) {
 			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
 				tsdbutil.GenerateSamples(0, 110),
 			),
-			expectedChunksEstimate: 2, // Estimation doesn't consider duplicate series when estimating the number of chunks.
 		},
 		{
 			name: "150 overlapping samples, split chunk",
@@ -530,7 +528,6 @@ func TestCompactingChunkSeriesMerger(t *testing.T) {
 				[]tsdbutil.Sample{fSample{12, 12}, fSample{14, 14}},
 				[]tsdbutil.Sample{histogramSample(15)},
 			),
-			expectedChunksEstimate: 4, // Estimation assumes overlapping chunks don't swap back and forth between different encodings.
 		},
 		{
 			name: "float histogram chunks overlapping",
@@ -557,7 +554,6 @@ func TestCompactingChunkSeriesMerger(t *testing.T) {
 				[]tsdbutil.Sample{fSample{12, 12}, fSample{14, 14}},
 				[]tsdbutil.Sample{floatHistogramSample(15)},
 			),
-			expectedChunksEstimate: 4, // Estimation assumes overlapping chunks don't swap back and forth between different encodings.
 		},
 		{
 			name: "float histogram chunks overlapping with histogram chunks",
@@ -572,7 +568,6 @@ func TestCompactingChunkSeriesMerger(t *testing.T) {
 				[]tsdbutil.Sample{histogramSample(12), histogramSample(14)},
 				[]tsdbutil.Sample{floatHistogramSample(15)},
 			),
-			expectedChunksEstimate: 4, // Estimation assumes overlapping chunks don't swap back and forth between different encodings.
 		},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
@@ -584,11 +579,7 @@ func TestCompactingChunkSeriesMerger(t *testing.T) {
 			require.Equal(t, expErr, actErr)
 			require.Equal(t, expChks, actChks)
 
-			if tc.expectedChunksEstimate == 0 {
-				tc.expectedChunksEstimate = len(actChks)
-			}
-
-			require.Equalf(t, tc.expectedChunksEstimate, merged.EstimatedChunkCount(), "expected estimate of %v chunks, actual chunks are: %v", tc.expectedChunksEstimate, actChks)
+			require.Len(t, actChks, merged.EstimatedChunkCount())
 		})
 	}
 }
@@ -403,35 +403,15 @@ func (s *seriesToChunkEncoder) Iterator(it chunks.Iterator) chunks.Iterator {
 }
 
 // EstimatedChunkCount returns an estimate of the number of chunks produced by Iterator.
-//
-// It is perfectly accurate except when histograms are present in series. When histograms are
-// present, EstimatedChunkCount will underestimate the number of chunks produced, as the
-// estimation does not consider individual samples and so triggers for new chunks such as
-// counter resets, changes to the bucket schema and changes to the zero threshold are not
-// taken into account.
 func (s *seriesToChunkEncoder) EstimatedChunkCount() int {
+	// This method is expensive, but we don't expect to ever actually use this on the ingester query path in Mimir -
+	// it's just here to ensure things don't break if this assumption ever changes.
+
 	chunkCount := 0
-	seriesIter := s.Series.Iterator(nil)
-	lastType := chunkenc.ValNone
-	samplesInChunk := 0
+	it := s.Iterator(nil)
 
-	for typ := seriesIter.Next(); typ != chunkenc.ValNone; typ = seriesIter.Next() {
-		if chunkCount == 0 {
-			// We'll always have at least one chunk if there's at least one sample.
-			chunkCount++
-		} else if lastType != typ {
-			// If the sample type changes, then we'll cut a new chunk.
-			chunkCount++
-			samplesInChunk = 0
-		}
-
-		if samplesInChunk == seriesToChunkEncoderSplit {
-			chunkCount++
-			samplesInChunk = 0
-		}
-
-		lastType = typ
-		samplesInChunk++
+	for it.Next() {
+		chunkCount++
 	}
 
 	return chunkCount
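Design note: the removed estimator reimplemented the encoder's chunk-cutting rules (sample-type changes plus the seriesToChunkEncoderSplit sample limit) and therefore drifted from reality whenever histograms forced extra cuts, such as counter resets, bucket-schema changes, or zero-threshold changes, as the deleted doc comment warned. Running s.Iterator(nil) and counting instead reuses the code that actually cuts chunks, so the count is exact by construction; the cost is re-encoding the series each time the method is called.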
@@ -232,7 +232,6 @@ func TestSeriesToChunks(t *testing.T) {
 type histogramTest struct {
 	samples                     []tsdbutil.Sample
 	expectedCounterResetHeaders []chunkenc.CounterResetHeader
-	expectedChunkCountEstimate  int
 }
 
 func TestHistogramSeriesToChunks(t *testing.T) {
@@ -393,7 +392,6 @@ func TestHistogramSeriesToChunks(t *testing.T) {
 				hSample{t: 2, h: h1},
 			},
 			expectedCounterResetHeaders: []chunkenc.CounterResetHeader{chunkenc.UnknownCounterReset, chunkenc.CounterReset},
-			expectedChunkCountEstimate:  1, // Estimate doesn't consider counter resets.
 		},
 		"histogram and stale sample encoded to two chunks": {
 			samples: []tsdbutil.Sample{
@@ -401,7 +399,6 @@ func TestHistogramSeriesToChunks(t *testing.T) {
 				hSample{t: 2, h: h1},
 			},
 			expectedCounterResetHeaders: []chunkenc.CounterResetHeader{chunkenc.UnknownCounterReset, chunkenc.UnknownCounterReset},
-			expectedChunkCountEstimate:  1, // Estimate doesn't consider stale markers.
 		},
 		"histogram and reduction in bucket encoded to two chunks": {
 			samples: []tsdbutil.Sample{
@@ -409,7 +406,6 @@ func TestHistogramSeriesToChunks(t *testing.T) {
 				hSample{t: 2, h: h2down},
 			},
 			expectedCounterResetHeaders: []chunkenc.CounterResetHeader{chunkenc.UnknownCounterReset, chunkenc.CounterReset},
-			expectedChunkCountEstimate:  1, // Estimate doesn't consider counter resets.
 		},
 		// Float histograms.
 		"single float histogram to single chunk": {
@@ -431,7 +427,6 @@ func TestHistogramSeriesToChunks(t *testing.T) {
 				fhSample{t: 2, fh: fh1},
 			},
 			expectedCounterResetHeaders: []chunkenc.CounterResetHeader{chunkenc.UnknownCounterReset, chunkenc.CounterReset},
-			expectedChunkCountEstimate:  1, // Estimate doesn't consider counter resets.
 		},
 		"float histogram and stale sample encoded to two chunks": {
 			samples: []tsdbutil.Sample{
@@ -439,7 +434,6 @@ func TestHistogramSeriesToChunks(t *testing.T) {
 				fhSample{t: 2, fh: fh1},
 			},
 			expectedCounterResetHeaders: []chunkenc.CounterResetHeader{chunkenc.UnknownCounterReset, chunkenc.UnknownCounterReset},
-			expectedChunkCountEstimate:  1, // Estimate doesn't consider stale markers.
 		},
 		"float histogram and reduction in bucket encoded to two chunks": {
 			samples: []tsdbutil.Sample{
@@ -447,7 +441,6 @@ func TestHistogramSeriesToChunks(t *testing.T) {
 				fhSample{t: 2, fh: fh2down},
 			},
 			expectedCounterResetHeaders: []chunkenc.CounterResetHeader{chunkenc.UnknownCounterReset, chunkenc.CounterReset},
-			expectedChunkCountEstimate:  1, // Estimate doesn't consider counter resets.
 		},
 		// Mixed.
 		"histogram and float histogram encoded to two chunks": {
@@ -541,11 +534,7 @@ func testHistogramsSeriesToChunks(t *testing.T, test histogramTest) {
 	require.NoError(t, err)
 	require.Equal(t, len(test.expectedCounterResetHeaders), len(chks))
 
-	if test.expectedChunkCountEstimate == 0 {
-		test.expectedChunkCountEstimate = len(chks)
-	}
-
-	require.Equal(t, test.expectedChunkCountEstimate, encoder.EstimatedChunkCount())
+	require.Len(t, chks, encoder.EstimatedChunkCount())
 
 	// Decode all encoded samples and assert they are equal to the original ones.
 	encodedSamples := expandChunks(chks)
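Both test suites now assert the same invariant with require.Len: the reported chunk count must equal the number of chunks actually produced. Note testify's argument order here: the object under test comes first and the expected length second, so require.Len(t, chks, encoder.EstimatedChunkCount()) reads as "len(chks) must equal the estimate".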