track OOO-ness in tsdb into a histogram (#56)

* track OOO-ness in tsdb into a histogram

This should help us build an understanding of customer requirements,
i.e. how far back out-of-order samples tend to go.
Note: this has already been discussed, reviewed and approved on:
https://github.com/grafana/mimir/pull/488

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>

* update the test files which were not in mimir/vendor
Dieter Plaetinck 2021-11-29 06:14:35 -05:00 committed by GitHub
parent 848f3df106
commit 757f57e509
4 changed files with 48 additions and 32 deletions
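
For orientation before reading the diff: the change adds a single head-level histogram whose observations are the out-of-order delay in seconds. Below is a minimal standalone sketch of the same idea using the public client_golang API rather than the vendored TSDB code; the registry call and the 90-minute example value are illustrative assumptions, not part of this commit.

package main

import "github.com/prometheus/client_golang/prometheus"

func main() {
    // Same shape as the metric added below: observations are in seconds,
    // with buckets ranging from 10 minutes up to 12 hours.
    oooHistogram := prometheus.NewHistogram(prometheus.HistogramOpts{
        Name: "prometheus_tsdb_sample_ooo_delta",
        Help: "Delta in seconds by which a sample is considered out of order.",
        Buckets: []float64{
            60 * 10, 60 * 30, 60 * 60,
            60 * 60 * 2, 60 * 60 * 3, 60 * 60 * 6, 60 * 60 * 12,
        },
    })
    prometheus.MustRegister(oooHistogram)

    // TSDB timestamps are in milliseconds; a sample arriving 90 minutes
    // behind the head chunk would be observed as 5400 seconds.
    deltaMS := int64(90 * 60 * 1000)
    oooHistogram.Observe(float64(deltaMS) / 1000)
}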


@@ -282,6 +282,7 @@ type headMetrics struct {
     checkpointCreationTotal prometheus.Counter
     mmapChunkCorruptionTotal prometheus.Counter
     snapshotReplayErrorTotal prometheus.Counter // Will be either 0 or 1.
+    oooHistogram prometheus.Histogram
 }
 func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
@@ -380,6 +381,20 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
             Name: "prometheus_tsdb_snapshot_replay_error_total",
             Help: "Total number snapshot replays that failed.",
         }),
+        oooHistogram: prometheus.NewHistogram(prometheus.HistogramOpts{
+            Name: "prometheus_tsdb_sample_ooo_delta",
+            Help: "Delta in seconds by which a sample is considered out of order.",
+            Buckets: []float64{
+                // Note that mimir distributor only gives us a range of wallclock-12h to wallclock+15min
+                60 * 10,      // 10 min
+                60 * 30,      // 30 min
+                60 * 60,      // 60 min
+                60 * 60 * 2,  // 2h
+                60 * 60 * 3,  // 3h
+                60 * 60 * 6,  // 6h
+                60 * 60 * 12, // 12h
+            },
+        }),
     }
     if r != nil {
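
To sanity-check which bucket a given delay lands in, one can observe a value and dump the cumulative bucket counts through the client_model types. This is an illustrative, test-style snippet only; it is not part of the commit, and the 45-minute value is an arbitrary assumption.

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
    dto "github.com/prometheus/client_model/go"
)

func main() {
    h := prometheus.NewHistogram(prometheus.HistogramOpts{
        Name:    "prometheus_tsdb_sample_ooo_delta",
        Help:    "Delta in seconds by which a sample is considered out of order.",
        Buckets: []float64{60 * 10, 60 * 30, 60 * 60, 60 * 60 * 2, 60 * 60 * 3, 60 * 60 * 6, 60 * 60 * 12},
    })

    // 45 minutes out of order = 2700s: counted in the le=3600 bucket and above.
    h.Observe(45 * 60)

    var m dto.Metric
    if err := h.Write(&m); err != nil {
        panic(err)
    }
    for _, b := range m.GetHistogram().GetBucket() {
        fmt.Printf("le=%g count=%d\n", b.GetUpperBound(), b.GetCumulativeCount())
    }
}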


@@ -273,10 +273,11 @@ func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64
     }
     s.Lock()
-    if err := s.appendable(t, v); err != nil {
+    if delta, err := s.appendable(t, v); err != nil {
         s.Unlock()
         if err == storage.ErrOutOfOrderSample {
             a.head.metrics.outOfOrderSamples.Inc()
+            a.head.metrics.oooHistogram.Observe(float64(delta) / 1000)
         }
         return 0, err
     }
@@ -300,24 +301,23 @@ func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64
 }
 // appendable checks whether the given sample is valid for appending to the series.
-func (s *memSeries) appendable(t int64, v float64) error {
+func (s *memSeries) appendable(t int64, v float64) (int64, error) {
     c := s.head()
     if c == nil {
-        return nil
+        return 0, nil
     }
     if t > c.maxTime {
-        return nil
+        return 0, nil
     }
     if t < c.maxTime {
-        return storage.ErrOutOfOrderSample
+        return c.maxTime - t, storage.ErrOutOfOrderSample
     }
     // We are allowing exact duplicates as we can encounter them in valid cases
     // like federation and erroring out at that time would be extremely noisy.
     if math.Float64bits(s.sampleBuf[3].v) != math.Float64bits(v) {
-        return storage.ErrDuplicateSampleForTimestamp
+        return 0, storage.ErrDuplicateSampleForTimestamp
     }
-    return nil
+    return 0, nil
 }
 // AppendExemplar for headAppender assumes the series ref already exists, and so it doesn't
@@ -455,13 +455,14 @@ func (a *headAppender) Commit() (err error) {
     for i, s := range a.samples {
         series = a.sampleSeries[i]
         series.Lock()
-        ok, chunkCreated := series.append(s.T, s.V, a.appendID, a.head.chunkDiskMapper)
+        delta, ok, chunkCreated := series.append(s.T, s.V, a.appendID, a.head.chunkDiskMapper)
         series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
         series.pendingCommit = false
         series.Unlock()
         if !ok {
             total--
+            a.head.metrics.oooHistogram.Observe(float64(delta) / 1000)
             a.head.metrics.outOfOrderSamples.Inc()
         }
         if chunkCreated {
@@ -480,7 +481,7 @@ func (a *headAppender) Commit() (err error) {
 // the appendID for isolation. (The appendID can be zero, which results in no
 // isolation for this append.)
 // It is unsafe to call this concurrently with s.iterator(...) without holding the series lock.
-func (s *memSeries) append(t int64, v float64, appendID uint64, chunkDiskMapper *chunks.ChunkDiskMapper) (sampleInOrder, chunkCreated bool) {
+func (s *memSeries) append(t int64, v float64, appendID uint64, chunkDiskMapper *chunks.ChunkDiskMapper) (delta int64, sampleInOrder, chunkCreated bool) {
     // Based on Gorilla white papers this offers near-optimal compression ratio
     // so anything bigger that this has diminishing returns and increases
     // the time range within which we have to decompress all samples.
@@ -491,7 +492,7 @@ func (s *memSeries) append(t int64, v float64, appendID uint64, chunkDiskMapper
     if c == nil {
         if len(s.mmappedChunks) > 0 && s.mmappedChunks[len(s.mmappedChunks)-1].maxTime >= t {
             // Out of order sample. Sample timestamp is already in the mmapped chunks, so ignore it.
-            return false, false
+            return s.mmappedChunks[len(s.mmappedChunks)-1].maxTime - t, false, false
         }
         // There is no chunk in this series yet, create the first chunk for the sample.
         c = s.cutNewHeadChunk(t, chunkDiskMapper)
@@ -500,7 +501,7 @@ func (s *memSeries) append(t int64, v float64, appendID uint64, chunkDiskMapper
     // Out of order sample.
     if c.maxTime >= t {
-        return false, chunkCreated
+        return c.maxTime - t, false, chunkCreated
     }
     numSamples := c.chunk.NumSamples()
@@ -538,7 +539,7 @@ func (s *memSeries) append(t int64, v float64, appendID uint64, chunkDiskMapper
         s.txs.add(appendID)
     }
-    return true, chunkCreated
+    return 0, true, chunkCreated
 }
 // computeChunkEndTime estimates the end timestamp based the beginning of a
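
A note on units for the hunks above: t and c.maxTime are Unix timestamps in milliseconds, so the delta returned alongside storage.ErrOutOfOrderSample (and by memSeries.append for rejected samples) is a duration in milliseconds, which the callers divide by 1000 before feeding the seconds-based histogram. The sketch below only mirrors the shape of that contract; the helper name and example timestamp are hypothetical, not from the commit.

package main

import (
    "errors"
    "fmt"
)

// errOutOfOrderSample stands in for storage.ErrOutOfOrderSample.
var errOutOfOrderSample = errors.New("out of order sample")

// checkAppendable mirrors the new appendable contract: given the head chunk's
// maxTime and a sample timestamp (both Unix milliseconds), it reports how far
// behind the sample is, but only in the out-of-order case.
func checkAppendable(headMaxTimeMS, sampleTimeMS int64) (deltaMS int64, err error) {
    if sampleTimeMS >= headMaxTimeMS {
        return 0, nil // in order (duplicate handling omitted)
    }
    return headMaxTimeMS - sampleTimeMS, errOutOfOrderSample
}

func main() {
    const headMaxTime = int64(1700000000000) // arbitrary example timestamp (ms)
    delta, err := checkAppendable(headMaxTime, headMaxTime-2*3600*1000)
    if errors.Is(err, errOutOfOrderSample) {
        // The histogram buckets are in seconds, hence the division by 1000.
        fmt.Printf("observe %.0f seconds\n", float64(delta)/1000) // prints 7200
    }
}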


@@ -557,7 +557,7 @@ func TestMemSeries_truncateChunks(t *testing.T) {
     s := newMemSeries(lbls, 1, lbls.Hash(), 2000, 0, &memChunkPool, defaultIsolationDisabled)
     for i := 0; i < 4000; i += 5 {
-        ok, _ := s.append(int64(i), float64(i), 0, chunkDiskMapper)
+        _, ok, _ := s.append(int64(i), float64(i), 0, chunkDiskMapper)
         require.True(t, ok, "sample append failed")
     }
@@ -1098,19 +1098,19 @@ func TestMemSeries_append(t *testing.T) {
     // Add first two samples at the very end of a chunk range and the next two
     // on and after it.
     // New chunk must correctly be cut at 1000.
-    ok, chunkCreated := s.append(998, 1, 0, chunkDiskMapper)
+    _, ok, chunkCreated := s.append(998, 1, 0, chunkDiskMapper)
     require.True(t, ok, "append failed")
     require.True(t, chunkCreated, "first sample created chunk")
-    ok, chunkCreated = s.append(999, 2, 0, chunkDiskMapper)
+    _, ok, chunkCreated = s.append(999, 2, 0, chunkDiskMapper)
     require.True(t, ok, "append failed")
     require.False(t, chunkCreated, "second sample should use same chunk")
-    ok, chunkCreated = s.append(1000, 3, 0, chunkDiskMapper)
+    _, ok, chunkCreated = s.append(1000, 3, 0, chunkDiskMapper)
     require.True(t, ok, "append failed")
     require.True(t, chunkCreated, "expected new chunk on boundary")
-    ok, chunkCreated = s.append(1001, 4, 0, chunkDiskMapper)
+    _, ok, chunkCreated = s.append(1001, 4, 0, chunkDiskMapper)
     require.True(t, ok, "append failed")
     require.False(t, chunkCreated, "second sample should use same chunk")
@@ -1123,7 +1123,7 @@ func TestMemSeries_append(t *testing.T) {
     // Fill the range [1000,2000) with many samples. Intermediate chunks should be cut
     // at approximately 120 samples per chunk.
     for i := 1; i < 1000; i++ {
-        ok, _ := s.append(1001+int64(i), float64(i), 0, chunkDiskMapper)
+        _, ok, _ := s.append(1001+int64(i), float64(i), 0, chunkDiskMapper)
         require.True(t, ok, "append failed")
     }
@@ -1149,18 +1149,18 @@ func TestGCChunkAccess(t *testing.T) {
     s, _, _ := h.getOrCreate(1, labels.FromStrings("a", "1"))
     // Appending 2 samples for the first chunk.
-    ok, chunkCreated := s.append(0, 0, 0, h.chunkDiskMapper)
+    _, ok, chunkCreated := s.append(0, 0, 0, h.chunkDiskMapper)
     require.True(t, ok, "series append failed")
     require.True(t, chunkCreated, "chunks was not created")
-    ok, chunkCreated = s.append(999, 999, 0, h.chunkDiskMapper)
+    _, ok, chunkCreated = s.append(999, 999, 0, h.chunkDiskMapper)
     require.True(t, ok, "series append failed")
     require.False(t, chunkCreated, "chunks was created")
     // A new chunks should be created here as it's beyond the chunk range.
-    ok, chunkCreated = s.append(1000, 1000, 0, h.chunkDiskMapper)
+    _, ok, chunkCreated = s.append(1000, 1000, 0, h.chunkDiskMapper)
     require.True(t, ok, "series append failed")
     require.True(t, chunkCreated, "chunks was not created")
-    ok, chunkCreated = s.append(1999, 1999, 0, h.chunkDiskMapper)
+    _, ok, chunkCreated = s.append(1999, 1999, 0, h.chunkDiskMapper)
     require.True(t, ok, "series append failed")
     require.False(t, chunkCreated, "chunks was created")
@@ -1203,18 +1203,18 @@ func TestGCSeriesAccess(t *testing.T) {
     s, _, _ := h.getOrCreate(1, labels.FromStrings("a", "1"))
     // Appending 2 samples for the first chunk.
-    ok, chunkCreated := s.append(0, 0, 0, h.chunkDiskMapper)
+    _, ok, chunkCreated := s.append(0, 0, 0, h.chunkDiskMapper)
     require.True(t, ok, "series append failed")
     require.True(t, chunkCreated, "chunks was not created")
-    ok, chunkCreated = s.append(999, 999, 0, h.chunkDiskMapper)
+    _, ok, chunkCreated = s.append(999, 999, 0, h.chunkDiskMapper)
     require.True(t, ok, "series append failed")
     require.False(t, chunkCreated, "chunks was created")
     // A new chunks should be created here as it's beyond the chunk range.
-    ok, chunkCreated = s.append(1000, 1000, 0, h.chunkDiskMapper)
+    _, ok, chunkCreated = s.append(1000, 1000, 0, h.chunkDiskMapper)
     require.True(t, ok, "series append failed")
     require.True(t, chunkCreated, "chunks was not created")
-    ok, chunkCreated = s.append(1999, 1999, 0, h.chunkDiskMapper)
+    _, ok, chunkCreated = s.append(1999, 1999, 0, h.chunkDiskMapper)
     require.True(t, ok, "series append failed")
     require.False(t, chunkCreated, "chunks was created")
@@ -1459,10 +1459,10 @@ func TestHeadReadWriterRepair(t *testing.T) {
     require.True(t, created, "series was not created")
     for i := 0; i < 7; i++ {
-        ok, chunkCreated := s.append(int64(i*chunkRange), float64(i*chunkRange), 0, h.chunkDiskMapper)
+        _, ok, chunkCreated := s.append(int64(i*chunkRange), float64(i*chunkRange), 0, h.chunkDiskMapper)
         require.True(t, ok, "series append failed")
         require.True(t, chunkCreated, "chunk was not created")
-        ok, chunkCreated = s.append(int64(i*chunkRange)+chunkRange-1, float64(i*chunkRange), 0, h.chunkDiskMapper)
+        _, ok, chunkCreated = s.append(int64(i*chunkRange)+chunkRange-1, float64(i*chunkRange), 0, h.chunkDiskMapper)
         require.True(t, ok, "series append failed")
         require.False(t, chunkCreated, "chunk was created")
         require.NoError(t, h.chunkDiskMapper.CutNewFile())
@@ -1808,7 +1808,7 @@ func TestIsolationAppendIDZeroIsNoop(t *testing.T) {
     s, _, _ := h.getOrCreate(1, labels.FromStrings("a", "1"))
-    ok, _ := s.append(0, 0, 0, h.chunkDiskMapper)
+    _, ok, _ := s.append(0, 0, 0, h.chunkDiskMapper)
     require.True(t, ok, "Series append failed.")
     require.Equal(t, 0, s.txs.txIDCount, "Series should not have an appendID after append with appendID=0.")
 }
@@ -2347,7 +2347,7 @@ func TestMemSafeIteratorSeekIntoBuffer(t *testing.T) {
     s := newMemSeries(lbls, 1, lbls.Hash(), 500, 0, nil, defaultIsolationDisabled)
     for i := 0; i < 7; i++ {
-        ok, _ := s.append(int64(i), float64(i), 0, chunkDiskMapper)
+        _, ok, _ := s.append(int64(i), float64(i), 0, chunkDiskMapper)
         require.True(t, ok, "sample append failed")
     }


@@ -412,7 +412,7 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head) (unknownRefs uint64) {
             if s.T <= ms.mmMaxTime {
                 continue
             }
-            if _, chunkCreated := ms.append(s.T, s.V, 0, h.chunkDiskMapper); chunkCreated {
+            if _, _, chunkCreated := ms.append(s.T, s.V, 0, h.chunkDiskMapper); chunkCreated {
                 h.metrics.chunksCreated.Inc()
                 h.metrics.chunks.Inc()
             }