From 05e411a8ebbeb4c0882a05c00372e67f6d340fc7 Mon Sep 17 00:00:00 2001 From: Fabian Reinartz Date: Wed, 7 Jun 2017 13:42:53 +0200 Subject: [PATCH] Improve heuristic to spread chunks across block --- cmd/tsdb/main.go | 8 ++++--- compact.go | 3 ++- head.go | 58 ++++++++++++++++++++++++++++++++++-------------- head_test.go | 42 +++++++++++++++++++++++++++++++++++ 4 files changed, 90 insertions(+), 21 deletions(-) diff --git a/cmd/tsdb/main.go b/cmd/tsdb/main.go index a12dc675d..97280ffe4 100644 --- a/cmd/tsdb/main.go +++ b/cmd/tsdb/main.go @@ -113,7 +113,7 @@ func (b *writeBenchmark) run(cmd *cobra.Command, args []string) { st, err := tsdb.Open(dir, nil, nil, &tsdb.Options{ WALFlushInterval: 200 * time.Millisecond, RetentionDuration: 2 * 24 * 60 * 60 * 1000, // 1 days in milliseconds - MinBlockDuration: 3 * 60 * 60 * 1000, // 2 hours in milliseconds + MinBlockDuration: 3 * 60 * 60 * 1000, // 3 hours in milliseconds MaxBlockDuration: 27 * 60 * 60 * 1000, // 1 days in milliseconds }) if err != nil { @@ -157,6 +157,8 @@ func (b *writeBenchmark) run(cmd *cobra.Command, args []string) { }) } +const timeDelta = 30000 + func (b *writeBenchmark) ingestScrapes(lbls []labels.Labels, scrapeCount int) (uint64, error) { var mu sync.Mutex var total uint64 @@ -174,7 +176,7 @@ func (b *writeBenchmark) ingestScrapes(lbls []labels.Labels, scrapeCount int) (u wg.Add(1) go func() { - n, err := b.ingestScrapesShard(batch, 100, int64(30000*i)) + n, err := b.ingestScrapesShard(batch, 100, int64(timeDelta*i)) if err != nil { // exitWithError(err) fmt.Println(" err", err) @@ -212,7 +214,7 @@ func (b *writeBenchmark) ingestScrapesShard(metrics []labels.Labels, scrapeCount for i := 0; i < scrapeCount; i++ { app := b.storage.Appender() - ts += int64(30000) + ts += timeDelta for _, s := range scrape { s.value += 1000 diff --git a/compact.go b/compact.go index ff694e33a..ca646a084 100644 --- a/compact.go +++ b/compact.go @@ -314,6 +314,8 @@ func populateBlock(blocks []Block, indexw IndexWriter, 
chunkw ChunkWriter) (*Blo var metas []BlockMeta for i, b := range blocks { + metas = append(metas, b.Meta()) + all, err := b.Index().Postings("", "") if err != nil { return nil, err @@ -328,7 +330,6 @@ func populateBlock(blocks []Block, indexw IndexWriter, chunkw ChunkWriter) (*Blo if err != nil { return nil, err } - metas = append(metas, b.Meta()) } // We fully rebuild the postings list index from merged series. diff --git a/head.go b/head.go index 8bf0762bb..db4be4c2f 100644 --- a/head.go +++ b/head.go @@ -719,12 +719,7 @@ func (h *HeadBlock) get(hash uint64, lset labels.Labels) *memSeries { } func (h *HeadBlock) create(hash uint64, lset labels.Labels) *memSeries { - s := &memSeries{ - lset: lset, - ref: uint32(len(h.series)), - } - // create the initial chunk and appender - s.cut() + s := newMemSeries(lset, uint32(len(h.series)), h.meta.MaxTime) // Allocate empty space until we can insert at the given index. h.series = append(h.series, s) @@ -759,15 +754,18 @@ type memSeries struct { lset labels.Labels chunks []*memChunk + nextAt int64 // timestamp at which to cut the next chunk. + maxt int64 // maximum timestamp for the series. lastValue float64 sampleBuf [4]sample app chunks.Appender // Current appender for the chunk. 
} -func (s *memSeries) cut() *memChunk { +func (s *memSeries) cut(mint int64) *memChunk { c := &memChunk{ chunk: chunks.NewXORChunk(), + minTime: mint, maxTime: math.MinInt64, } s.chunks = append(s.chunks, c) @@ -776,32 +774,47 @@ if err != nil { panic(err) } - s.app = app return c } +func newMemSeries(lset labels.Labels, id uint32, maxt int64) *memSeries { + s := &memSeries{ + lset: lset, + ref: id, + maxt: maxt, + nextAt: math.MinInt64, + } + return s +} + func (s *memSeries) append(t int64, v float64) bool { + const samplesPerChunk = 120 + s.mtx.Lock() defer s.mtx.Unlock() var c *memChunk - if s.head().samples > 130 { - c = s.cut() - c.minTime = t - } else { - c = s.head() - // Skip duplicate and out of order samples. - if c.maxTime >= t { - return false - } + if len(s.chunks) == 0 { + c = s.cut(t) + } + c = s.head() + if c.maxTime >= t { + return false + } + if c.samples > samplesPerChunk/4 && t >= s.nextAt { + c = s.cut(t) } s.app.Append(t, v) c.maxTime = t c.samples++ + if c.samples == samplesPerChunk/4 { + s.nextAt = computeChunkEndTime(c.minTime, c.maxTime, s.maxt) + } + s.lastValue = v s.sampleBuf[0] = s.sampleBuf[1] @@ -812,6 +825,17 @@ func (s *memSeries) append(t int64, v float64) bool { return true } +// computeChunkEndTime estimates the end timestamp based on the beginning of a chunk, +// its current timestamp and the upper bound up to which we insert data. +// It assumes that the time range is 1/4 full. 
+func computeChunkEndTime(start, cur, max int64) int64 { + a := (max - start) / ((cur - start + 1) * 4) + if a == 0 { + return max + } + return start + (max-start)/a +} + func (s *memSeries) iterator(i int) chunks.Iterator { c := s.chunks[i] diff --git a/head_test.go b/head_test.go index 0463d8430..421fe3992 100644 --- a/head_test.go +++ b/head_test.go @@ -731,3 +731,45 @@ Outer: return ds } + +func TestComputeChunkEndTime(t *testing.T) { + cases := []struct { + start, cur, max int64 + res int64 + }{ + { + start: 0, + cur: 250, + max: 1000, + res: 1000, + }, + { + start: 100, + cur: 200, + max: 1000, + res: 550, + }, + // Case where we fit floored 0 chunks. Must catch division by 0 + // and default to maximum time. + { + start: 0, + cur: 500, + max: 1000, + res: 1000, + }, + // Catch division by zero for cur == start. Strictly not a possible case. + { + start: 100, + cur: 100, + max: 1000, + res: 104, + }, + } + + for _, c := range cases { + got := computeChunkEndTime(c.start, c.cur, c.max) + if got != c.res { + t.Errorf("expected %d for (start: %d, cur: %d, max: %d), got %d", c.res, c.start, c.cur, c.max, got) + } + } +}