From c065fa6957bd3196f237c69c9471168be02b2e99 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bartek=20P=C5=82otka?= <bwplotka@gmail.com>
Date: Fri, 11 Jan 2019 17:31:26 +0000
Subject: [PATCH 01/23] Exposed helper methods for reading index bytes. (#492)

Changes:
* Make `NewReader` method useful. It was impossible to use it, because closer was always nil.
* ReadSymbols, TOC and ReadOffsetTable are not public functions (used by Thanos).
* decbufXXX are now functions.
* More verbose errors.
* Removed unused crc32 field.
* Some var name changes to make it more verbose:
  * symbols -> allocatedSymbols
  * symbolsSlice -> symbolsV1
  * symbols -> symbolsV2
  *
* Pre-calculate symbolsTableSize.
* Initialized symbols for Symbols() method with valid length.
* Added test for Symbol method.
* Made Decoder LookupSymbol method public. Kept Decode public as it is useful as helper from index package.

Signed-off-by: Bartek Plotka <bwplotka@gmail.com>
---
 index/encoding_helpers.go |  56 +++++++
 index/index.go            | 309 ++++++++++++++++----------------------
 index/index_test.go       |  21 ++-
 3 files changed, 203 insertions(+), 183 deletions(-)

diff --git a/index/encoding_helpers.go b/index/encoding_helpers.go
index 602498f115..9104f1cb5f 100644
--- a/index/encoding_helpers.go
+++ b/index/encoding_helpers.go
@@ -18,6 +18,8 @@ import (
 	"hash"
 	"hash/crc32"
 	"unsafe"
+
+	"github.com/pkg/errors"
 )
 
 // enbuf is a helper type to populate a byte slice with various types.
@@ -86,6 +88,60 @@ type decbuf struct {
 	e error
 }
 
+// newDecbufAt returns a new decoding buffer. It expects the first 4 bytes
+// after offset to hold the big endian encoded content length, followed by the contents and the expected
+// checksum.
+func newDecbufAt(bs ByteSlice, off int) decbuf {
+	if bs.Len() < off+4 {
+		return decbuf{e: errInvalidSize}
+	}
+	b := bs.Range(off, off+4)
+	l := int(binary.BigEndian.Uint32(b))
+
+	if bs.Len() < off+4+l+4 {
+		return decbuf{e: errInvalidSize}
+	}
+
+	// Load bytes holding the contents plus a CRC32 checksum.
+	b = bs.Range(off+4, off+4+l+4)
+	dec := decbuf{b: b[:len(b)-4]}
+
+	if exp := binary.BigEndian.Uint32(b[len(b)-4:]); dec.crc32() != exp {
+		return decbuf{e: errInvalidChecksum}
+	}
+	return dec
+}
+
+// decbufUvarintAt returns a new decoding buffer. It expects the first bytes
+// after offset to hold the uvarint-encoded buffers length, followed by the contents and the expected
+// checksum.
+func newDecbufUvarintAt(bs ByteSlice, off int) decbuf {
+	// We never have to access this method at the far end of the byte slice. Thus just checking
+	// against the MaxVarintLen32 is sufficient.
+	if bs.Len() < off+binary.MaxVarintLen32 {
+		return decbuf{e: errInvalidSize}
+	}
+	b := bs.Range(off, off+binary.MaxVarintLen32)
+
+	l, n := binary.Uvarint(b)
+	if n <= 0 || n > binary.MaxVarintLen32 {
+		return decbuf{e: errors.Errorf("invalid uvarint %d", n)}
+	}
+
+	if bs.Len() < off+n+int(l)+4 {
+		return decbuf{e: errInvalidSize}
+	}
+
+	// Load bytes holding the contents plus a CRC32 checksum.
+	b = bs.Range(off+n, off+n+int(l)+4)
+	dec := decbuf{b: b[:len(b)-4]}
+
+	if dec.crc32() != binary.BigEndian.Uint32(b[len(b)-4:]) {
+		return decbuf{e: errInvalidChecksum}
+	}
+	return dec
+}
+
 func (d *decbuf) uvarint() int      { return int(d.uvarint64()) }
 func (d *decbuf) uvarint32() uint32 { return uint32(d.uvarint64()) }
 func (d *decbuf) be32int() int      { return int(d.be32()) }
diff --git a/index/index.go b/index/index.go
index 6413a9fca3..cce29d9b18 100644
--- a/index/index.go
+++ b/index/index.go
@@ -20,6 +20,7 @@ import (
 	"hash"
 	"hash/crc32"
 	"io"
+	"io/ioutil"
 	"math"
 	"os"
 	"path/filepath"
@@ -35,9 +36,13 @@ import (
 const (
 	// MagicIndex 4 bytes at the head of an index file.
 	MagicIndex = 0xBAAAD700
+	// HeaderLen represents number of bytes reserved of index for header.
+	HeaderLen = 5
 
-	indexFormatV1 = 1
-	indexFormatV2 = 2
+	// FormatV1 represents 1 version of index.
+	FormatV1 = 1
+	// FormatV2 represents 2 version of index.
+	FormatV2 = 2
 
 	labelNameSeperator = "\xff"
 )
@@ -108,7 +113,7 @@ type Writer struct {
 	fbuf *bufio.Writer
 	pos  uint64
 
-	toc   indexTOC
+	toc   TOC
 	stage indexWriterStage
 
 	// Reusable memory.
@@ -129,13 +134,42 @@ type Writer struct {
 	Version int
 }
 
-type indexTOC struct {
-	symbols           uint64
-	series            uint64
-	labelIndices      uint64
-	labelIndicesTable uint64
-	postings          uint64
-	postingsTable     uint64
+// TOC represents index Table Of Content that states where each section of index starts.
+type TOC struct {
+	Symbols           uint64
+	Series            uint64
+	LabelIndices      uint64
+	LabelIndicesTable uint64
+	Postings          uint64
+	PostingsTable     uint64
+}
+
+// NewTOCFromByteSlice return parsed TOC from given index byte slice.
+func NewTOCFromByteSlice(bs ByteSlice) (*TOC, error) {
+	if bs.Len() < indexTOCLen {
+		return nil, errInvalidSize
+	}
+	b := bs.Range(bs.Len()-indexTOCLen, bs.Len())
+
+	expCRC := binary.BigEndian.Uint32(b[len(b)-4:])
+	d := decbuf{b: b[:len(b)-4]}
+
+	if d.crc32() != expCRC {
+		return nil, errors.Wrap(errInvalidChecksum, "read TOC")
+	}
+
+	if err := d.err(); err != nil {
+		return nil, err
+	}
+
+	return &TOC{
+		Symbols:           d.be64(),
+		Series:            d.be64(),
+		LabelIndices:      d.be64(),
+		LabelIndicesTable: d.be64(),
+		Postings:          d.be64(),
+		PostingsTable:     d.be64(),
+	}, nil
 }
 
 // NewWriter returns a new Writer to the given filename. It serializes data in format version 2.
@@ -223,22 +257,22 @@ func (w *Writer) ensureStage(s indexWriterStage) error {
 	// Mark start of sections in table of contents.
 	switch s {
 	case idxStageSymbols:
-		w.toc.symbols = w.pos
+		w.toc.Symbols = w.pos
 	case idxStageSeries:
-		w.toc.series = w.pos
+		w.toc.Series = w.pos
 
 	case idxStageLabelIndex:
-		w.toc.labelIndices = w.pos
+		w.toc.LabelIndices = w.pos
 
 	case idxStagePostings:
-		w.toc.postings = w.pos
+		w.toc.Postings = w.pos
 
 	case idxStageDone:
-		w.toc.labelIndicesTable = w.pos
+		w.toc.LabelIndicesTable = w.pos
 		if err := w.writeOffsetTable(w.labelIndexes); err != nil {
 			return err
 		}
-		w.toc.postingsTable = w.pos
+		w.toc.PostingsTable = w.pos
 		if err := w.writeOffsetTable(w.postings); err != nil {
 			return err
 		}
@@ -254,7 +288,7 @@ func (w *Writer) ensureStage(s indexWriterStage) error {
 func (w *Writer) writeMeta() error {
 	w.buf1.reset()
 	w.buf1.putBE32(MagicIndex)
-	w.buf1.putByte(indexFormatV2)
+	w.buf1.putByte(FormatV2)
 
 	return w.write(w.buf1.get())
 }
@@ -346,8 +380,6 @@ func (w *Writer) AddSymbols(sym map[string]struct{}) error {
 	}
 	sort.Strings(symbols)
 
-	const headerSize = 4
-
 	w.buf1.reset()
 	w.buf2.reset()
 
@@ -438,12 +470,12 @@ const indexTOCLen = 6*8 + 4
 func (w *Writer) writeTOC() error {
 	w.buf1.reset()
 
-	w.buf1.putBE64(w.toc.symbols)
-	w.buf1.putBE64(w.toc.series)
-	w.buf1.putBE64(w.toc.labelIndices)
-	w.buf1.putBE64(w.toc.labelIndicesTable)
-	w.buf1.putBE64(w.toc.postings)
-	w.buf1.putBE64(w.toc.postingsTable)
+	w.buf1.putBE64(w.toc.Symbols)
+	w.buf1.putBE64(w.toc.Series)
+	w.buf1.putBE64(w.toc.LabelIndices)
+	w.buf1.putBE64(w.toc.LabelIndicesTable)
+	w.buf1.putBE64(w.toc.Postings)
+	w.buf1.putBE64(w.toc.PostingsTable)
 
 	w.buf1.putHash(w.crc32)
 
@@ -535,15 +567,14 @@ type StringTuples interface {
 }
 
 type Reader struct {
-	// The underlying byte slice holding the encoded series data.
-	b   ByteSlice
-	toc indexTOC
+	b ByteSlice
 
 	// Close that releases the underlying resources of the byte slice.
 	c io.Closer
 
 	// Cached hashmaps of section offsets.
-	labels   map[string]uint64
+	labels map[string]uint64
+	// LabelName to LabelValue to offset map.
 	postings map[string]map[string]uint64
 	// Cache of read symbols. Strings that are returned when reading from the
 	// block are always backed by true strings held in here rather than
@@ -551,19 +582,17 @@ type Reader struct {
 	// prevents memory faults when applications work with read symbols after
 	// the block has been unmapped. The older format has sparse indexes so a map
 	// must be used, but the new format is not so we can use a slice.
-	symbols     map[uint32]string
-	symbolSlice []string
+	symbolsV1        map[uint32]string
+	symbolsV2        []string
+	symbolsTableSize uint64
 
 	dec *Decoder
 
-	crc32 hash.Hash32
-
 	version int
 }
 
 var (
 	errInvalidSize     = fmt.Errorf("invalid size")
-	errInvalidFlag     = fmt.Errorf("invalid flag")
 	errInvalidChecksum = fmt.Errorf("invalid checksum")
 )
 
@@ -587,10 +616,10 @@ func (b realByteSlice) Sub(start, end int) ByteSlice {
 	return b[start:end]
 }
 
-// NewReader returns a new IndexReader on the given byte slice. It automatically
+// NewReader returns a new index reader on the given byte slice. It automatically
 // handles different format versions.
 func NewReader(b ByteSlice) (*Reader, error) {
-	return newReader(b, nil)
+	return newReader(b, ioutil.NopCloser(nil))
 }
 
 // NewFileReader returns a new index reader against the given index file.
@@ -606,14 +635,12 @@ func newReader(b ByteSlice, c io.Closer) (*Reader, error) {
 	r := &Reader{
 		b:        b,
 		c:        c,
-		symbols:  map[uint32]string{},
 		labels:   map[string]uint64{},
 		postings: map[string]map[string]uint64{},
-		crc32:    newCRC32(),
 	}
 
 	// Verify header.
-	if b.Len() < 5 {
+	if r.b.Len() < HeaderLen {
 		return nil, errors.Wrap(errInvalidSize, "index header")
 	}
 	if m := binary.BigEndian.Uint32(r.b.Range(0, 4)); m != MagicIndex {
@@ -621,54 +648,59 @@ func newReader(b ByteSlice, c io.Closer) (*Reader, error) {
 	}
 	r.version = int(r.b.Range(4, 5)[0])
 
-	if r.version != 1 && r.version != 2 {
+	if r.version != FormatV1 && r.version != FormatV2 {
 		return nil, errors.Errorf("unknown index file version %d", r.version)
 	}
 
-	if err := r.readTOC(); err != nil {
+	toc, err := NewTOCFromByteSlice(b)
+	if err != nil {
 		return nil, errors.Wrap(err, "read TOC")
 	}
-	if err := r.readSymbols(int(r.toc.symbols)); err != nil {
+
+	r.symbolsV2, r.symbolsV1, err = ReadSymbols(r.b, r.version, int(toc.Symbols))
+	if err != nil {
 		return nil, errors.Wrap(err, "read symbols")
 	}
-	var err error
 
 	// Use the strings already allocated by symbols, rather than
 	// re-allocating them again below.
-	symbols := make(map[string]string, len(r.symbols)+len(r.symbolSlice))
-	for _, s := range r.symbols {
-		symbols[s] = s
+	// Additionally, calculate symbolsTableSize.
+	allocatedSymbols := make(map[string]string, len(r.symbolsV1)+len(r.symbolsV2))
+	for _, s := range r.symbolsV1 {
+		r.symbolsTableSize += uint64(len(s) + 8)
+		allocatedSymbols[s] = s
 	}
-	for _, s := range r.symbolSlice {
-		symbols[s] = s
+	for _, s := range r.symbolsV2 {
+		r.symbolsTableSize += uint64(len(s) + 8)
+		allocatedSymbols[s] = s
 	}
 
-	err = r.readOffsetTable(r.toc.labelIndicesTable, func(key []string, off uint64) error {
+	if err := ReadOffsetTable(r.b, toc.LabelIndicesTable, func(key []string, off uint64) error {
 		if len(key) != 1 {
-			return errors.Errorf("unexpected key length %d", len(key))
+			return errors.Errorf("unexpected key length for label indices table %d", len(key))
 		}
-		r.labels[symbols[key[0]]] = off
+
+		r.labels[allocatedSymbols[key[0]]] = off
 		return nil
-	})
-	if err != nil {
+	}); err != nil {
 		return nil, errors.Wrap(err, "read label index table")
 	}
+
 	r.postings[""] = map[string]uint64{}
-	err = r.readOffsetTable(r.toc.postingsTable, func(key []string, off uint64) error {
+	if err := ReadOffsetTable(r.b, toc.PostingsTable, func(key []string, off uint64) error {
 		if len(key) != 2 {
-			return errors.Errorf("unexpected key length %d", len(key))
+			return errors.Errorf("unexpected key length for posting table %d", len(key))
 		}
 		if _, ok := r.postings[key[0]]; !ok {
-			r.postings[symbols[key[0]]] = map[string]uint64{}
+			r.postings[allocatedSymbols[key[0]]] = map[string]uint64{}
 		}
-		r.postings[key[0]][symbols[key[1]]] = off
+		r.postings[key[0]][allocatedSymbols[key[1]]] = off
 		return nil
-	})
-	if err != nil {
+	}); err != nil {
 		return nil, errors.Wrap(err, "read postings table")
 	}
 
-	r.dec = &Decoder{lookupSymbol: r.lookupSymbol}
+	r.dec = &Decoder{LookupSymbol: r.lookupSymbol}
 
 	return r, nil
 }
@@ -690,7 +722,7 @@ func (r *Reader) PostingsRanges() (map[labels.Label]Range, error) {
 
 	for k, e := range r.postings {
 		for v, start := range e {
-			d := r.decbufAt(int(start))
+			d := newDecbufAt(r.b, int(start))
 			if d.err() != nil {
 				return nil, d.err()
 			}
@@ -703,121 +735,45 @@ func (r *Reader) PostingsRanges() (map[labels.Label]Range, error) {
 	return m, nil
 }
 
-func (r *Reader) readTOC() error {
-	if r.b.Len() < indexTOCLen {
-		return errInvalidSize
-	}
-	b := r.b.Range(r.b.Len()-indexTOCLen, r.b.Len())
-
-	expCRC := binary.BigEndian.Uint32(b[len(b)-4:])
-	d := decbuf{b: b[:len(b)-4]}
-
-	if d.crc32() != expCRC {
-		return errors.Wrap(errInvalidChecksum, "read TOC")
-	}
-
-	r.toc.symbols = d.be64()
-	r.toc.series = d.be64()
-	r.toc.labelIndices = d.be64()
-	r.toc.labelIndicesTable = d.be64()
-	r.toc.postings = d.be64()
-	r.toc.postingsTable = d.be64()
-
-	return d.err()
-}
-
-// decbufAt returns a new decoding buffer. It expects the first 4 bytes
-// after offset to hold the big endian encoded content length, followed by the contents and the expected
-// checksum.
-func (r *Reader) decbufAt(off int) decbuf {
-	if r.b.Len() < off+4 {
-		return decbuf{e: errInvalidSize}
-	}
-	b := r.b.Range(off, off+4)
-	l := int(binary.BigEndian.Uint32(b))
-
-	if r.b.Len() < off+4+l+4 {
-		return decbuf{e: errInvalidSize}
-	}
-
-	// Load bytes holding the contents plus a CRC32 checksum.
-	b = r.b.Range(off+4, off+4+l+4)
-	dec := decbuf{b: b[:len(b)-4]}
-
-	if exp := binary.BigEndian.Uint32(b[len(b)-4:]); dec.crc32() != exp {
-		return decbuf{e: errInvalidChecksum}
-	}
-	return dec
-}
-
-// decbufUvarintAt returns a new decoding buffer. It expects the first bytes
-// after offset to hold the uvarint-encoded buffers length, followed by the contents and the expected
-// checksum.
-func (r *Reader) decbufUvarintAt(off int) decbuf {
-	// We never have to access this method at the far end of the byte slice. Thus just checking
-	// against the MaxVarintLen32 is sufficient.
-	if r.b.Len() < off+binary.MaxVarintLen32 {
-		return decbuf{e: errInvalidSize}
-	}
-	b := r.b.Range(off, off+binary.MaxVarintLen32)
-
-	l, n := binary.Uvarint(b)
-	if n <= 0 || n > binary.MaxVarintLen32 {
-		return decbuf{e: errors.Errorf("invalid uvarint %d", n)}
-	}
-
-	if r.b.Len() < off+n+int(l)+4 {
-		return decbuf{e: errInvalidSize}
-	}
-
-	// Load bytes holding the contents plus a CRC32 checksum.
-	b = r.b.Range(off+n, off+n+int(l)+4)
-	dec := decbuf{b: b[:len(b)-4]}
-
-	if dec.crc32() != binary.BigEndian.Uint32(b[len(b)-4:]) {
-		return decbuf{e: errInvalidChecksum}
-	}
-	return dec
-}
-
-// readSymbols reads the symbol table fully into memory and allocates proper strings for them.
+// ReadSymbols reads the symbol table fully into memory and allocates proper strings for them.
 // Strings backed by the mmap'd memory would cause memory faults if applications keep using them
 // after the reader is closed.
-func (r *Reader) readSymbols(off int) error {
+func ReadSymbols(bs ByteSlice, version int, off int) ([]string, map[uint32]string, error) {
 	if off == 0 {
-		return nil
+		return nil, nil, nil
 	}
-	d := r.decbufAt(off)
+	d := newDecbufAt(bs, off)
 
 	var (
-		origLen = d.len()
-		cnt     = d.be32int()
-		basePos = uint32(off) + 4
-		nextPos = basePos + uint32(origLen-d.len())
+		origLen     = d.len()
+		cnt         = d.be32int()
+		basePos     = uint32(off) + 4
+		nextPos     = basePos + uint32(origLen-d.len())
+		symbolSlice []string
+		symbols     = map[uint32]string{}
 	)
-	if r.version == 2 {
-		r.symbolSlice = make([]string, 0, cnt)
+	if version == 2 {
+		symbolSlice = make([]string, 0, cnt)
 	}
 
 	for d.err() == nil && d.len() > 0 && cnt > 0 {
 		s := d.uvarintStr()
 
-		if r.version == 2 {
-			r.symbolSlice = append(r.symbolSlice, s)
+		if version == FormatV2 {
+			symbolSlice = append(symbolSlice, s)
 		} else {
-			r.symbols[nextPos] = s
+			symbols[nextPos] = s
 			nextPos = basePos + uint32(origLen-d.len())
 		}
 		cnt--
 	}
-	return errors.Wrap(d.err(), "read symbols")
+	return symbolSlice, symbols, errors.Wrap(d.err(), "read symbols")
 }
 
-// readOffsetTable reads an offset table at the given position calls f for each
-// found entry.f
-// If f returns an error it stops decoding and returns the received error,
-func (r *Reader) readOffsetTable(off uint64, f func([]string, uint64) error) error {
-	d := r.decbufAt(int(off))
+// ReadOffsetTable reads an offset table and at the given position calls f for each
+// found entry. If f returns an error it stops decoding and returns the received error.
+func ReadOffsetTable(bs ByteSlice, off uint64, f func([]string, uint64) error) error {
+	d := newDecbufAt(bs, int(off))
 	cnt := d.be32()
 
 	for d.err() == nil && d.len() > 0 && cnt > 0 {
@@ -845,10 +801,10 @@ func (r *Reader) Close() error {
 }
 
 func (r *Reader) lookupSymbol(o uint32) (string, error) {
-	if int(o) < len(r.symbolSlice) {
-		return r.symbolSlice[o], nil
+	if int(o) < len(r.symbolsV2) {
+		return r.symbolsV2[o], nil
 	}
-	s, ok := r.symbols[o]
+	s, ok := r.symbolsV1[o]
 	if !ok {
 		return "", errors.Errorf("unknown symbol offset %d", o)
 	}
@@ -857,27 +813,20 @@ func (r *Reader) lookupSymbol(o uint32) (string, error) {
 
 // Symbols returns a set of symbols that exist within the index.
 func (r *Reader) Symbols() (map[string]struct{}, error) {
-	res := make(map[string]struct{}, len(r.symbols))
+	res := make(map[string]struct{}, len(r.symbolsV1)+len(r.symbolsV2))
 
-	for _, s := range r.symbols {
+	for _, s := range r.symbolsV1 {
 		res[s] = struct{}{}
 	}
-	for _, s := range r.symbolSlice {
+	for _, s := range r.symbolsV2 {
 		res[s] = struct{}{}
 	}
 	return res, nil
 }
 
-// SymbolTableSize returns the symbol table that is used to resolve symbol references.
+// SymbolTableSize returns the symbol table size in bytes.
 func (r *Reader) SymbolTableSize() uint64 {
-	var size int
-	for _, s := range r.symbols {
-		size += len(s) + 8
-	}
-	for _, s := range r.symbolSlice {
-		size += len(s) + 8
-	}
-	return uint64(size)
+	return r.symbolsTableSize
 }
 
 // LabelValues returns value tuples that exist for the given label name tuples.
@@ -892,7 +841,7 @@ func (r *Reader) LabelValues(names ...string) (StringTuples, error) {
 		//return nil, fmt.Errorf("label index doesn't exist")
 	}
 
-	d := r.decbufAt(int(off))
+	d := newDecbufAt(r.b, int(off))
 
 	nc := d.be32int()
 	d.be32() // consume unused value entry count.
@@ -916,7 +865,7 @@ func (emptyStringTuples) Len() int                   { return 0 }
 // LabelIndices returns a slice of label names for which labels or label tuples value indices exist.
 // NOTE: This is deprecated. Use `LabelNames()` instead.
 func (r *Reader) LabelIndices() ([][]string, error) {
-	res := [][]string{}
+	var res [][]string
 	for s := range r.labels {
 		res = append(res, strings.Split(s, labelNameSeperator))
 	}
@@ -928,10 +877,10 @@ func (r *Reader) Series(id uint64, lbls *labels.Labels, chks *[]chunks.Meta) err
 	offset := id
 	// In version 2 series IDs are no longer exact references but series are 16-byte padded
 	// and the ID is the multiple of 16 of the actual position.
-	if r.version == 2 {
+	if r.version == FormatV2 {
 		offset = id * 16
 	}
-	d := r.decbufUvarintAt(int(offset))
+	d := newDecbufUvarintAt(r.b, int(offset))
 	if d.err() != nil {
 		return d.err()
 	}
@@ -948,7 +897,7 @@ func (r *Reader) Postings(name, value string) (Postings, error) {
 	if !ok {
 		return EmptyPostings(), nil
 	}
-	d := r.decbufAt(int(off))
+	d := newDecbufAt(r.b, int(off))
 	if d.err() != nil {
 		return nil, errors.Wrap(d.err(), "get postings entry")
 	}
@@ -1062,7 +1011,7 @@ func (t *serializedStringTuples) At(i int) ([]string, error) {
 // It currently does not contain decoding methods for all entry types but can be extended
 // by them if there's demand.
 type Decoder struct {
-	lookupSymbol func(uint32) (string, error)
+	LookupSymbol func(uint32) (string, error)
 }
 
 // Postings returns a postings list for b and its number of elements.
@@ -1090,11 +1039,11 @@ func (dec *Decoder) Series(b []byte, lbls *labels.Labels, chks *[]chunks.Meta) e
 			return errors.Wrap(d.err(), "read series label offsets")
 		}
 
-		ln, err := dec.lookupSymbol(lno)
+		ln, err := dec.LookupSymbol(lno)
 		if err != nil {
 			return errors.Wrap(err, "lookup label name")
 		}
-		lv, err := dec.lookupSymbol(lvo)
+		lv, err := dec.LookupSymbol(lvo)
 		if err != nil {
 			return errors.Wrap(err, "lookup label value")
 		}
diff --git a/index/index_test.go b/index/index_test.go
index f915bca289..2edd3956a4 100644
--- a/index/index_test.go
+++ b/index/index_test.go
@@ -380,13 +380,28 @@ func TestPersistence_index_e2e(t *testing.T) {
 		}
 	}
 
+	gotSymbols, err := ir.Symbols()
+	testutil.Ok(t, err)
+
+	testutil.Equals(t, len(mi.symbols), len(gotSymbols))
+	for s := range mi.symbols {
+		_, ok := gotSymbols[s]
+		testutil.Assert(t, ok, "")
+	}
+
 	testutil.Ok(t, ir.Close())
 }
 
+func TestDecbufUvariantWithInvalidBuffer(t *testing.T) {
+	b := realByteSlice([]byte{0x81, 0x81, 0x81, 0x81, 0x81, 0x81})
+
+	db := newDecbufUvarintAt(b, 0)
+	testutil.NotOk(t, db.err())
+}
+
 func TestReaderWithInvalidBuffer(t *testing.T) {
 	b := realByteSlice([]byte{0x81, 0x81, 0x81, 0x81, 0x81, 0x81})
-	r := &Reader{b: b}
 
-	db := r.decbufUvarintAt(0)
-	testutil.NotOk(t, db.err())
+	_, err := NewReader(b)
+	testutil.NotOk(t, err)
 }

From a360aa3e86dc3ff67e6bd242abdb2429ec3ef5fb Mon Sep 17 00:00:00 2001
From: Ye Ben <ben.ye@daocloud.io>
Date: Mon, 14 Jan 2019 16:44:32 +0800
Subject: [PATCH 02/23] change variable name metrics to labels (#496)

Signed-off-by: yeya24 <ben.ye@daocloud.io>
---
 cmd/tsdb/main.go | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cmd/tsdb/main.go b/cmd/tsdb/main.go
index 01d85d590d..e319b490dd 100644
--- a/cmd/tsdb/main.go
+++ b/cmd/tsdb/main.go
@@ -138,7 +138,7 @@ func (b *writeBenchmark) run() {
 	}
 	b.storage = st
 
-	var metrics []labels.Labels
+	var labels []labels.Labels
 
 	measureTime("readData", func() {
 		f, err := os.Open(b.samplesFile)
@@ -147,7 +147,7 @@ func (b *writeBenchmark) run() {
 		}
 		defer f.Close()
 
-		metrics, err = readPrometheusLabels(f, b.numMetrics)
+		labels, err = readPrometheusLabels(f, b.numMetrics)
 		if err != nil {
 			exitWithError(err)
 		}
@@ -157,7 +157,7 @@ func (b *writeBenchmark) run() {
 
 	dur := measureTime("ingestScrapes", func() {
 		b.startProfiling()
-		total, err = b.ingestScrapes(metrics, 3000)
+		total, err = b.ingestScrapes(labels, 3000)
 		if err != nil {
 			exitWithError(err)
 		}
@@ -213,7 +213,7 @@ func (b *writeBenchmark) ingestScrapes(lbls []labels.Labels, scrapeCount int) (u
 	return total, nil
 }
 
-func (b *writeBenchmark) ingestScrapesShard(metrics []labels.Labels, scrapeCount int, baset int64) (uint64, error) {
+func (b *writeBenchmark) ingestScrapesShard(lbls []labels.Labels, scrapeCount int, baset int64) (uint64, error) {
 	ts := baset
 
 	type sample struct {
@@ -222,9 +222,9 @@ func (b *writeBenchmark) ingestScrapesShard(metrics []labels.Labels, scrapeCount
 		ref    *uint64
 	}
 
-	scrape := make([]*sample, 0, len(metrics))
+	scrape := make([]*sample, 0, len(lbls))
 
-	for _, m := range metrics {
+	for _, m := range lbls {
 		scrape = append(scrape, &sample{
 			labels: m,
 			value:  123456789,

From bff5aa4d21113b52b7894d5a72e658a07aa2a77c Mon Sep 17 00:00:00 2001
From: naivewong <867245430@qq.com>
Date: Mon, 14 Jan 2019 23:28:03 +0800
Subject: [PATCH 03/23] Missing the len of crc32 when calculating maxLen in
 WriteChunks (#494)

Signed-off-by: naivewong <867245430@qq.com>
---
 chunks/chunks.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/chunks/chunks.go b/chunks/chunks.go
index 5eab23982d..569aeddc24 100644
--- a/chunks/chunks.go
+++ b/chunks/chunks.go
@@ -205,6 +205,7 @@ func (w *Writer) WriteChunks(chks ...Meta) error {
 	for _, c := range chks {
 		maxLen += binary.MaxVarintLen32 + 1 // The number of bytes in the chunk and its encoding.
 		maxLen += int64(len(c.Chunk.Bytes()))
+		maxLen += 4 // The 4 bytes of crc32
 	}
 	newsz := w.n + maxLen
 

From ebf5d74325a5309a7bd53b654e575c1acf6b7fb1 Mon Sep 17 00:00:00 2001
From: mknapphrt <39998367+mknapphrt@users.noreply.github.com>
Date: Wed, 16 Jan 2019 05:03:52 -0500
Subject: [PATCH 04/23] Added storage size based retention method and new
 metrics (#343)

Added methods needed to retain data based on a byte limitation rather than time. Limitation is only applied if the flag is set (defaults to 0). Both blocks that are older than the retention period and the blocks that make the size of the storage too large are removed.

2 new metrics for keeping track of the size of the local storage folder and the amount of times data has been deleted because the size restriction was exceeded.
Signed-off-by: Mark Knapp <mknapp@hudson-trading.com>
---
 CHANGELOG.md       |   4 +
 block.go           |  38 +++++-
 block_test.go      |   4 +-
 chunks/chunks.go   |  24 ++--
 compact.go         |   2 +-
 db.go              | 286 ++++++++++++++++++++++++++++-----------------
 db_test.go         | 142 ++++++++++++++--------
 index/index.go     |   5 +
 querier_test.go    |   4 +-
 tombstones.go      |  36 ++++--
 tombstones_test.go |   2 +-
 11 files changed, 360 insertions(+), 187 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index af44615e7c..a44471681a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 ## master / unreleased
  - [CHANGE] New `WALSegmentSize` option to override the `DefaultOptions.WALSegmentSize`. Added to allow using smaller wal files. For example using tmpfs on a RPI to minimise the SD card wear out from the constant WAL writes. As part of this change the `DefaultOptions.WALSegmentSize` constant was also exposed.
+ - [FEATURE]  Size base retention through `Options.MaxBytes`.  As part of this change:
+    - added new metrics - `prometheus_tsdb_storage_blocks_bytes_total`, `prometheus_tsdb_size_retentions_total`, `prometheus_tsdb_time_retentions_total`
+    - new public interface `SizeReader: Size() int64`
+    - `OpenBlock` signature changed to take a logger.
  - [REMOVED] `PrefixMatcher` is considered unused so was removed.
  - [CLEANUP] `Options.WALFlushInterval` is removed as it wasn't used anywhere.
 
diff --git a/block.go b/block.go
index e5a66bd9e9..837f4a763d 100644
--- a/block.go
+++ b/block.go
@@ -21,6 +21,8 @@ import (
 	"path/filepath"
 	"sync"
 
+	"github.com/go-kit/kit/log"
+	"github.com/go-kit/kit/log/level"
 	"github.com/oklog/ulid"
 	"github.com/pkg/errors"
 	"github.com/prometheus/tsdb/chunkenc"
@@ -140,6 +142,12 @@ type Appendable interface {
 	Appender() Appender
 }
 
+// SizeReader returns the size of the object in bytes.
+type SizeReader interface {
+	// Size returns the size in bytes.
+	Size() int64
+}
+
 // BlockMeta provides meta information about a block.
 type BlockMeta struct {
 	// Unique identifier for the block and its contents. Changes on compaction.
@@ -166,6 +174,7 @@ type BlockStats struct {
 	NumSeries     uint64 `json:"numSeries,omitempty"`
 	NumChunks     uint64 `json:"numChunks,omitempty"`
 	NumTombstones uint64 `json:"numTombstones,omitempty"`
+	NumBytes      int64  `json:"numBytes,omitempty"`
 }
 
 // BlockDesc describes a block by ULID and time range.
@@ -257,7 +266,10 @@ type Block struct {
 
 // OpenBlock opens the block in the directory. It can be passed a chunk pool, which is used
 // to instantiate chunk structs.
-func OpenBlock(dir string, pool chunkenc.Pool) (*Block, error) {
+func OpenBlock(logger log.Logger, dir string, pool chunkenc.Pool) (*Block, error) {
+	if logger == nil {
+		logger = log.NewNopLogger()
+	}
 	meta, err := readMetaFile(dir)
 	if err != nil {
 		return nil, err
@@ -272,11 +284,20 @@ func OpenBlock(dir string, pool chunkenc.Pool) (*Block, error) {
 		return nil, err
 	}
 
-	tr, err := readTombstones(dir)
+	tr, tsr, err := readTombstones(dir)
 	if err != nil {
 		return nil, err
 	}
 
+	// TODO refactor to set this at block creation time as
+	// that would be the logical place for a block size to be calculated.
+	bs := blockSize(cr, ir, tsr)
+	meta.Stats.NumBytes = bs
+	err = writeMetaFile(dir, meta)
+	if err != nil {
+		level.Warn(logger).Log("msg", "couldn't write the meta file for the block size", "block", dir, "err", err)
+	}
+
 	pb := &Block{
 		dir:             dir,
 		meta:            *meta,
@@ -288,6 +309,16 @@ func OpenBlock(dir string, pool chunkenc.Pool) (*Block, error) {
 	return pb, nil
 }
 
+func blockSize(rr ...SizeReader) int64 {
+	var total int64
+	for _, r := range rr {
+		if r != nil {
+			total += r.Size()
+		}
+	}
+	return total
+}
+
 // Close closes the on-disk block. It blocks as long as there are readers reading from the block.
 func (pb *Block) Close() error {
 	pb.mtx.Lock()
@@ -315,6 +346,9 @@ func (pb *Block) Dir() string { return pb.dir }
 // Meta returns meta information about the block.
 func (pb *Block) Meta() BlockMeta { return pb.meta }
 
+// Size returns the number of bytes that the block takes up.
+func (pb *Block) Size() int64 { return pb.meta.Stats.NumBytes }
+
 // ErrClosing is returned when a block is in the process of being closed.
 var ErrClosing = errors.New("block is closing")
 
diff --git a/block_test.go b/block_test.go
index cf8716062e..789aebaa71 100644
--- a/block_test.go
+++ b/block_test.go
@@ -46,14 +46,14 @@ func TestSetCompactionFailed(t *testing.T) {
 	defer os.RemoveAll(tmpdir)
 
 	blockDir := createBlock(t, tmpdir, 0, 0, 0)
-	b, err := OpenBlock(blockDir, nil)
+	b, err := OpenBlock(nil, blockDir, nil)
 	testutil.Ok(t, err)
 	testutil.Equals(t, false, b.meta.Compaction.Failed)
 	testutil.Ok(t, b.setCompactionFailed())
 	testutil.Equals(t, true, b.meta.Compaction.Failed)
 	testutil.Ok(t, b.Close())
 
-	b, err = OpenBlock(blockDir, nil)
+	b, err = OpenBlock(nil, blockDir, nil)
 	testutil.Ok(t, err)
 	testutil.Equals(t, true, b.meta.Compaction.Failed)
 	testutil.Ok(t, b.Close())
diff --git a/chunks/chunks.go b/chunks/chunks.go
index 569aeddc24..8fb288384e 100644
--- a/chunks/chunks.go
+++ b/chunks/chunks.go
@@ -285,17 +285,15 @@ func (b realByteSlice) Sub(start, end int) ByteSlice {
 // Reader implements a SeriesReader for a serialized byte stream
 // of series data.
 type Reader struct {
-	// The underlying bytes holding the encoded series data.
-	bs []ByteSlice
-
-	// Closers for resources behind the byte slices.
-	cs []io.Closer
-
+	bs   []ByteSlice // The underlying bytes holding the encoded series data.
+	cs   []io.Closer // Closers for resources behind the byte slices.
+	size int64       // The total size of bytes in the reader.
 	pool chunkenc.Pool
 }
 
 func newReader(bs []ByteSlice, cs []io.Closer, pool chunkenc.Pool) (*Reader, error) {
 	cr := Reader{pool: pool, bs: bs, cs: cs}
+	var totalSize int64
 
 	for i, b := range cr.bs {
 		if b.Len() < 4 {
@@ -305,7 +303,9 @@ func newReader(bs []ByteSlice, cs []io.Closer, pool chunkenc.Pool) (*Reader, err
 		if m := binary.BigEndian.Uint32(b.Range(0, 4)); m != MagicChunks {
 			return nil, errors.Errorf("invalid magic number %x", m)
 		}
+		totalSize += int64(b.Len())
 	}
+	cr.size = totalSize
 	return &cr, nil
 }
 
@@ -328,9 +328,10 @@ func NewDirReader(dir string, pool chunkenc.Pool) (*Reader, error) {
 		pool = chunkenc.NewPool()
 	}
 
-	var bs []ByteSlice
-	var cs []io.Closer
-
+	var (
+		bs []ByteSlice
+		cs []io.Closer
+	)
 	for _, fn := range files {
 		f, err := fileutil.OpenMmapFile(fn)
 		if err != nil {
@@ -346,6 +347,11 @@ func (s *Reader) Close() error {
 	return closeAll(s.cs...)
 }
 
+// Size returns the size of the chunks.
+func (s *Reader) Size() int64 {
+	return s.size
+}
+
 func (s *Reader) Chunk(ref uint64) (chunkenc.Chunk, error) {
 	var (
 		seq = int(ref >> 32)
diff --git a/compact.go b/compact.go
index f8e6ff545c..49d4e5868c 100644
--- a/compact.go
+++ b/compact.go
@@ -347,7 +347,7 @@ func (c *LeveledCompactor) Compact(dest string, dirs []string, open []*Block) (u
 
 		if b == nil {
 			var err error
-			b, err = OpenBlock(d, c.chunkPool)
+			b, err = OpenBlock(c.logger, d, c.chunkPool)
 			if err != nil {
 				return uid, err
 			}
diff --git a/db.go b/db.go
index 43cfadd46a..6307004393 100644
--- a/db.go
+++ b/db.go
@@ -58,6 +58,13 @@ type Options struct {
 	// Duration of persisted data to keep.
 	RetentionDuration uint64
 
+	// Maximum number of bytes in blocks to be retained.
+	// 0 or less means disabled.
+	// NOTE: For proper storage calculations need to consider
+	// the size of the WAL folder which is not added when calculating
+	// the current size of the database.
+	MaxBytes int64
+
 	// The sizes of the Blocks.
 	BlockRanges []int64
 
@@ -127,11 +134,12 @@ type dbMetrics struct {
 	reloads              prometheus.Counter
 	reloadsFailed        prometheus.Counter
 	compactionsTriggered prometheus.Counter
+	timeRetentionCount   prometheus.Counter
 	compactionsSkipped   prometheus.Counter
-	cutoffs              prometheus.Counter
-	cutoffsFailed        prometheus.Counter
 	startTime            prometheus.GaugeFunc
 	tombCleanTimer       prometheus.Histogram
+	blocksBytes          prometheus.Gauge
+	sizeRetentionCount   prometheus.Counter
 }
 
 func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
@@ -170,18 +178,14 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
 		Name: "prometheus_tsdb_compactions_triggered_total",
 		Help: "Total number of triggered compactions for the partition.",
 	})
+	m.timeRetentionCount = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "prometheus_tsdb_time_retentions_total",
+		Help: "The number of times that blocks were deleted because the maximum time limit was exceeded.",
+	})
 	m.compactionsSkipped = prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "prometheus_tsdb_compactions_skipped_total",
 		Help: "Total number of skipped compactions due to disabled auto compaction.",
 	})
-	m.cutoffs = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_retention_cutoffs_total",
-		Help: "Number of times the database cut off block data from disk.",
-	})
-	m.cutoffsFailed = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_retention_cutoffs_failures_total",
-		Help: "Number of times the database failed to cut off block data from disk.",
-	})
 	m.startTime = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 		Name: "prometheus_tsdb_lowest_timestamp",
 		Help: "Lowest timestamp value stored in the database. The unit is decided by the library consumer.",
@@ -197,6 +201,14 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
 		Name: "prometheus_tsdb_tombstone_cleanup_seconds",
 		Help: "The time taken to recompact blocks to remove tombstones.",
 	})
+	m.blocksBytes = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "prometheus_tsdb_storage_blocks_bytes_total",
+		Help: "The number of bytes that are currently used for local storage by all blocks.",
+	})
+	m.sizeRetentionCount = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "prometheus_tsdb_size_retentions_total",
+		Help: "The number of times that blocks were deleted because the maximum number of bytes was exceeded.",
+	})
 
 	if r != nil {
 		r.MustRegister(
@@ -204,11 +216,12 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
 			m.symbolTableSize,
 			m.reloads,
 			m.reloadsFailed,
-			m.cutoffs,
-			m.cutoffsFailed,
+			m.timeRetentionCount,
 			m.compactionsTriggered,
 			m.startTime,
 			m.tombCleanTimer,
+			m.blocksBytes,
+			m.sizeRetentionCount,
 		)
 	}
 	return m
@@ -340,25 +353,6 @@ func (db *DB) run() {
 	}
 }
 
-func (db *DB) beyondRetention(meta *BlockMeta) bool {
-	if db.opts.RetentionDuration == 0 {
-		return false
-	}
-
-	db.mtx.RLock()
-	blocks := db.blocks[:]
-	db.mtx.RUnlock()
-
-	if len(blocks) == 0 {
-		return false
-	}
-
-	last := blocks[len(db.blocks)-1]
-	mint := last.Meta().MaxTime - int64(db.opts.RetentionDuration)
-
-	return meta.MaxTime < mint
-}
-
 // Appender opens a new appender against the database.
 func (db *DB) Appender() Appender {
 	return dbAppender{db: db, Appender: db.head.Appender()}
@@ -474,8 +468,7 @@ func (db *DB) getBlock(id ulid.ULID) (*Block, bool) {
 	return nil, false
 }
 
-// reload on-disk blocks and trigger head truncation if new blocks appeared. It takes
-// a list of block directories which should be deleted during reload.
+// reload blocks and trigger head truncation if new blocks appeared.
 // Blocks that are obsolete due to replacement or retention will be deleted.
 func (db *DB) reload() (err error) {
 	defer func() {
@@ -485,112 +478,187 @@ func (db *DB) reload() (err error) {
 		db.metrics.reloads.Inc()
 	}()
 
-	dirs, err := blockDirs(db.dir)
+	loadable, corrupted, err := db.openBlocks()
 	if err != nil {
-		return errors.Wrap(err, "find blocks")
+		return err
 	}
-	// We delete old blocks that have been superseded by new ones by gathering all parents
-	// from existing blocks. Those parents all have newer replacements and can be safely deleted
-	// after we loaded the other blocks.
-	// This makes us resilient against the process crashing towards the end of a compaction.
-	// Creation of a new block and deletion of its parents cannot happen atomically. By creating
-	// blocks with their parents, we can pick up the deletion where it left off during a crash.
-	var (
-		blocks     []*Block
-		corrupted  = map[ulid.ULID]error{}
-		opened     = map[ulid.ULID]struct{}{}
-		deleteable = map[ulid.ULID]struct{}{}
-	)
-	for _, dir := range dirs {
-		meta, err := readMetaFile(dir)
-		if err != nil {
-			// The block was potentially in the middle of being deleted during a crash.
-			// Skip it since we may delete it properly further down again.
-			level.Warn(db.logger).Log("msg", "read meta information", "err", err, "dir", dir)
 
-			ulid, err2 := ulid.Parse(filepath.Base(dir))
-			if err2 != nil {
-				level.Error(db.logger).Log("msg", "not a block dir", "dir", dir)
-				continue
-			}
-			corrupted[ulid] = err
-			continue
-		}
-		if db.beyondRetention(meta) {
-			deleteable[meta.ULID] = struct{}{}
-			continue
-		}
-		for _, b := range meta.Compaction.Parents {
-			deleteable[b.ULID] = struct{}{}
+	deletable := db.deletableBlocks(loadable)
+
+	// Corrupted blocks that have been replaced by parents can be safely ignored and deleted.
+	// This makes it resilient against the process crashing towards the end of a compaction.
+	// Creation of a new block and deletion of its parents cannot happen atomically.
+	// By creating blocks with their parents, we can pick up the deletion where it left off during a crash.
+	for _, block := range loadable {
+		for _, b := range block.Meta().Compaction.Parents {
+			delete(corrupted, b.ULID)
+			deletable[b.ULID] = nil
 		}
 	}
-	// Blocks we failed to open should all be those we are want to delete anyway.
-	for c, err := range corrupted {
-		if _, ok := deleteable[c]; !ok {
-			return errors.Wrapf(err, "unexpected corrupted block %s", c)
-		}
+	if len(corrupted) > 0 {
+		return errors.Wrap(err, "unexpected corrupted block")
 	}
-	// Load new blocks into memory.
-	for _, dir := range dirs {
-		meta, err := readMetaFile(dir)
-		if err != nil {
-			return errors.Wrapf(err, "read meta information %s", dir)
-		}
-		// Don't load blocks that are scheduled for deletion.
-		if _, ok := deleteable[meta.ULID]; ok {
+
+	// All deletable blocks should not be loaded.
+	var (
+		bb         []*Block
+		blocksSize int64
+	)
+	for _, block := range loadable {
+		if _, ok := deletable[block.Meta().ULID]; ok {
+			deletable[block.Meta().ULID] = block
 			continue
 		}
-		// See if we already have the block in memory or open it otherwise.
-		b, ok := db.getBlock(meta.ULID)
-		if !ok {
-			b, err = OpenBlock(dir, db.chunkPool)
-			if err != nil {
-				return errors.Wrapf(err, "open block %s", dir)
-			}
-		}
-		blocks = append(blocks, b)
-		opened[meta.ULID] = struct{}{}
+		bb = append(bb, block)
+		blocksSize += block.Size()
+
 	}
-	sort.Slice(blocks, func(i, j int) bool {
-		return blocks[i].Meta().MinTime < blocks[j].Meta().MinTime
+	loadable = bb
+	db.metrics.blocksBytes.Set(float64(blocksSize))
+
+	sort.Slice(loadable, func(i, j int) bool {
+		return loadable[i].Meta().MaxTime < loadable[j].Meta().MaxTime
 	})
-	if err := validateBlockSequence(blocks); err != nil {
+	if err := validateBlockSequence(loadable); err != nil {
 		return errors.Wrap(err, "invalid block sequence")
 	}
 
-	// Swap in new blocks first for subsequently created readers to be seen.
-	// Then close previous blocks, which may block for pending readers to complete.
+	// Swap new blocks first for subsequently created readers to be seen.
 	db.mtx.Lock()
 	oldBlocks := db.blocks
-	db.blocks = blocks
+	db.blocks = loadable
 	db.mtx.Unlock()
 
-	// Drop old blocks from memory.
 	for _, b := range oldBlocks {
-		if _, ok := opened[b.Meta().ULID]; ok {
-			continue
-		}
-		if err := b.Close(); err != nil {
-			level.Warn(db.logger).Log("msg", "closing block failed", "err", err)
+		if _, ok := deletable[b.Meta().ULID]; ok {
+			deletable[b.Meta().ULID] = b
 		}
 	}
-	// Delete all obsolete blocks. None of them are opened any longer.
-	for ulid := range deleteable {
-		if err := os.RemoveAll(filepath.Join(db.dir, ulid.String())); err != nil {
-			return errors.Wrapf(err, "delete obsolete block %s", ulid)
-		}
+
+	if err := db.deleteBlocks(deletable); err != nil {
+		return err
 	}
 
 	// Garbage collect data in the head if the most recent persisted block
 	// covers data of its current time range.
-	if len(blocks) == 0 {
+	if len(loadable) == 0 {
 		return nil
 	}
-	maxt := blocks[len(blocks)-1].Meta().MaxTime
+
+	maxt := loadable[len(loadable)-1].Meta().MaxTime
 
 	return errors.Wrap(db.head.Truncate(maxt), "head truncate failed")
 }
 
+func (db *DB) openBlocks() (blocks []*Block, corrupted map[ulid.ULID]error, err error) {
+	dirs, err := blockDirs(db.dir)
+	if err != nil {
+		return nil, nil, errors.Wrap(err, "find blocks")
+	}
+
+	corrupted = make(map[ulid.ULID]error)
+	for _, dir := range dirs {
+		meta, err := readMetaFile(dir)
+		if err != nil {
+			level.Error(db.logger).Log("msg", "not a block dir", "dir", dir)
+			continue
+		}
+
+		// See if we already have the block in memory or open it otherwise.
+		block, ok := db.getBlock(meta.ULID)
+		if !ok {
+			block, err = OpenBlock(db.logger, dir, db.chunkPool)
+			if err != nil {
+				corrupted[meta.ULID] = err
+				continue
+			}
+		}
+		blocks = append(blocks, block)
+	}
+	return blocks, corrupted, nil
+}
+
+// deletableBlocks returns all blocks past retention policy.
+func (db *DB) deletableBlocks(blocks []*Block) map[ulid.ULID]*Block {
+	deletable := make(map[ulid.ULID]*Block)
+
+	// Sort the blocks by time - newest to oldest (largest to smallest timestamp).
+	// This ensures that the retentions will remove the oldest  blocks.
+	sort.Slice(blocks, func(i, j int) bool {
+		return blocks[i].Meta().MaxTime > blocks[j].Meta().MaxTime
+	})
+
+	for ulid, block := range db.beyondTimeRetention(blocks) {
+		deletable[ulid] = block
+	}
+
+	for ulid, block := range db.beyondSizeRetention(blocks) {
+		deletable[ulid] = block
+	}
+
+	return deletable
+}
+
+func (db *DB) beyondTimeRetention(blocks []*Block) (deleteable map[ulid.ULID]*Block) {
+	// Time retention is disabled or no blocks to work with.
+	if len(db.blocks) == 0 || db.opts.RetentionDuration == 0 {
+		return
+	}
+
+	deleteable = make(map[ulid.ULID]*Block)
+	for i, block := range blocks {
+		// The difference between the first block and this block is larger than
+		// the retention period so any blocks after that are added as deleteable.
+		if i > 0 && blocks[0].Meta().MaxTime-block.Meta().MaxTime > int64(db.opts.RetentionDuration) {
+			for _, b := range blocks[i:] {
+				deleteable[b.meta.ULID] = b
+			}
+			db.metrics.timeRetentionCount.Inc()
+			break
+		}
+	}
+	return deleteable
+}
+
+func (db *DB) beyondSizeRetention(blocks []*Block) (deleteable map[ulid.ULID]*Block) {
+	// Size retention is disabled or no blocks to work with.
+	if len(db.blocks) == 0 || db.opts.MaxBytes <= 0 {
+		return
+	}
+
+	deleteable = make(map[ulid.ULID]*Block)
+	blocksSize := int64(0)
+	for i, block := range blocks {
+		blocksSize += block.Size()
+		if blocksSize > db.opts.MaxBytes {
+			// Add this and all following blocks for deletion.
+			for _, b := range blocks[i:] {
+				deleteable[b.meta.ULID] = b
+			}
+			db.metrics.sizeRetentionCount.Inc()
+			break
+		}
+	}
+	return deleteable
+}
+
+// deleteBlocks closes and deletes blocks from the disk.
+// When the map contains a non nil block object it means it is loaded in memory
+// so needs to be closed first as it might need to wait for pending readers to complete.
+func (db *DB) deleteBlocks(blocks map[ulid.ULID]*Block) error {
+	for ulid, block := range blocks {
+		if block != nil {
+			if err := block.Close(); err != nil {
+				level.Warn(db.logger).Log("msg", "closing block failed", "err", err)
+			}
+		}
+		if err := os.RemoveAll(filepath.Join(db.dir, ulid.String())); err != nil {
+			return errors.Wrapf(err, "delete obsolete block %s", ulid)
+		}
+	}
+	return nil
+}
+
 // validateBlockSequence returns error if given block meta files indicate that some blocks overlaps within sequence.
 func validateBlockSequence(bs []*Block) error {
 	if len(bs) <= 1 {
diff --git a/db_test.go b/db_test.go
index 92d487eecb..2beef0c523 100644
--- a/db_test.go
+++ b/db_test.go
@@ -92,6 +92,9 @@ func TestDB_reloadOrder(t *testing.T) {
 
 	testutil.Ok(t, db.reload())
 	blocks := db.Blocks()
+	for _, b := range blocks {
+		b.meta.Stats.NumBytes = 0
+	}
 	testutil.Equals(t, 3, len(blocks))
 	testutil.Equals(t, metas[1].MinTime, blocks[0].Meta().MinTime)
 	testutil.Equals(t, metas[1].MaxTime, blocks[0].Meta().MaxTime)
@@ -834,7 +837,7 @@ func TestTombstoneCleanFail(t *testing.T) {
 	totalBlocks := 2
 	for i := 0; i < totalBlocks; i++ {
 		blockDir := createBlock(t, db.Dir(), 0, 0, 0)
-		block, err := OpenBlock(blockDir, nil)
+		block, err := OpenBlock(nil, blockDir, nil)
 		testutil.Ok(t, err)
 		// Add some some fake tombstones to trigger the compaction.
 		tomb := newMemTombstones()
@@ -877,7 +880,7 @@ func (c *mockCompactorFailing) Write(dest string, b BlockReader, mint, maxt int6
 		return ulid.ULID{}, fmt.Errorf("the compactor already did the maximum allowed blocks so it is time to fail")
 	}
 
-	block, err := OpenBlock(createBlock(c.t, dest, 0, 0, 0), nil)
+	block, err := OpenBlock(nil, createBlock(c.t, dest, 0, 0, 0), nil)
 	testutil.Ok(c.t, err)
 	testutil.Ok(c.t, block.Close()) // Close block as we won't be using anywhere.
 	c.blocks = append(c.blocks, block)
@@ -901,59 +904,98 @@ func (*mockCompactorFailing) Compact(dest string, dirs []string, open []*Block)
 
 }
 
-func TestDB_Retention(t *testing.T) {
-	db, close := openTestDB(t, nil)
-	defer close()
-
-	lbls := labels.Labels{labels.Label{Name: "labelname", Value: "labelvalue"}}
-
-	app := db.Appender()
-	_, err := app.Add(lbls, 0, 1)
-	testutil.Ok(t, err)
-	testutil.Ok(t, app.Commit())
-
-	// create snapshot to make it create a block.
-	// TODO(gouthamve): Add a method to compact headblock.
-	snap, err := ioutil.TempDir("", "snap")
-	testutil.Ok(t, err)
-
-	defer os.RemoveAll(snap)
-	testutil.Ok(t, db.Snapshot(snap, true))
-	testutil.Ok(t, db.Close())
-
-	// reopen DB from snapshot
-	db, err = Open(snap, nil, nil, nil)
-	testutil.Ok(t, err)
-
-	testutil.Equals(t, 1, len(db.blocks))
-
-	app = db.Appender()
-	_, err = app.Add(lbls, 100, 1)
-	testutil.Ok(t, err)
-	testutil.Ok(t, app.Commit())
-
-	// Snapshot again to create another block.
-	snap, err = ioutil.TempDir("", "snap")
-	testutil.Ok(t, err)
-	defer os.RemoveAll(snap)
-
-	testutil.Ok(t, db.Snapshot(snap, true))
-	testutil.Ok(t, db.Close())
-
-	// reopen DB from snapshot
-	db, err = Open(snap, nil, nil, &Options{
-		RetentionDuration: 10,
-		BlockRanges:       []int64{50},
+func TestTimeRetention(t *testing.T) {
+	db, close := openTestDB(t, &Options{
+		BlockRanges: []int64{1000},
 	})
-	testutil.Ok(t, err)
+	defer close()
 	defer db.Close()
 
-	testutil.Equals(t, 2, len(db.blocks))
+	blocks := []*BlockMeta{
+		{MinTime: 500, MaxTime: 900}, // Oldest block
+		{MinTime: 1000, MaxTime: 1500},
+		{MinTime: 1500, MaxTime: 2000}, // Newest Block
+	}
 
-	// Reload blocks, which should drop blocks beyond the retention boundary.
+	for _, m := range blocks {
+		createBlock(t, db.Dir(), 10, m.MinTime, m.MaxTime)
+	}
+
+	testutil.Ok(t, db.reload())                       // Reload the db to register the new blocks.
+	testutil.Equals(t, len(blocks), len(db.Blocks())) // Ensure all blocks are registered.
+
+	db.opts.RetentionDuration = uint64(blocks[2].MaxTime - blocks[1].MinTime)
 	testutil.Ok(t, db.reload())
-	testutil.Equals(t, 1, len(db.blocks))
-	testutil.Equals(t, int64(100), db.blocks[0].meta.MaxTime) // To verify its the right block.
+
+	expBlocks := blocks[1:]
+	actBlocks := db.Blocks()
+
+	testutil.Equals(t, 1, int(prom_testutil.ToFloat64(db.metrics.timeRetentionCount)), "metric retention count mismatch")
+	testutil.Equals(t, len(expBlocks), len(actBlocks))
+	testutil.Equals(t, expBlocks[0].MaxTime, actBlocks[0].meta.MaxTime)
+	testutil.Equals(t, expBlocks[len(expBlocks)-1].MaxTime, actBlocks[len(actBlocks)-1].meta.MaxTime)
+}
+
+func TestSizeRetention(t *testing.T) {
+	db, close := openTestDB(t, &Options{
+		BlockRanges: []int64{100},
+	})
+	defer close()
+	defer db.Close()
+
+	blocks := []*BlockMeta{
+		{MinTime: 100, MaxTime: 200}, // Oldest block
+		{MinTime: 200, MaxTime: 300},
+		{MinTime: 300, MaxTime: 400},
+		{MinTime: 400, MaxTime: 500},
+		{MinTime: 500, MaxTime: 600}, // Newest Block
+	}
+
+	for _, m := range blocks {
+		createBlock(t, db.Dir(), 100, m.MinTime, m.MaxTime)
+	}
+
+	// Test that registered size matches the actual disk size.
+	testutil.Ok(t, db.reload())                                       // Reload the db to register the new db size.
+	testutil.Equals(t, len(blocks), len(db.Blocks()))                 // Ensure all blocks are registered.
+	expSize := int64(prom_testutil.ToFloat64(db.metrics.blocksBytes)) // Use the the actual internal metrics.
+	actSize := dbDiskSize(db.Dir())
+	testutil.Equals(t, expSize, actSize, "registered size doesn't match actual disk size")
+
+	// Decrease the max bytes limit so that a delete is triggered.
+	// Check total size, total count and check that the oldest block was deleted.
+	firstBlockSize := db.Blocks()[0].Size()
+	sizeLimit := actSize - firstBlockSize
+	db.opts.MaxBytes = sizeLimit // Set the new db size limit one block smaller that the actual size.
+	testutil.Ok(t, db.reload())  // Reload the db to register the new db size.
+
+	expBlocks := blocks[1:]
+	actBlocks := db.Blocks()
+	expSize = int64(prom_testutil.ToFloat64(db.metrics.blocksBytes))
+	actRetentCount := int(prom_testutil.ToFloat64(db.metrics.sizeRetentionCount))
+	actSize = dbDiskSize(db.Dir())
+
+	testutil.Equals(t, 1, actRetentCount, "metric retention count mismatch")
+	testutil.Equals(t, actSize, expSize, "metric db size doesn't match actual disk size")
+	testutil.Assert(t, expSize <= sizeLimit, "actual size (%v) is expected to be less than or equal to limit (%v)", expSize, sizeLimit)
+	testutil.Equals(t, len(blocks)-1, len(actBlocks), "new block count should be decreased from:%v to:%v", len(blocks), len(blocks)-1)
+	testutil.Equals(t, expBlocks[0].MaxTime, actBlocks[0].meta.MaxTime, "maxT mismatch of the first block")
+	testutil.Equals(t, expBlocks[len(expBlocks)-1].MaxTime, actBlocks[len(actBlocks)-1].meta.MaxTime, "maxT mismatch of the last block")
+
+}
+
+func dbDiskSize(dir string) int64 {
+	var statSize int64
+	filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
+		// Include only index,tombstone and chunks.
+		if filepath.Dir(path) == chunkDir(filepath.Dir(filepath.Dir(path))) ||
+			info.Name() == indexFilename ||
+			info.Name() == tombstoneFilename {
+			statSize += info.Size()
+		}
+		return nil
+	})
+	return statSize
 }
 
 func TestNotMatcherSelectsLabelsUnsetSeries(t *testing.T) {
diff --git a/index/index.go b/index/index.go
index cce29d9b18..74e08d4651 100644
--- a/index/index.go
+++ b/index/index.go
@@ -914,6 +914,11 @@ func (r *Reader) SortedPostings(p Postings) Postings {
 	return p
 }
 
+// Size returns the size of an index file.
+func (r *Reader) Size() int64 {
+	return int64(r.b.Len())
+}
+
 // LabelNames returns all the unique label names present in the index.
 func (r *Reader) LabelNames() ([]string, error) {
 	labelNamesMap := make(map[string]struct{}, len(r.labels))
diff --git a/querier_test.go b/querier_test.go
index 79dfbff7a0..69ca84602b 100644
--- a/querier_test.go
+++ b/querier_test.go
@@ -1227,12 +1227,12 @@ func BenchmarkMergedSeriesSet(b *testing.B) {
 
 func BenchmarkPersistedQueries(b *testing.B) {
 	for _, nSeries := range []int{10, 100} {
-		for _, nSamples := range []int{1000, 10000, 100000} {
+		for _, nSamples := range []int64{1000, 10000, 100000} {
 			b.Run(fmt.Sprintf("series=%d,samplesPerSeries=%d", nSeries, nSamples), func(b *testing.B) {
 				dir, err := ioutil.TempDir("", "bench_persisted")
 				testutil.Ok(b, err)
 				defer os.RemoveAll(dir)
-				block, err := OpenBlock(createBlock(b, dir, nSeries, 1, int64(nSamples)), nil)
+				block, err := OpenBlock(nil, createBlock(b, dir, nSeries, 1, int64(nSamples)), nil)
 				testutil.Ok(b, err)
 				defer block.Close()
 
diff --git a/tombstones.go b/tombstones.go
index a1f30b59c7..0781404067 100644
--- a/tombstones.go
+++ b/tombstones.go
@@ -113,37 +113,41 @@ type Stone struct {
 	intervals Intervals
 }
 
-func readTombstones(dir string) (TombstoneReader, error) {
+func readTombstones(dir string) (TombstoneReader, SizeReader, error) {
 	b, err := ioutil.ReadFile(filepath.Join(dir, tombstoneFilename))
 	if os.IsNotExist(err) {
-		return newMemTombstones(), nil
+		return newMemTombstones(), nil, nil
 	} else if err != nil {
-		return nil, err
+		return nil, nil, err
+	}
+
+	sr := &TombstoneFile{
+		size: int64(len(b)),
 	}
 
 	if len(b) < 5 {
-		return nil, errors.Wrap(errInvalidSize, "tombstones header")
+		return nil, sr, errors.Wrap(errInvalidSize, "tombstones header")
 	}
 
 	d := &decbuf{b: b[:len(b)-4]} // 4 for the checksum.
 	if mg := d.be32(); mg != MagicTombstone {
-		return nil, fmt.Errorf("invalid magic number %x", mg)
+		return nil, sr, fmt.Errorf("invalid magic number %x", mg)
 	}
 	if flag := d.byte(); flag != tombstoneFormatV1 {
-		return nil, fmt.Errorf("invalid tombstone format %x", flag)
+		return nil, sr, fmt.Errorf("invalid tombstone format %x", flag)
 	}
 
 	if d.err() != nil {
-		return nil, d.err()
+		return nil, sr, d.err()
 	}
 
 	// Verify checksum.
 	hash := newCRC32()
 	if _, err := hash.Write(d.get()); err != nil {
-		return nil, errors.Wrap(err, "write to hash")
+		return nil, sr, errors.Wrap(err, "write to hash")
 	}
 	if binary.BigEndian.Uint32(b[len(b)-4:]) != hash.Sum32() {
-		return nil, errors.New("checksum did not match")
+		return nil, sr, errors.New("checksum did not match")
 	}
 
 	stonesMap := newMemTombstones()
@@ -153,13 +157,13 @@ func readTombstones(dir string) (TombstoneReader, error) {
 		mint := d.varint64()
 		maxt := d.varint64()
 		if d.err() != nil {
-			return nil, d.err()
+			return nil, sr, d.err()
 		}
 
 		stonesMap.addInterval(k, Interval{mint, maxt})
 	}
 
-	return stonesMap, nil
+	return stonesMap, sr, nil
 }
 
 type memTombstones struct {
@@ -210,6 +214,16 @@ func (t *memTombstones) addInterval(ref uint64, itvs ...Interval) {
 	}
 }
 
+// TombstoneFile holds information about the tombstone file.
+type TombstoneFile struct {
+	size int64
+}
+
+// Size returns the tombstone file size.
+func (t *TombstoneFile) Size() int64 {
+	return t.size
+}
+
 func (*memTombstones) Close() error {
 	return nil
 }
diff --git a/tombstones_test.go b/tombstones_test.go
index e12574f113..2a106d705a 100644
--- a/tombstones_test.go
+++ b/tombstones_test.go
@@ -46,7 +46,7 @@ func TestWriteAndReadbackTombStones(t *testing.T) {
 
 	testutil.Ok(t, writeTombstoneFile(tmpdir, stones))
 
-	restr, err := readTombstones(tmpdir)
+	restr, _, err := readTombstones(tmpdir)
 	testutil.Ok(t, err)
 
 	// Compare the two readers.

From 3929359302170bd79567f9808bafcb7fdf116acf Mon Sep 17 00:00:00 2001
From: Callum Styan <callumstyan@gmail.com>
Date: Wed, 16 Jan 2019 10:09:08 -0800
Subject: [PATCH 05/23] add live reader for WAL (#481)

* add live reader for WAL

Signed-off-by: Callum Styan <callumstyan@gmail.com>
---
 CHANGELOG.md    |   1 +
 wal/wal.go      | 247 +++++++++++++++++++++++++++++---
 wal/wal_test.go | 368 ++++++++++++++++++++++++++++++++++++------------
 3 files changed, 506 insertions(+), 110 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a44471681a..5975d8b2b0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@
     - `OpenBlock` signature changed to take a logger.
  - [REMOVED] `PrefixMatcher` is considered unused so was removed.
  - [CLEANUP] `Options.WALFlushInterval` is removed as it wasn't used anywhere.
+ - [FEATURE] Add new `LiveReader` to WAL pacakge. Added to allow live tailing of a WAL segment, used by Prometheus Remote Write after refactor. The main difference between the new reader and the existing `Reader` is that for `LiveReader` a call to `Next()` that returns false does not mean that there will never be more data to read.
 
 ## 0.3.1
  - [BUGFIX] Fixed most windows test and some actual bugs for unclosed file readers.
diff --git a/wal/wal.go b/wal/wal.go
index 5433fd0423..fd90eb90ec 100644
--- a/wal/wal.go
+++ b/wal/wal.go
@@ -832,28 +832,13 @@ func (r *Reader) next() (err error) {
 		}
 		r.rec = append(r.rec, buf[:length]...)
 
-		switch r.curRecTyp {
-		case recFull:
-			if i != 0 {
-				return errors.New("unexpected full record")
-			}
-			return nil
-		case recFirst:
-			if i != 0 {
-				return errors.New("unexpected first record")
-			}
-		case recMiddle:
-			if i == 0 {
-				return errors.New("unexpected middle record")
-			}
-		case recLast:
-			if i == 0 {
-				return errors.New("unexpected last record")
-			}
-			return nil
-		default:
-			return errors.Errorf("unexpected record type %d", r.curRecTyp)
+		if err := validateRecord(r.curRecTyp, i); err != nil {
+			return err
 		}
+		if r.curRecTyp == recLast || r.curRecTyp == recFull {
+			return nil
+		}
+
 		// Only increment i for non-zero records since we use it
 		// to determine valid content record sequences.
 		i++
@@ -904,6 +889,226 @@ func (r *Reader) Offset() int64 {
 	return r.total
 }
 
+// NewLiveReader returns a new live reader.
+func NewLiveReader(r io.Reader) *LiveReader {
+	return &LiveReader{rdr: r}
+}
+
+// Reader reads WAL records from an io.Reader. It buffers partial record data for
+// the next read.
+type LiveReader struct {
+	rdr        io.Reader
+	err        error
+	rec        []byte
+	hdr        [recordHeaderSize]byte
+	buf        [pageSize]byte
+	readIndex  int   // Index in buf to start at for next read.
+	writeIndex int   // Index in buf to start at for next write.
+	total      int64 // Total bytes processed during reading in calls to Next().
+	index      int   // Used to track partial records, should be 0 at the start of every new record.
+}
+
+func (r *LiveReader) Err() error {
+	return r.err
+}
+
+func (r *LiveReader) TotalRead() int64 {
+	return r.total
+}
+
+func (r *LiveReader) fillBuffer() error {
+	n, err := r.rdr.Read(r.buf[r.writeIndex:len(r.buf)])
+	r.writeIndex += n
+	return err
+}
+
+// Shift the buffer up to the read index.
+func (r *LiveReader) shiftBuffer() {
+	copied := copy(r.buf[0:], r.buf[r.readIndex:r.writeIndex])
+	r.readIndex = 0
+	r.writeIndex = copied
+}
+
+// Next returns true if r.rec will contain a full record.
+// False does not indicate that there will never be more data to
+// read for the current io.Reader.
+func (r *LiveReader) Next() bool {
+	for {
+		if r.buildRecord() {
+			return true
+		}
+		if r.err != nil && r.err != io.EOF {
+			return false
+		}
+		if r.readIndex == pageSize {
+			r.shiftBuffer()
+		}
+		if r.writeIndex != pageSize {
+			if err := r.fillBuffer(); err != nil {
+				// We expect to get EOF, since we're reading the segment file as it's being written.
+				if err != io.EOF {
+					r.err = err
+				}
+				return false
+			}
+		}
+	}
+}
+
+// Record returns the current record.
+// The returned byte slice is only valid until the next call to Next.
+func (r *LiveReader) Record() []byte {
+	return r.rec
+}
+
+// Rebuild a full record from potentially partial records. Returns false
+// if there was an error or if we weren't able to read a record for any reason.
+// Returns true if we read a full record. Any record data is appeneded to
+// LiveReader.rec
+func (r *LiveReader) buildRecord() bool {
+	for {
+		// Check that we have data in the internal buffer to read.
+		if r.writeIndex <= r.readIndex {
+			return false
+		}
+
+		// Attempt to read a record, partial or otherwise.
+		temp, n, err := readRecord(r.buf[r.readIndex:r.writeIndex], r.hdr[:], r.total)
+		r.readIndex += n
+		r.total += int64(n)
+		if err != nil {
+			r.err = err
+			return false
+		}
+
+		if temp == nil {
+			return false
+		}
+
+		rt := recType(r.hdr[0])
+
+		if rt == recFirst || rt == recFull {
+			r.rec = r.rec[:0]
+		}
+		r.rec = append(r.rec, temp...)
+
+		if err := validateRecord(rt, r.index); err != nil {
+			r.err = err
+			r.index = 0
+			return false
+		}
+		if rt == recLast || rt == recFull {
+			r.index = 0
+			return true
+		}
+		// Only increment i for non-zero records since we use it
+		// to determine valid content record sequences.
+		r.index++
+	}
+}
+
+// Returns an error if the recType and i indicate an invalid record sequence.
+// As an example, if i is > 0 because we've read some amount of a partial record
+// (recFirst, recMiddle, etc. but not recLast) and then we get another recFirst or recFull
+// instead of a recLast or recMiddle we would have an invalid record.
+func validateRecord(typ recType, i int) error {
+	switch typ {
+	case recFull:
+		if i != 0 {
+			return errors.New("unexpected full record")
+		}
+		return nil
+	case recFirst:
+		if i != 0 {
+			return errors.New("unexpected first record, dropping buffer")
+		}
+		return nil
+	case recMiddle:
+		if i == 0 {
+			return errors.New("unexpected middle record, dropping buffer")
+		}
+		return nil
+	case recLast:
+		if i == 0 {
+			return errors.New("unexpected last record, dropping buffer")
+		}
+		return nil
+	default:
+		return errors.Errorf("unexpected record type %d", typ)
+	}
+}
+
+// Read a sub-record (see recType) from the buffer. It could potentially
+// be a full record (recFull) if the record fits within the bounds of a single page.
+// Returns a byte slice of the record data read, the number of bytes read, and an error
+// if there's a non-zero byte in a page term record or the record checksum fails.
+// TODO(callum) the EOF errors we're returning from this function should theoretically
+// never happen, add a metric for them.
+func readRecord(buf []byte, header []byte, total int64) ([]byte, int, error) {
+	readIndex := 0
+	header[0] = buf[0]
+	readIndex++
+	total++
+
+	// The rest of this function is mostly from Reader.Next().
+	typ := recType(header[0])
+	// Gobble up zero bytes.
+	if typ == recPageTerm {
+		// We are pedantic and check whether the zeros are actually up to a page boundary.
+		// It's not strictly necessary but may catch sketchy state early.
+		k := pageSize - (total % pageSize)
+		if k == pageSize {
+			return nil, 1, nil // Initial 0 byte was last page byte.
+		}
+
+		if k <= int64(len(buf)-readIndex) {
+			for _, v := range buf[readIndex : int64(readIndex)+k] {
+				readIndex++
+				if v != 0 {
+					return nil, readIndex, errors.New("unexpected non-zero byte in page term bytes")
+				}
+			}
+			return nil, readIndex, nil
+		}
+		// Not enough bytes to read the rest of the page term rec.
+		// This theoretically should never happen, since we're only shifting the
+		// internal buffer of the live reader when we read to the end of page.
+		// Treat this the same as an EOF, it's an error we would expect to see.
+		return nil, 0, io.EOF
+	}
+
+	if readIndex+recordHeaderSize-1 > len(buf) {
+		// Treat this the same as an EOF, it's an error we would expect to see.
+		return nil, 0, io.EOF
+	}
+
+	copy(header[1:], buf[readIndex:readIndex+len(header[1:])])
+	readIndex += recordHeaderSize - 1
+	total += int64(recordHeaderSize - 1)
+	var (
+		length = binary.BigEndian.Uint16(header[1:])
+		crc    = binary.BigEndian.Uint32(header[3:])
+	)
+	readTo := int(length) + readIndex
+	if readTo > len(buf) {
+		if (readTo - readIndex) > pageSize {
+			return nil, 0, errors.Errorf("invalid record, record size would be larger than max page size: %d", int(length))
+		}
+		// Not enough data to read all of the record data.
+		// Treat this the same as an EOF, it's an error we would expect to see.
+		return nil, 0, io.EOF
+	}
+	recData := buf[readIndex:readTo]
+	readIndex += int(length)
+	total += int64(length)
+
+	// TODO(callum) what should we do here, throw out the record? We should add a metric at least.
+	if c := crc32.Checksum(recData, castagnoliTable); c != crc {
+		return recData, readIndex, errors.Errorf("unexpected checksum %x, expected %x", c, crc)
+	}
+	return recData, readIndex, nil
+}
+
 func min(i, j int) int {
 	if i < j {
 		return i
diff --git a/wal/wal_test.go b/wal/wal_test.go
index 8ac14ebc82..f95b212394 100644
--- a/wal/wal_test.go
+++ b/wal/wal_test.go
@@ -17,15 +17,107 @@ package wal
 import (
 	"bytes"
 	"encoding/binary"
+	"fmt"
 	"hash/crc32"
+	"io"
 	"io/ioutil"
 	"math/rand"
 	"os"
+	"path"
+	"sync"
 	"testing"
+	"time"
 
 	"github.com/prometheus/tsdb/testutil"
 )
 
+type record struct {
+	t recType
+	b []byte
+}
+
+var data = make([]byte, 100000)
+var testReaderCases = []struct {
+	t    []record
+	exp  [][]byte
+	fail bool
+}{
+	// Sequence of valid records.
+	{
+		t: []record{
+			{recFull, data[0:200]},
+			{recFirst, data[200:300]},
+			{recLast, data[300:400]},
+			{recFirst, data[400:800]},
+			{recMiddle, data[800:900]},
+			{recPageTerm, make([]byte, pageSize-900-recordHeaderSize*5-1)}, // exactly lines up with page boundary.
+			{recLast, data[900:900]},
+			{recFirst, data[900:1000]},
+			{recMiddle, data[1000:1200]},
+			{recMiddle, data[1200:30000]},
+			{recMiddle, data[30000:30001]},
+			{recMiddle, data[30001:30001]},
+			{recLast, data[30001:32000]},
+		},
+		exp: [][]byte{
+			data[0:200],
+			data[200:400],
+			data[400:900],
+			data[900:32000],
+		},
+	},
+	// Exactly at the limit of one page minus the header size
+	{
+		t: []record{
+			{recFull, data[0 : pageSize-recordHeaderSize]},
+		},
+		exp: [][]byte{
+			data[:pageSize-recordHeaderSize],
+		},
+	},
+	// More than a full page, this exceeds our buffer and can never happen
+	// when written by the WAL.
+	{
+		t: []record{
+			{recFull, data[0 : pageSize+1]},
+		},
+		fail: true,
+	},
+	// Invalid orders of record types.
+	{
+		t:    []record{{recMiddle, data[:200]}},
+		fail: true,
+	},
+	{
+		t:    []record{{recLast, data[:200]}},
+		fail: true,
+	},
+	{
+		t: []record{
+			{recFirst, data[:200]},
+			{recFull, data[200:400]},
+		},
+		fail: true,
+	},
+	{
+		t: []record{
+			{recFirst, data[:100]},
+			{recMiddle, data[100:200]},
+			{recFull, data[200:400]},
+		},
+		fail: true,
+	},
+	// Non-zero data after page termination.
+	{
+		t: []record{
+			{recFull, data[:100]},
+			{recPageTerm, append(make([]byte, 1000), 1)},
+		},
+		exp:  [][]byte{data[:100]},
+		fail: true,
+	},
+}
+
 func encodedRecord(t recType, b []byte) []byte {
 	if t == recPageTerm {
 		return append([]byte{0}, b...)
@@ -39,95 +131,7 @@ func encodedRecord(t recType, b []byte) []byte {
 
 // TestReader feeds the reader a stream of encoded records with different types.
 func TestReader(t *testing.T) {
-	data := make([]byte, 100000)
-	_, err := rand.Read(data)
-	testutil.Ok(t, err)
-
-	type record struct {
-		t recType
-		b []byte
-	}
-	cases := []struct {
-		t    []record
-		exp  [][]byte
-		fail bool
-	}{
-		// Sequence of valid records.
-		{
-			t: []record{
-				{recFull, data[0:200]},
-				{recFirst, data[200:300]},
-				{recLast, data[300:400]},
-				{recFirst, data[400:800]},
-				{recMiddle, data[800:900]},
-				{recPageTerm, make([]byte, pageSize-900-recordHeaderSize*5-1)}, // exactly lines up with page boundary.
-				{recLast, data[900:900]},
-				{recFirst, data[900:1000]},
-				{recMiddle, data[1000:1200]},
-				{recMiddle, data[1200:30000]},
-				{recMiddle, data[30000:30001]},
-				{recMiddle, data[30001:30001]},
-				{recLast, data[30001:32000]},
-			},
-			exp: [][]byte{
-				data[0:200],
-				data[200:400],
-				data[400:900],
-				data[900:32000],
-			},
-		},
-		// Exactly at the limit of one page minus the header size
-		{
-			t: []record{
-				{recFull, data[0 : pageSize-recordHeaderSize]},
-			},
-			exp: [][]byte{
-				data[:pageSize-recordHeaderSize],
-			},
-		},
-		// More than a full page, this exceeds our buffer and can never happen
-		// when written by the WAL.
-		{
-			t: []record{
-				{recFull, data[0 : pageSize+1]},
-			},
-			fail: true,
-		},
-		// Invalid orders of record types.
-		{
-			t:    []record{{recMiddle, data[:200]}},
-			fail: true,
-		},
-		{
-			t:    []record{{recLast, data[:200]}},
-			fail: true,
-		},
-		{
-			t: []record{
-				{recFirst, data[:200]},
-				{recFull, data[200:400]},
-			},
-			fail: true,
-		},
-		{
-			t: []record{
-				{recFirst, data[:100]},
-				{recMiddle, data[100:200]},
-				{recFull, data[200:400]},
-			},
-			fail: true,
-		},
-		// Non-zero data after page termination.
-		{
-			t: []record{
-				{recFull, data[:100]},
-				{recPageTerm, append(make([]byte, 1000), 1)},
-			},
-			exp:  [][]byte{data[:100]},
-			fail: true,
-		},
-	}
-	for i, c := range cases {
+	for i, c := range testReaderCases {
 		t.Logf("test %d", i)
 
 		var buf []byte
@@ -154,6 +158,192 @@ func TestReader(t *testing.T) {
 	}
 }
 
+func TestReader_Live(t *testing.T) {
+	for i, c := range testReaderCases {
+		t.Logf("test %d", i)
+		dir, err := ioutil.TempDir("", fmt.Sprintf("live_reader_%d", i))
+		t.Logf("created dir %s", dir)
+		testutil.Ok(t, err)
+		defer os.RemoveAll(dir)
+
+		// we're never going to have more than a single segment file per test case right now
+		f, err := os.Create(path.Join(dir, "00000000"))
+		testutil.Ok(t, err)
+
+		// live reader doesn't work on readers created from bytes buffers,
+		// since we need to be able to write more data to the thing we're
+		// reading from after the reader has been created
+		wg := sync.WaitGroup{}
+		// make sure the reader doesn't start until at least one record is written
+		wg.Add(1)
+		go func() {
+			for i, rec := range c.t {
+				rec := encodedRecord(rec.t, rec.b)
+				n, err := f.Write(rec)
+				testutil.Ok(t, err)
+				testutil.Assert(t, n > 0, "no bytes were written to wal")
+				if i == 0 {
+					wg.Done()
+				}
+			}
+		}()
+		sr, err := OpenReadSegment(SegmentName(dir, 0))
+		testutil.Ok(t, err)
+		lr := NewLiveReader(sr)
+		j := 0
+		wg.Wait()
+	caseLoop:
+		for {
+			for ; lr.Next(); j++ {
+				rec := lr.Record()
+				t.Log("j: ", j)
+				testutil.Equals(t, c.exp[j], rec, "Bytes within record did not match expected Bytes")
+				if j == len(c.exp)-1 {
+					break caseLoop
+				}
+
+			}
+
+			// Because reads and writes are happening concurrently, unless we get an error we should
+			// attempt to read records again.
+			if j == 0 && lr.Err() == nil {
+				continue
+			}
+
+			if !c.fail && lr.Err() != nil {
+				t.Fatalf("unexpected error: %s", lr.Err())
+			}
+			if c.fail && lr.Err() == nil {
+				t.Fatalf("expected error but got none:\n\tinput: %+v", c.t)
+			}
+			if lr.Err() != nil {
+				t.Log("err: ", lr.Err())
+				break
+			}
+		}
+	}
+}
+
+func TestWAL_FuzzWriteRead_Live(t *testing.T) {
+	const count = 5000
+	const segmentSize = int64(128 * 1024 * 1204)
+	var input [][]byte
+	lock := sync.RWMutex{}
+	var recs [][]byte
+	var index int
+
+	// Get size of segment.
+	getSegmentSize := func(dir string, index int) (int64, error) {
+		i := int64(-1)
+		fi, err := os.Stat(SegmentName(dir, index))
+		if err == nil {
+			i = fi.Size()
+		}
+		return i, err
+	}
+
+	readSegment := func(r *LiveReader) {
+		for r.Next() {
+			rec := r.Record()
+			lock.RLock()
+			l := len(input)
+			lock.RUnlock()
+			if index >= l {
+				t.Fatalf("read too many records")
+			}
+			lock.RLock()
+			if !bytes.Equal(input[index], rec) {
+				t.Fatalf("record %d (len %d) does not match (expected len %d)",
+					index, len(rec), len(input[index]))
+			}
+			lock.RUnlock()
+			index++
+		}
+		if r.Err() != io.EOF {
+			testutil.Ok(t, r.Err())
+		}
+	}
+
+	dir, err := ioutil.TempDir("", "wal_fuzz_live")
+	t.Log("created dir: ", dir)
+	testutil.Ok(t, err)
+	defer func() {
+		os.RemoveAll(dir)
+	}()
+
+	w, err := NewSize(nil, nil, dir, 128*pageSize)
+	testutil.Ok(t, err)
+
+	go func() {
+		for i := 0; i < count; i++ {
+			var sz int64
+			switch i % 5 {
+			case 0, 1:
+				sz = 50
+			case 2, 3:
+				sz = pageSize
+			default:
+				sz = pageSize * 8
+			}
+
+			rec := make([]byte, rand.Int63n(sz))
+			_, err := rand.Read(rec)
+			testutil.Ok(t, err)
+			lock.Lock()
+			input = append(input, rec)
+			lock.Unlock()
+			recs = append(recs, rec)
+
+			// Randomly batch up records.
+			if rand.Intn(4) < 3 {
+				testutil.Ok(t, w.Log(recs...))
+				recs = recs[:0]
+			}
+		}
+		testutil.Ok(t, w.Log(recs...))
+	}()
+
+	m, _, err := w.Segments()
+	testutil.Ok(t, err)
+
+	seg, err := OpenReadSegment(SegmentName(dir, m))
+	testutil.Ok(t, err)
+
+	r := NewLiveReader(seg)
+	segmentTicker := time.NewTicker(100 * time.Millisecond)
+	readTicker := time.NewTicker(10 * time.Millisecond)
+	for {
+		select {
+		case <-segmentTicker.C:
+			// check if new segments exist
+			_, last, err := w.Segments()
+			testutil.Ok(t, err)
+			if last > seg.i {
+				for {
+					readSegment(r)
+					if r.Err() != io.EOF {
+						testutil.Ok(t, r.Err())
+					}
+					size, err := getSegmentSize(dir, seg.i)
+					testutil.Ok(t, err)
+					// make sure we've read all of the current segment before rotating
+					if r.TotalRead() == size {
+						break
+					}
+				}
+				seg, err = OpenReadSegment(SegmentName(dir, seg.i+1))
+				testutil.Ok(t, err)
+				r = NewLiveReader(seg)
+			}
+		case <-readTicker.C:
+			readSegment(r)
+		}
+		if index == count {
+			break
+		}
+	}
+	testutil.Ok(t, r.Err())
+}
 func TestWAL_FuzzWriteRead(t *testing.T) {
 	const count = 25000
 

From 1a9d08adc548b3a3643038589af1e00489ac6883 Mon Sep 17 00:00:00 2001
From: Ganesh Vernekar <cs15btech11018@iith.ac.in>
Date: Fri, 18 Jan 2019 14:05:16 +0530
Subject: [PATCH 06/23] Don't write empty blocks (#374)

* Dont write empty blocks when a compaction results in a block with no samples.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
---
 CHANGELOG.md  |   1 +
 block.go      |   3 ++
 block_test.go |   3 +-
 compact.go    |  66 ++++++++++++++++++++-------
 db.go         |  17 ++++++-
 db_test.go    | 120 ++++++++++++++++++++++++++++++++++++++++++++++----
 6 files changed, 183 insertions(+), 27 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5975d8b2b0..1238644bd6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,6 @@
 ## master / unreleased
  - [CHANGE] New `WALSegmentSize` option to override the `DefaultOptions.WALSegmentSize`. Added to allow using smaller wal files. For example using tmpfs on a RPI to minimise the SD card wear out from the constant WAL writes. As part of this change the `DefaultOptions.WALSegmentSize` constant was also exposed.
+ - [CHANGE] Empty blocks are not written during compaction [#374](https://github.com/prometheus/tsdb/pull/374)
  - [FEATURE]  Size base retention through `Options.MaxBytes`.  As part of this change:
     - added new metrics - `prometheus_tsdb_storage_blocks_bytes_total`, `prometheus_tsdb_size_retentions_total`, `prometheus_tsdb_time_retentions_total`
     - new public interface `SizeReader: Size() int64`
diff --git a/block.go b/block.go
index 837f4a763d..42e11d9513 100644
--- a/block.go
+++ b/block.go
@@ -191,6 +191,9 @@ type BlockMetaCompaction struct {
 	Level int `json:"level"`
 	// ULIDs of all source head blocks that went into the block.
 	Sources []ulid.ULID `json:"sources,omitempty"`
+	// Indicates that during compaction it resulted in a block without any samples
+	// so it should be deleted on the next reload.
+	Deletable bool `json:"deletable,omitempty"`
 	// Short descriptions of the direct blocks that were used to create
 	// this block.
 	Parents []BlockDesc `json:"parents,omitempty"`
diff --git a/block_test.go b/block_test.go
index 789aebaa71..724ab37817 100644
--- a/block_test.go
+++ b/block_test.go
@@ -45,7 +45,7 @@ func TestSetCompactionFailed(t *testing.T) {
 	testutil.Ok(t, err)
 	defer os.RemoveAll(tmpdir)
 
-	blockDir := createBlock(t, tmpdir, 0, 0, 0)
+	blockDir := createBlock(t, tmpdir, 1, 0, 0)
 	b, err := OpenBlock(nil, blockDir, nil)
 	testutil.Ok(t, err)
 	testutil.Equals(t, false, b.meta.Compaction.Failed)
@@ -91,6 +91,5 @@ func createBlock(tb testing.TB, dir string, nSeries int, mint, maxt int64) strin
 
 	ulid, err := compactor.Write(dir, head, head.MinTime(), head.MaxTime(), nil)
 	testutil.Ok(tb, err)
-
 	return filepath.Join(dir, ulid.String())
 }
diff --git a/compact.go b/compact.go
index 49d4e5868c..5d8155f51b 100644
--- a/compact.go
+++ b/compact.go
@@ -55,12 +55,17 @@ type Compactor interface {
 	Plan(dir string) ([]string, error)
 
 	// Write persists a Block into a directory.
+	// No Block is written when resulting Block has 0 samples, and returns empty ulid.ULID{}.
 	Write(dest string, b BlockReader, mint, maxt int64, parent *BlockMeta) (ulid.ULID, error)
 
 	// Compact runs compaction against the provided directories. Must
 	// only be called concurrently with results of Plan().
 	// Can optionally pass a list of already open blocks,
 	// to avoid having to reopen them.
+	// When resulting Block has 0 samples
+	//  * No block is written.
+	//  * The source dirs are marked Deletable.
+	//  * Returns empty ulid.ULID{}.
 	Compact(dest string, dirs []string, open []*Block) (ulid.ULID, error)
 }
 
@@ -186,13 +191,12 @@ func (c *LeveledCompactor) plan(dms []dirMeta) ([]string, error) {
 		return res, nil
 	}
 
-	// Compact any blocks that have >5% tombstones.
+	// Compact any blocks with big enough time range that have >5% tombstones.
 	for i := len(dms) - 1; i >= 0; i-- {
 		meta := dms[i].meta
 		if meta.MaxTime-meta.MinTime < c.ranges[len(c.ranges)/2] {
 			break
 		}
-
 		if float64(meta.Stats.NumTombstones)/float64(meta.Stats.NumSeries+1) > 0.05 {
 			return []string{dms[i].dir}, nil
 		}
@@ -366,15 +370,34 @@ func (c *LeveledCompactor) Compact(dest string, dirs []string, open []*Block) (u
 	meta := compactBlockMetas(uid, metas...)
 	err = c.write(dest, meta, blocks...)
 	if err == nil {
-		level.Info(c.logger).Log(
-			"msg", "compact blocks",
-			"count", len(blocks),
-			"mint", meta.MinTime,
-			"maxt", meta.MaxTime,
-			"ulid", meta.ULID,
-			"sources", fmt.Sprintf("%v", uids),
-			"duration", time.Since(start),
-		)
+		if meta.Stats.NumSamples == 0 {
+			for _, b := range bs {
+				b.meta.Compaction.Deletable = true
+				if err = writeMetaFile(b.dir, &b.meta); err != nil {
+					level.Error(c.logger).Log(
+						"msg", "Failed to write 'Deletable' to meta file after compaction",
+						"ulid", b.meta.ULID,
+					)
+				}
+			}
+			uid = ulid.ULID{}
+			level.Info(c.logger).Log(
+				"msg", "compact blocks resulted in empty block",
+				"count", len(blocks),
+				"sources", fmt.Sprintf("%v", uids),
+				"duration", time.Since(start),
+			)
+		} else {
+			level.Info(c.logger).Log(
+				"msg", "compact blocks",
+				"count", len(blocks),
+				"mint", meta.MinTime,
+				"maxt", meta.MaxTime,
+				"ulid", meta.ULID,
+				"sources", fmt.Sprintf("%v", uids),
+				"duration", time.Since(start),
+			)
+		}
 		return uid, nil
 	}
 
@@ -413,6 +436,10 @@ func (c *LeveledCompactor) Write(dest string, b BlockReader, mint, maxt int64, p
 		return uid, err
 	}
 
+	if meta.Stats.NumSamples == 0 {
+		return ulid.ULID{}, nil
+	}
+
 	level.Info(c.logger).Log("msg", "write block", "mint", meta.MinTime, "maxt", meta.MaxTime, "ulid", meta.ULID)
 	return uid, nil
 }
@@ -490,11 +517,6 @@ func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blocks ...BlockRe
 	if err := c.populateBlock(blocks, meta, indexw, chunkw); err != nil {
 		return errors.Wrap(err, "write compaction")
 	}
-
-	if err = writeMetaFile(tmp, meta); err != nil {
-		return errors.Wrap(err, "write merged meta")
-	}
-
 	// We are explicitly closing them here to check for error even
 	// though these are covered under defer. This is because in Windows,
 	// you cannot delete these unless they are closed and the defer is to
@@ -506,6 +528,18 @@ func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blocks ...BlockRe
 		return errors.Wrap(err, "close index writer")
 	}
 
+	// Populated block is empty, so cleanup and exit.
+	if meta.Stats.NumSamples == 0 {
+		if err := os.RemoveAll(tmp); err != nil {
+			return errors.Wrap(err, "remove tmp folder after empty block failed")
+		}
+		return nil
+	}
+
+	if err = writeMetaFile(tmp, meta); err != nil {
+		return errors.Wrap(err, "write merged meta")
+	}
+
 	// Create an empty tombstones file.
 	if err := writeTombstoneFile(tmp, newMemTombstones()); err != nil {
 		return errors.Wrap(err, "write new tombstones file")
diff --git a/db.go b/db.go
index 6307004393..1349dfbd51 100644
--- a/db.go
+++ b/db.go
@@ -417,7 +417,8 @@ func (db *DB) compact() (err error) {
 			// from the block interval here.
 			maxt: maxt - 1,
 		}
-		if _, err = db.compactor.Write(db.dir, head, mint, maxt, nil); err != nil {
+		uid, err := db.compactor.Write(db.dir, head, mint, maxt, nil)
+		if err != nil {
 			return errors.Wrap(err, "persist head block")
 		}
 
@@ -426,6 +427,14 @@ func (db *DB) compact() (err error) {
 		if err := db.reload(); err != nil {
 			return errors.Wrap(err, "reload blocks")
 		}
+		if (uid == ulid.ULID{}) {
+			// Compaction resulted in an empty block.
+			// Head truncating during db.reload() depends on the persisted blocks and
+			// in this case no new block will be persisted so manually truncate the head.
+			if err = db.head.Truncate(maxt); err != nil {
+				return errors.Wrap(err, "head truncate failed (in compact)")
+			}
+		}
 		runtime.GC()
 	}
 
@@ -588,6 +597,12 @@ func (db *DB) deletableBlocks(blocks []*Block) map[ulid.ULID]*Block {
 		return blocks[i].Meta().MaxTime > blocks[j].Meta().MaxTime
 	})
 
+	for _, block := range blocks {
+		if block.Meta().Compaction.Deletable {
+			deletable[block.Meta().ULID] = block
+		}
+	}
+
 	for ulid, block := range db.beyondTimeRetention(blocks) {
 		deletable[ulid] = block
 	}
diff --git a/db_test.go b/db_test.go
index 2beef0c523..21de405e25 100644
--- a/db_test.go
+++ b/db_test.go
@@ -836,7 +836,7 @@ func TestTombstoneCleanFail(t *testing.T) {
 	// totalBlocks should be >=2 so we have enough blocks to trigger compaction failure.
 	totalBlocks := 2
 	for i := 0; i < totalBlocks; i++ {
-		blockDir := createBlock(t, db.Dir(), 0, 0, 0)
+		blockDir := createBlock(t, db.Dir(), 1, 0, 0)
 		block, err := OpenBlock(nil, blockDir, nil)
 		testutil.Ok(t, err)
 		// Add some some fake tombstones to trigger the compaction.
@@ -880,7 +880,7 @@ func (c *mockCompactorFailing) Write(dest string, b BlockReader, mint, maxt int6
 		return ulid.ULID{}, fmt.Errorf("the compactor already did the maximum allowed blocks so it is time to fail")
 	}
 
-	block, err := OpenBlock(nil, createBlock(c.t, dest, 0, 0, 0), nil)
+	block, err := OpenBlock(nil, createBlock(c.t, dest, 1, 0, 0), nil)
 	testutil.Ok(c.t, err)
 	testutil.Ok(c.t, block.Close()) // Close block as we won't be using anywhere.
 	c.blocks = append(c.blocks, block)
@@ -1364,6 +1364,109 @@ func TestInitializeHeadTimestamp(t *testing.T) {
 	})
 }
 
+func TestNoEmptyBlocks(t *testing.T) {
+	db, close := openTestDB(t, &Options{
+		BlockRanges: []int64{100},
+	})
+	defer close()
+	defer db.Close()
+	db.DisableCompactions()
+
+	rangeToTriggercompaction := db.opts.BlockRanges[0]/2*3 - 1
+	defaultLabel := labels.FromStrings("foo", "bar")
+	defaultMatcher := labels.NewMustRegexpMatcher("", ".*")
+
+	t.Run("Test no blocks after compact with empty head.", func(t *testing.T) {
+		testutil.Ok(t, db.compact())
+		actBlocks, err := blockDirs(db.Dir())
+		testutil.Ok(t, err)
+		testutil.Equals(t, len(db.Blocks()), len(actBlocks))
+		testutil.Equals(t, 0, len(actBlocks))
+		testutil.Equals(t, 0, int(prom_testutil.ToFloat64(db.compactor.(*LeveledCompactor).metrics.ran)), "no compaction should be triggered here")
+	})
+
+	t.Run("Test no blocks after deleting all samples from head.", func(t *testing.T) {
+		app := db.Appender()
+		_, err := app.Add(defaultLabel, 1, 0)
+		testutil.Ok(t, err)
+		_, err = app.Add(defaultLabel, 2, 0)
+		testutil.Ok(t, err)
+		_, err = app.Add(defaultLabel, 3+rangeToTriggercompaction, 0)
+		testutil.Ok(t, err)
+		testutil.Ok(t, app.Commit())
+		testutil.Ok(t, db.Delete(math.MinInt64, math.MaxInt64, defaultMatcher))
+		testutil.Ok(t, db.compact())
+		testutil.Equals(t, 1, int(prom_testutil.ToFloat64(db.compactor.(*LeveledCompactor).metrics.ran)), "compaction should have been triggered here")
+
+		actBlocks, err := blockDirs(db.Dir())
+		testutil.Ok(t, err)
+		testutil.Equals(t, len(db.Blocks()), len(actBlocks))
+		testutil.Equals(t, 0, len(actBlocks))
+
+		app = db.Appender()
+		_, err = app.Add(defaultLabel, 1, 0)
+		testutil.Assert(t, err == ErrOutOfBounds, "the head should be truncated so no samples in the past should be allowed")
+
+		// Adding new blocks.
+		currentTime := db.Head().MaxTime()
+		_, err = app.Add(defaultLabel, currentTime, 0)
+		testutil.Ok(t, err)
+		_, err = app.Add(defaultLabel, currentTime+1, 0)
+		testutil.Ok(t, err)
+		_, err = app.Add(defaultLabel, currentTime+rangeToTriggercompaction, 0)
+		testutil.Ok(t, err)
+		testutil.Ok(t, app.Commit())
+
+		testutil.Ok(t, db.compact())
+		testutil.Equals(t, 2, int(prom_testutil.ToFloat64(db.compactor.(*LeveledCompactor).metrics.ran)), "compaction should have been triggered here")
+		actBlocks, err = blockDirs(db.Dir())
+		testutil.Ok(t, err)
+		testutil.Equals(t, len(db.Blocks()), len(actBlocks))
+		testutil.Assert(t, len(actBlocks) == 1, "No blocks created when compacting with >0 samples")
+	})
+
+	t.Run(`When no new block is created from head, and there are some blocks on disk 
+	compaction should not run into infinite loop (was seen during development).`, func(t *testing.T) {
+		oldBlocks := db.Blocks()
+		app := db.Appender()
+		currentTime := db.Head().MaxTime()
+		_, err := app.Add(defaultLabel, currentTime, 0)
+		testutil.Ok(t, err)
+		_, err = app.Add(defaultLabel, currentTime+1, 0)
+		testutil.Ok(t, err)
+		_, err = app.Add(defaultLabel, currentTime+rangeToTriggercompaction, 0)
+		testutil.Ok(t, err)
+		testutil.Ok(t, app.Commit())
+		testutil.Ok(t, db.head.Delete(math.MinInt64, math.MaxInt64, defaultMatcher))
+		testutil.Ok(t, db.compact())
+		testutil.Equals(t, 3, int(prom_testutil.ToFloat64(db.compactor.(*LeveledCompactor).metrics.ran)), "compaction should have been triggered here")
+		testutil.Equals(t, oldBlocks, db.Blocks())
+	})
+
+	t.Run("Test no blocks remaining after deleting all samples from disk.", func(t *testing.T) {
+		currentTime := db.Head().MaxTime()
+		blocks := []*BlockMeta{
+			{MinTime: currentTime, MaxTime: currentTime + db.opts.BlockRanges[0]},
+			{MinTime: currentTime + 100, MaxTime: currentTime + 100 + db.opts.BlockRanges[0]},
+		}
+		for _, m := range blocks {
+			createBlock(t, db.Dir(), 2, m.MinTime, m.MaxTime)
+		}
+
+		oldBlocks := db.Blocks()
+		testutil.Ok(t, db.reload())                                      // Reload the db to register the new blocks.
+		testutil.Equals(t, len(blocks)+len(oldBlocks), len(db.Blocks())) // Ensure all blocks are registered.
+		testutil.Ok(t, db.Delete(math.MinInt64, math.MaxInt64, defaultMatcher))
+		testutil.Ok(t, db.compact())
+		testutil.Equals(t, 5, int(prom_testutil.ToFloat64(db.compactor.(*LeveledCompactor).metrics.ran)), "compaction should have been triggered here once for each block that have tombstones")
+
+		actBlocks, err := blockDirs(db.Dir())
+		testutil.Ok(t, err)
+		testutil.Equals(t, len(db.Blocks()), len(actBlocks))
+		testutil.Equals(t, 1, len(actBlocks), "All samples are deleted. Only the most recent block should remain after compaction.")
+	})
+}
+
 func TestDB_LabelNames(t *testing.T) {
 	tests := []struct {
 		// Add 'sampleLabels1' -> Test Head -> Compact -> Test Disk ->
@@ -1468,12 +1571,13 @@ func TestCorrectNumTombstones(t *testing.T) {
 	defer db.Close()
 
 	blockRange := DefaultOptions.BlockRanges[0]
-	label := labels.FromStrings("foo", "bar")
+	defaultLabel := labels.FromStrings("foo", "bar")
+	defaultMatcher := labels.NewEqualMatcher(defaultLabel[0].Name, defaultLabel[0].Value)
 
 	app := db.Appender()
 	for i := int64(0); i < 3; i++ {
 		for j := int64(0); j < 15; j++ {
-			_, err := app.Add(label, i*blockRange+j, 0)
+			_, err := app.Add(defaultLabel, i*blockRange+j, 0)
 			testutil.Ok(t, err)
 		}
 	}
@@ -1483,17 +1587,17 @@ func TestCorrectNumTombstones(t *testing.T) {
 	testutil.Ok(t, err)
 	testutil.Equals(t, 1, len(db.blocks))
 
-	testutil.Ok(t, db.Delete(0, 1, labels.NewEqualMatcher("foo", "bar")))
+	testutil.Ok(t, db.Delete(0, 1, defaultMatcher))
 	testutil.Equals(t, uint64(1), db.blocks[0].meta.Stats.NumTombstones)
 
 	// {0, 1} and {2, 3} are merged to form 1 tombstone.
-	testutil.Ok(t, db.Delete(2, 3, labels.NewEqualMatcher("foo", "bar")))
+	testutil.Ok(t, db.Delete(2, 3, defaultMatcher))
 	testutil.Equals(t, uint64(1), db.blocks[0].meta.Stats.NumTombstones)
 
-	testutil.Ok(t, db.Delete(5, 6, labels.NewEqualMatcher("foo", "bar")))
+	testutil.Ok(t, db.Delete(5, 6, defaultMatcher))
 	testutil.Equals(t, uint64(2), db.blocks[0].meta.Stats.NumTombstones)
 
-	testutil.Ok(t, db.Delete(9, 11, labels.NewEqualMatcher("foo", "bar")))
+	testutil.Ok(t, db.Delete(9, 11, defaultMatcher))
 	testutil.Equals(t, uint64(3), db.blocks[0].meta.Stats.NumTombstones)
 }
 

From 10ba228e6baa4811818e04b1ab9b48110bb43d7b Mon Sep 17 00:00:00 2001
From: Krasi Georgiev <krasi-georgiev@users.noreply.github.com>
Date: Fri, 18 Jan 2019 11:42:59 +0300
Subject: [PATCH 07/23] release 0.4.0 (#501)

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1238644bd6..1847345379 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,6 @@
 ## master / unreleased
+
+## 0.4.0
  - [CHANGE] New `WALSegmentSize` option to override the `DefaultOptions.WALSegmentSize`. Added to allow using smaller wal files. For example using tmpfs on a RPI to minimise the SD card wear out from the constant WAL writes. As part of this change the `DefaultOptions.WALSegmentSize` constant was also exposed.
  - [CHANGE] Empty blocks are not written during compaction [#374](https://github.com/prometheus/tsdb/pull/374)
  - [FEATURE]  Size base retention through `Options.MaxBytes`.  As part of this change:

From cc277e398b479bdb601e17e439bad11901824aa3 Mon Sep 17 00:00:00 2001
From: Krasi Georgiev <krasi-georgiev@users.noreply.github.com>
Date: Mon, 21 Jan 2019 14:56:26 +0300
Subject: [PATCH 08/23] fix the refs logic for the addFast path for createBlock
 (#504)

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
---
 block_test.go | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/block_test.go b/block_test.go
index 724ab37817..1a04091944 100644
--- a/block_test.go
+++ b/block_test.go
@@ -68,17 +68,20 @@ func createBlock(tb testing.TB, dir string, nSeries int, mint, maxt int64) strin
 
 	lbls, err := labels.ReadLabels(filepath.Join("testdata", "20kseries.json"), nSeries)
 	testutil.Ok(tb, err)
-	var ref uint64
+	refs := make([]uint64, nSeries)
 
 	for ts := mint; ts <= maxt; ts++ {
 		app := head.Appender()
-		for _, lbl := range lbls {
-			err := app.AddFast(ref, ts, rand.Float64())
-			if err == nil {
-				continue
+		for i, lbl := range lbls {
+			if refs[i] != 0 {
+				err := app.AddFast(refs[i], ts, rand.Float64())
+				if err == nil {
+					continue
+				}
 			}
-			ref, err = app.Add(lbl, int64(ts), rand.Float64())
+			ref, err := app.Add(lbl, int64(ts), rand.Float64())
 			testutil.Ok(tb, err)
+			refs[i] = ref
 		}
 		err := app.Commit()
 		testutil.Ok(tb, err)

From 5db162568b3c41b922791b532df09b0491b73c9b Mon Sep 17 00:00:00 2001
From: Brian Brazil <brian.brazil@robustperception.io>
Date: Wed, 23 Jan 2019 13:46:58 +0000
Subject: [PATCH 09/23] Remove _total from prometheus_tsdb_storage_blocks_bytes
 (#506)

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
---
 CHANGELOG.md | 3 +++
 db.go        | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1847345379..d89c8a2f50 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
 ## master / unreleased
 
+ - [CHANGE] `prometheus_tsdb_storage_blocks_bytes_total` is now `prometheus_tsdb_storage_blocks_bytes`
+
 ## 0.4.0
  - [CHANGE] New `WALSegmentSize` option to override the `DefaultOptions.WALSegmentSize`. Added to allow using smaller wal files. For example using tmpfs on a RPI to minimise the SD card wear out from the constant WAL writes. As part of this change the `DefaultOptions.WALSegmentSize` constant was also exposed.
  - [CHANGE] Empty blocks are not written during compaction [#374](https://github.com/prometheus/tsdb/pull/374)
@@ -10,6 +12,7 @@
  - [REMOVED] `PrefixMatcher` is considered unused so was removed.
  - [CLEANUP] `Options.WALFlushInterval` is removed as it wasn't used anywhere.
  - [FEATURE] Add new `LiveReader` to WAL pacakge. Added to allow live tailing of a WAL segment, used by Prometheus Remote Write after refactor. The main difference between the new reader and the existing `Reader` is that for `LiveReader` a call to `Next()` that returns false does not mean that there will never be more data to read.
+ - [CHANGE] Empty blocks are not written during compaction [#374](https://github.com/prometheus/tsdb/pull/374)
 
 ## 0.3.1
  - [BUGFIX] Fixed most windows test and some actual bugs for unclosed file readers.
diff --git a/db.go b/db.go
index 1349dfbd51..bd3388b203 100644
--- a/db.go
+++ b/db.go
@@ -202,7 +202,7 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
 		Help: "The time taken to recompact blocks to remove tombstones.",
 	})
 	m.blocksBytes = prometheus.NewGauge(prometheus.GaugeOpts{
-		Name: "prometheus_tsdb_storage_blocks_bytes_total",
+		Name: "prometheus_tsdb_storage_blocks_bytes",
 		Help: "The number of bytes that are currently used for local storage by all blocks.",
 	})
 	m.sizeRetentionCount = prometheus.NewCounter(prometheus.CounterOpts{

From a44d15798e914b4a0d9b98bfc9d1a18807c873da Mon Sep 17 00:00:00 2001
From: Krasi Georgiev <krasi-georgiev@users.noreply.github.com>
Date: Wed, 23 Jan 2019 17:10:19 +0300
Subject: [PATCH 10/23] remove a changelog double entry (#507)

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
---
 CHANGELOG.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d89c8a2f50..ecf9e38a10 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,7 +12,6 @@
  - [REMOVED] `PrefixMatcher` is considered unused so was removed.
  - [CLEANUP] `Options.WALFlushInterval` is removed as it wasn't used anywhere.
  - [FEATURE] Add new `LiveReader` to WAL pacakge. Added to allow live tailing of a WAL segment, used by Prometheus Remote Write after refactor. The main difference between the new reader and the existing `Reader` is that for `LiveReader` a call to `Next()` that returns false does not mean that there will never be more data to read.
- - [CHANGE] Empty blocks are not written during compaction [#374](https://github.com/prometheus/tsdb/pull/374)
 
 ## 0.3.1
  - [BUGFIX] Fixed most windows test and some actual bugs for unclosed file readers.

From 051a7ae1a7f1ad12fac71028af42ad81710ede4a Mon Sep 17 00:00:00 2001
From: Alec <867245430@qq.com>
Date: Mon, 28 Jan 2019 18:33:44 +0800
Subject: [PATCH 11/23] Missing the length of the encoding byte when calling
 b.Range

Signed-off-by: naivewong <867245430@qq.com>
---
 chunks/chunks.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chunks/chunks.go b/chunks/chunks.go
index 8fb288384e..f35ad2cdd9 100644
--- a/chunks/chunks.go
+++ b/chunks/chunks.go
@@ -373,7 +373,7 @@ func (s *Reader) Chunk(ref uint64) (chunkenc.Chunk, error) {
 	if n <= 0 {
 		return nil, errors.Errorf("reading chunk length failed with %d", n)
 	}
-	r = b.Range(off+n, off+n+int(l))
+	r = b.Range(off+n, off+n+1+int(l))
 
 	return s.pool.Get(chunkenc.Encoding(r[0]), r[1:1+l])
 }

From dac2b97dfdf354a874f186f365fa75cd647e755e Mon Sep 17 00:00:00 2001
From: Krasi Georgiev <krasi-georgiev@users.noreply.github.com>
Date: Mon, 28 Jan 2019 14:24:49 +0300
Subject: [PATCH 12/23] make createBlock more generic so it can be used in
 other tests. (#489)

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
---
 block_test.go   |  84 +++++++++++----
 db_test.go      |  27 ++---
 head_test.go    |  13 +--
 querier_test.go | 264 ++++++++++++++++++++++++------------------------
 4 files changed, 218 insertions(+), 170 deletions(-)

diff --git a/block_test.go b/block_test.go
index 1a04091944..68cad4b88d 100644
--- a/block_test.go
+++ b/block_test.go
@@ -21,8 +21,8 @@ import (
 	"testing"
 
 	"github.com/go-kit/kit/log"
-	"github.com/prometheus/tsdb/labels"
 	"github.com/prometheus/tsdb/testutil"
+	"github.com/prometheus/tsdb/tsdbutil"
 )
 
 // In Prometheus 2.1.0 we had a bug where the meta.json version was falsely bumped
@@ -45,7 +45,7 @@ func TestSetCompactionFailed(t *testing.T) {
 	testutil.Ok(t, err)
 	defer os.RemoveAll(tmpdir)
 
-	blockDir := createBlock(t, tmpdir, 1, 0, 0)
+	blockDir := createBlock(t, tmpdir, genSeries(1, 1, 0, 0))
 	b, err := OpenBlock(nil, blockDir, nil)
 	testutil.Ok(t, err)
 	testutil.Equals(t, false, b.meta.Compaction.Failed)
@@ -59,33 +59,31 @@ func TestSetCompactionFailed(t *testing.T) {
 	testutil.Ok(t, b.Close())
 }
 
-// createBlock creates a block with nSeries series, filled with
-// samples of the given mint,maxt time range and returns its dir.
-func createBlock(tb testing.TB, dir string, nSeries int, mint, maxt int64) string {
+// createBlock creates a block with given set of series and returns its dir.
+func createBlock(tb testing.TB, dir string, series []Series) string {
 	head, err := NewHead(nil, nil, nil, 2*60*60*1000)
 	testutil.Ok(tb, err)
 	defer head.Close()
 
-	lbls, err := labels.ReadLabels(filepath.Join("testdata", "20kseries.json"), nSeries)
-	testutil.Ok(tb, err)
-	refs := make([]uint64, nSeries)
-
-	for ts := mint; ts <= maxt; ts++ {
-		app := head.Appender()
-		for i, lbl := range lbls {
-			if refs[i] != 0 {
-				err := app.AddFast(refs[i], ts, rand.Float64())
+	app := head.Appender()
+	for _, s := range series {
+		ref := uint64(0)
+		it := s.Iterator()
+		for it.Next() {
+			t, v := it.At()
+			if ref != 0 {
+				err := app.AddFast(ref, t, v)
 				if err == nil {
 					continue
 				}
 			}
-			ref, err := app.Add(lbl, int64(ts), rand.Float64())
+			ref, err = app.Add(s.Labels(), t, v)
 			testutil.Ok(tb, err)
-			refs[i] = ref
 		}
-		err := app.Commit()
-		testutil.Ok(tb, err)
+		testutil.Ok(tb, it.Err())
 	}
+	err = app.Commit()
+	testutil.Ok(tb, err)
 
 	compactor, err := NewLeveledCompactor(nil, log.NewNopLogger(), []int64{1000000}, nil)
 	testutil.Ok(tb, err)
@@ -96,3 +94,53 @@ func createBlock(tb testing.TB, dir string, nSeries int, mint, maxt int64) strin
 	testutil.Ok(tb, err)
 	return filepath.Join(dir, ulid.String())
 }
+
+// genSeries generates series with a given number of labels and values.
+func genSeries(totalSeries, labelCount int, mint, maxt int64) []Series {
+	if totalSeries == 0 || labelCount == 0 {
+		return nil
+	}
+	series := make([]Series, totalSeries)
+
+	for i := 0; i < totalSeries; i++ {
+		lbls := make(map[string]string, labelCount)
+		for len(lbls) < labelCount {
+			lbls[randString()] = randString()
+		}
+		samples := make([]tsdbutil.Sample, 0, maxt-mint+1)
+		for t := mint; t <= maxt; t++ {
+			samples = append(samples, sample{t: t, v: rand.Float64()})
+		}
+		series[i] = newSeries(lbls, samples)
+	}
+
+	return series
+}
+
+const letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+const (
+	letterIdxBits = 6                    // 6 bits to represent a letter index
+	letterIdxMask = 1<<letterIdxBits - 1 // All 1-bits, as many as letterIdxBits
+	letterIdxMax  = 63 / letterIdxBits   // # of letter indices fitting in 63 bits
+)
+
+// randString generates random string.
+func randString() string {
+	maxLength := int32(50)
+	length := rand.Int31n(maxLength)
+	b := make([]byte, length+1)
+	// A rand.Int63() generates 63 random bits, enough for letterIdxMax characters!
+	for i, cache, remain := length, rand.Int63(), letterIdxMax; i >= 0; {
+		if remain == 0 {
+			cache, remain = rand.Int63(), letterIdxMax
+		}
+		if idx := int(cache & letterIdxMask); idx < len(letterBytes) {
+			b[i] = letterBytes[idx]
+			i--
+		}
+		cache >>= letterIdxBits
+		remain--
+	}
+
+	return string(b)
+}
diff --git a/db_test.go b/db_test.go
index 21de405e25..e3e90ce8b3 100644
--- a/db_test.go
+++ b/db_test.go
@@ -34,6 +34,7 @@ import (
 	"github.com/prometheus/tsdb/index"
 	"github.com/prometheus/tsdb/labels"
 	"github.com/prometheus/tsdb/testutil"
+	"github.com/prometheus/tsdb/tsdbutil"
 	"github.com/prometheus/tsdb/wal"
 )
 
@@ -87,7 +88,7 @@ func TestDB_reloadOrder(t *testing.T) {
 		{MinTime: 100, MaxTime: 110},
 	}
 	for _, m := range metas {
-		createBlock(t, db.Dir(), 1, m.MinTime, m.MaxTime)
+		createBlock(t, db.Dir(), genSeries(1, 1, m.MinTime, m.MaxTime))
 	}
 
 	testutil.Ok(t, db.reload())
@@ -250,7 +251,7 @@ Outer:
 		res, err := q.Select(labels.NewEqualMatcher("a", "b"))
 		testutil.Ok(t, err)
 
-		expSamples := make([]Sample, 0, len(c.remaint))
+		expSamples := make([]tsdbutil.Sample, 0, len(c.remaint))
 		for _, ts := range c.remaint {
 			expSamples = append(expSamples, sample{ts, smpls[ts]})
 		}
@@ -477,7 +478,7 @@ Outer:
 		res, err := q.Select(labels.NewEqualMatcher("a", "b"))
 		testutil.Ok(t, err)
 
-		expSamples := make([]Sample, 0, len(c.remaint))
+		expSamples := make([]tsdbutil.Sample, 0, len(c.remaint))
 		for _, ts := range c.remaint {
 			expSamples = append(expSamples, sample{ts, smpls[ts]})
 		}
@@ -782,7 +783,7 @@ func TestTombstoneClean(t *testing.T) {
 		res, err := q.Select(labels.NewEqualMatcher("a", "b"))
 		testutil.Ok(t, err)
 
-		expSamples := make([]Sample, 0, len(c.remaint))
+		expSamples := make([]tsdbutil.Sample, 0, len(c.remaint))
 		for _, ts := range c.remaint {
 			expSamples = append(expSamples, sample{ts, smpls[ts]})
 		}
@@ -836,7 +837,7 @@ func TestTombstoneCleanFail(t *testing.T) {
 	// totalBlocks should be >=2 so we have enough blocks to trigger compaction failure.
 	totalBlocks := 2
 	for i := 0; i < totalBlocks; i++ {
-		blockDir := createBlock(t, db.Dir(), 1, 0, 0)
+		blockDir := createBlock(t, db.Dir(), genSeries(1, 1, 0, 0))
 		block, err := OpenBlock(nil, blockDir, nil)
 		testutil.Ok(t, err)
 		// Add some some fake tombstones to trigger the compaction.
@@ -880,7 +881,7 @@ func (c *mockCompactorFailing) Write(dest string, b BlockReader, mint, maxt int6
 		return ulid.ULID{}, fmt.Errorf("the compactor already did the maximum allowed blocks so it is time to fail")
 	}
 
-	block, err := OpenBlock(nil, createBlock(c.t, dest, 1, 0, 0), nil)
+	block, err := OpenBlock(nil, createBlock(c.t, dest, genSeries(1, 1, 0, 0)), nil)
 	testutil.Ok(c.t, err)
 	testutil.Ok(c.t, block.Close()) // Close block as we won't be using anywhere.
 	c.blocks = append(c.blocks, block)
@@ -918,7 +919,7 @@ func TestTimeRetention(t *testing.T) {
 	}
 
 	for _, m := range blocks {
-		createBlock(t, db.Dir(), 10, m.MinTime, m.MaxTime)
+		createBlock(t, db.Dir(), genSeries(10, 10, m.MinTime, m.MaxTime))
 	}
 
 	testutil.Ok(t, db.reload())                       // Reload the db to register the new blocks.
@@ -952,7 +953,7 @@ func TestSizeRetention(t *testing.T) {
 	}
 
 	for _, m := range blocks {
-		createBlock(t, db.Dir(), 100, m.MinTime, m.MaxTime)
+		createBlock(t, db.Dir(), genSeries(100, 10, m.MinTime, m.MaxTime))
 	}
 
 	// Test that registered size matches the actual disk size.
@@ -1319,7 +1320,7 @@ func TestInitializeHeadTimestamp(t *testing.T) {
 		testutil.Ok(t, err)
 		defer os.RemoveAll(dir)
 
-		createBlock(t, dir, 1, 1000, 2000)
+		createBlock(t, dir, genSeries(1, 1, 1000, 2000))
 
 		db, err := Open(dir, nil, nil, nil)
 		testutil.Ok(t, err)
@@ -1332,7 +1333,7 @@ func TestInitializeHeadTimestamp(t *testing.T) {
 		testutil.Ok(t, err)
 		defer os.RemoveAll(dir)
 
-		createBlock(t, dir, 1, 1000, 6000)
+		createBlock(t, dir, genSeries(1, 1, 1000, 6000))
 
 		testutil.Ok(t, os.MkdirAll(path.Join(dir, "wal"), 0777))
 		w, err := wal.New(nil, nil, path.Join(dir, "wal"))
@@ -1450,7 +1451,7 @@ func TestNoEmptyBlocks(t *testing.T) {
 			{MinTime: currentTime + 100, MaxTime: currentTime + 100 + db.opts.BlockRanges[0]},
 		}
 		for _, m := range blocks {
-			createBlock(t, db.Dir(), 2, m.MinTime, m.MaxTime)
+			createBlock(t, db.Dir(), genSeries(2, 2, m.MinTime, m.MaxTime))
 		}
 
 		oldBlocks := db.Blocks()
@@ -1623,7 +1624,7 @@ func TestBlockRanges(t *testing.T) {
 	// Test that the compactor doesn't create overlapping blocks
 	// when a non standard block already exists.
 	firstBlockMaxT := int64(3)
-	createBlock(t, dir, 1, 0, firstBlockMaxT)
+	createBlock(t, dir, genSeries(1, 1, 0, firstBlockMaxT))
 	db, err := Open(dir, logger, nil, DefaultOptions)
 	if err != nil {
 		t.Fatalf("Opening test storage failed: %s", err)
@@ -1673,7 +1674,7 @@ func TestBlockRanges(t *testing.T) {
 	testutil.Ok(t, db.Close())
 
 	thirdBlockMaxt := secondBlockMaxt + 2
-	createBlock(t, dir, 1, secondBlockMaxt+1, thirdBlockMaxt)
+	createBlock(t, dir, genSeries(1, 1, secondBlockMaxt+1, thirdBlockMaxt))
 
 	db, err = Open(dir, logger, nil, DefaultOptions)
 	if err != nil {
diff --git a/head_test.go b/head_test.go
index 8781f677af..137818323d 100644
--- a/head_test.go
+++ b/head_test.go
@@ -28,6 +28,7 @@ import (
 	"github.com/prometheus/tsdb/index"
 	"github.com/prometheus/tsdb/labels"
 	"github.com/prometheus/tsdb/testutil"
+	"github.com/prometheus/tsdb/tsdbutil"
 	"github.com/prometheus/tsdb/wal"
 )
 
@@ -352,7 +353,7 @@ Outer:
 		res, err := q.Select(labels.NewEqualMatcher("a", "b"))
 		testutil.Ok(t, err)
 
-		expSamples := make([]Sample, 0, len(c.remaint))
+		expSamples := make([]tsdbutil.Sample, 0, len(c.remaint))
 		for _, ts := range c.remaint {
 			expSamples = append(expSamples, sample{ts, smpls[ts]})
 		}
@@ -472,9 +473,9 @@ func TestDelete_e2e(t *testing.T) {
 			{"job", "prom-k8s"},
 		},
 	}
-	seriesMap := map[string][]Sample{}
+	seriesMap := map[string][]tsdbutil.Sample{}
 	for _, l := range lbls {
-		seriesMap[labels.New(l...).String()] = []Sample{}
+		seriesMap[labels.New(l...).String()] = []tsdbutil.Sample{}
 	}
 	dir, _ := ioutil.TempDir("", "test")
 	defer os.RemoveAll(dir)
@@ -484,7 +485,7 @@ func TestDelete_e2e(t *testing.T) {
 	app := hb.Appender()
 	for _, l := range lbls {
 		ls := labels.New(l...)
-		series := []Sample{}
+		series := []tsdbutil.Sample{}
 		ts := rand.Int63n(300)
 		for i := 0; i < numDatapoints; i++ {
 			v := rand.Float64()
@@ -604,8 +605,8 @@ func boundedSamples(full []sample, mint, maxt int64) []sample {
 	return full
 }
 
-func deletedSamples(full []Sample, dranges Intervals) []Sample {
-	ds := make([]Sample, 0, len(full))
+func deletedSamples(full []tsdbutil.Sample, dranges Intervals) []tsdbutil.Sample {
+	ds := make([]tsdbutil.Sample, 0, len(full))
 Outer:
 	for _, s := range full {
 		for _, r := range dranges {
diff --git a/querier_test.go b/querier_test.go
index 69ca84602b..63bfda00d7 100644
--- a/querier_test.go
+++ b/querier_test.go
@@ -68,58 +68,6 @@ func (m *mockSeriesIterator) At() (int64, float64) { return m.at() }
 func (m *mockSeriesIterator) Next() bool           { return m.next() }
 func (m *mockSeriesIterator) Err() error           { return m.err() }
 
-type mockSeries struct {
-	labels   func() labels.Labels
-	iterator func() SeriesIterator
-}
-
-type Sample = tsdbutil.Sample
-
-func newSeries(l map[string]string, s []Sample) Series {
-	return &mockSeries{
-		labels:   func() labels.Labels { return labels.FromMap(l) },
-		iterator: func() SeriesIterator { return newListSeriesIterator(s) },
-	}
-}
-func (m *mockSeries) Labels() labels.Labels    { return m.labels() }
-func (m *mockSeries) Iterator() SeriesIterator { return m.iterator() }
-
-type listSeriesIterator struct {
-	list []Sample
-	idx  int
-}
-
-func newListSeriesIterator(list []Sample) *listSeriesIterator {
-	return &listSeriesIterator{list: list, idx: -1}
-}
-
-func (it *listSeriesIterator) At() (int64, float64) {
-	s := it.list[it.idx]
-	return s.T(), s.V()
-}
-
-func (it *listSeriesIterator) Next() bool {
-	it.idx++
-	return it.idx < len(it.list)
-}
-
-func (it *listSeriesIterator) Seek(t int64) bool {
-	if it.idx == -1 {
-		it.idx = 0
-	}
-	// Do binary search between current position and end.
-	it.idx = sort.Search(len(it.list)-it.idx, func(i int) bool {
-		s := it.list[i+it.idx]
-		return s.T() >= t
-	})
-
-	return it.idx < len(it.list)
-}
-
-func (it *listSeriesIterator) Err() error {
-	return nil
-}
-
 func TestMergedSeriesSet(t *testing.T) {
 
 	cases := []struct {
@@ -134,32 +82,32 @@ func TestMergedSeriesSet(t *testing.T) {
 			a: newMockSeriesSet([]Series{
 				newSeries(map[string]string{
 					"a": "a",
-				}, []Sample{
+				}, []tsdbutil.Sample{
 					sample{t: 1, v: 1},
 				}),
 			}),
 			b: newMockSeriesSet([]Series{
 				newSeries(map[string]string{
 					"a": "a",
-				}, []Sample{
+				}, []tsdbutil.Sample{
 					sample{t: 2, v: 2},
 				}),
 				newSeries(map[string]string{
 					"b": "b",
-				}, []Sample{
+				}, []tsdbutil.Sample{
 					sample{t: 1, v: 1},
 				}),
 			}),
 			exp: newMockSeriesSet([]Series{
 				newSeries(map[string]string{
 					"a": "a",
-				}, []Sample{
+				}, []tsdbutil.Sample{
 					sample{t: 1, v: 1},
 					sample{t: 2, v: 2},
 				}),
 				newSeries(map[string]string{
 					"b": "b",
-				}, []Sample{
+				}, []tsdbutil.Sample{
 					sample{t: 1, v: 1},
 				}),
 			}),
@@ -169,13 +117,13 @@ func TestMergedSeriesSet(t *testing.T) {
 				newSeries(map[string]string{
 					"handler":  "prometheus",
 					"instance": "127.0.0.1:9090",
-				}, []Sample{
+				}, []tsdbutil.Sample{
 					sample{t: 1, v: 1},
 				}),
 				newSeries(map[string]string{
 					"handler":  "prometheus",
 					"instance": "localhost:9090",
-				}, []Sample{
+				}, []tsdbutil.Sample{
 					sample{t: 1, v: 2},
 				}),
 			}),
@@ -183,13 +131,13 @@ func TestMergedSeriesSet(t *testing.T) {
 				newSeries(map[string]string{
 					"handler":  "prometheus",
 					"instance": "127.0.0.1:9090",
-				}, []Sample{
+				}, []tsdbutil.Sample{
 					sample{t: 2, v: 1},
 				}),
 				newSeries(map[string]string{
 					"handler":  "query",
 					"instance": "localhost:9090",
-				}, []Sample{
+				}, []tsdbutil.Sample{
 					sample{t: 2, v: 2},
 				}),
 			}),
@@ -197,20 +145,20 @@ func TestMergedSeriesSet(t *testing.T) {
 				newSeries(map[string]string{
 					"handler":  "prometheus",
 					"instance": "127.0.0.1:9090",
-				}, []Sample{
+				}, []tsdbutil.Sample{
 					sample{t: 1, v: 1},
 					sample{t: 2, v: 1},
 				}),
 				newSeries(map[string]string{
 					"handler":  "prometheus",
 					"instance": "localhost:9090",
-				}, []Sample{
+				}, []tsdbutil.Sample{
 					sample{t: 1, v: 2},
 				}),
 				newSeries(map[string]string{
 					"handler":  "query",
 					"instance": "localhost:9090",
-				}, []Sample{
+				}, []tsdbutil.Sample{
 					sample{t: 2, v: 2},
 				}),
 			}),
@@ -316,7 +264,7 @@ func createIdxChkReaders(tc []seriesSamples) (IndexReader, ChunkReader) {
 }
 
 func TestBlockQuerier(t *testing.T) {
-	newSeries := func(l map[string]string, s []Sample) Series {
+	newSeries := func(l map[string]string, s []tsdbutil.Sample) Series {
 		return &mockSeries{
 			labels:   func() labels.Labels { return labels.FromMap(l) },
 			iterator: func() SeriesIterator { return newListSeriesIterator(s) },
@@ -404,13 +352,13 @@ func TestBlockQuerier(t *testing.T) {
 					newSeries(map[string]string{
 						"a": "a",
 					},
-						[]Sample{sample{2, 3}, sample{3, 4}, sample{5, 2}, sample{6, 3}},
+						[]tsdbutil.Sample{sample{2, 3}, sample{3, 4}, sample{5, 2}, sample{6, 3}},
 					),
 					newSeries(map[string]string{
 						"a": "a",
 						"b": "b",
 					},
-						[]Sample{sample{2, 2}, sample{3, 3}, sample{5, 3}, sample{6, 6}},
+						[]tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{5, 3}, sample{6, 6}},
 					),
 				}),
 			},
@@ -456,7 +404,7 @@ Outer:
 }
 
 func TestBlockQuerierDelete(t *testing.T) {
-	newSeries := func(l map[string]string, s []Sample) Series {
+	newSeries := func(l map[string]string, s []tsdbutil.Sample) Series {
 		return &mockSeries{
 			labels:   func() labels.Labels { return labels.FromMap(l) },
 			iterator: func() SeriesIterator { return newListSeriesIterator(s) },
@@ -531,13 +479,13 @@ func TestBlockQuerierDelete(t *testing.T) {
 					newSeries(map[string]string{
 						"a": "a",
 					},
-						[]Sample{sample{5, 2}, sample{6, 3}, sample{7, 4}},
+						[]tsdbutil.Sample{sample{5, 2}, sample{6, 3}, sample{7, 4}},
 					),
 					newSeries(map[string]string{
 						"a": "a",
 						"b": "b",
 					},
-						[]Sample{sample{4, 15}, sample{5, 3}},
+						[]tsdbutil.Sample{sample{4, 15}, sample{5, 3}},
 					),
 				}),
 			},
@@ -550,12 +498,12 @@ func TestBlockQuerierDelete(t *testing.T) {
 						"a": "a",
 						"b": "b",
 					},
-						[]Sample{sample{4, 15}, sample{5, 3}},
+						[]tsdbutil.Sample{sample{4, 15}, sample{5, 3}},
 					),
 					newSeries(map[string]string{
 						"b": "b",
 					},
-						[]Sample{sample{2, 2}, sample{3, 6}, sample{5, 1}},
+						[]tsdbutil.Sample{sample{2, 2}, sample{3, 6}, sample{5, 1}},
 					),
 				}),
 			},
@@ -568,7 +516,7 @@ func TestBlockQuerierDelete(t *testing.T) {
 						"a": "a",
 						"b": "b",
 					},
-						[]Sample{sample{4, 15}},
+						[]tsdbutil.Sample{sample{4, 15}},
 					),
 				}),
 			},
@@ -727,66 +675,66 @@ func (s itSeries) Labels() labels.Labels    { return labels.Labels{} }
 
 func TestSeriesIterator(t *testing.T) {
 	itcases := []struct {
-		a, b, c []Sample
-		exp     []Sample
+		a, b, c []tsdbutil.Sample
+		exp     []tsdbutil.Sample
 
 		mint, maxt int64
 	}{
 		{
-			a: []Sample{},
-			b: []Sample{},
-			c: []Sample{},
+			a: []tsdbutil.Sample{},
+			b: []tsdbutil.Sample{},
+			c: []tsdbutil.Sample{},
 
-			exp: []Sample{},
+			exp: []tsdbutil.Sample{},
 
 			mint: math.MinInt64,
 			maxt: math.MaxInt64,
 		},
 		{
-			a: []Sample{
+			a: []tsdbutil.Sample{
 				sample{1, 2},
 				sample{2, 3},
 				sample{3, 5},
 				sample{6, 1},
 			},
-			b: []Sample{},
-			c: []Sample{
+			b: []tsdbutil.Sample{},
+			c: []tsdbutil.Sample{
 				sample{7, 89}, sample{9, 8},
 			},
 
-			exp: []Sample{
+			exp: []tsdbutil.Sample{
 				sample{1, 2}, sample{2, 3}, sample{3, 5}, sample{6, 1}, sample{7, 89}, sample{9, 8},
 			},
 			mint: math.MinInt64,
 			maxt: math.MaxInt64,
 		},
 		{
-			a: []Sample{},
-			b: []Sample{
+			a: []tsdbutil.Sample{},
+			b: []tsdbutil.Sample{
 				sample{1, 2}, sample{2, 3}, sample{3, 5}, sample{6, 1},
 			},
-			c: []Sample{
+			c: []tsdbutil.Sample{
 				sample{7, 89}, sample{9, 8},
 			},
 
-			exp: []Sample{
+			exp: []tsdbutil.Sample{
 				sample{1, 2}, sample{2, 3}, sample{3, 5}, sample{6, 1}, sample{7, 89}, sample{9, 8},
 			},
 			mint: 2,
 			maxt: 8,
 		},
 		{
-			a: []Sample{
+			a: []tsdbutil.Sample{
 				sample{1, 2}, sample{2, 3}, sample{3, 5}, sample{6, 1},
 			},
-			b: []Sample{
+			b: []tsdbutil.Sample{
 				sample{7, 89}, sample{9, 8},
 			},
-			c: []Sample{
+			c: []tsdbutil.Sample{
 				sample{10, 22}, sample{203, 3493},
 			},
 
-			exp: []Sample{
+			exp: []tsdbutil.Sample{
 				sample{1, 2}, sample{2, 3}, sample{3, 5}, sample{6, 1}, sample{7, 89}, sample{9, 8}, sample{10, 22}, sample{203, 3493},
 			},
 			mint: 6,
@@ -795,29 +743,29 @@ func TestSeriesIterator(t *testing.T) {
 	}
 
 	seekcases := []struct {
-		a, b, c []Sample
+		a, b, c []tsdbutil.Sample
 
 		seek    int64
 		success bool
-		exp     []Sample
+		exp     []tsdbutil.Sample
 
 		mint, maxt int64
 	}{
 		{
-			a: []Sample{},
-			b: []Sample{},
-			c: []Sample{},
+			a: []tsdbutil.Sample{},
+			b: []tsdbutil.Sample{},
+			c: []tsdbutil.Sample{},
 
 			seek:    0,
 			success: false,
 			exp:     nil,
 		},
 		{
-			a: []Sample{
+			a: []tsdbutil.Sample{
 				sample{2, 3},
 			},
-			b: []Sample{},
-			c: []Sample{
+			b: []tsdbutil.Sample{},
+			c: []tsdbutil.Sample{
 				sample{7, 89}, sample{9, 8},
 			},
 
@@ -828,55 +776,55 @@ func TestSeriesIterator(t *testing.T) {
 			maxt:    math.MaxInt64,
 		},
 		{
-			a: []Sample{},
-			b: []Sample{
+			a: []tsdbutil.Sample{},
+			b: []tsdbutil.Sample{
 				sample{1, 2}, sample{3, 5}, sample{6, 1},
 			},
-			c: []Sample{
+			c: []tsdbutil.Sample{
 				sample{7, 89}, sample{9, 8},
 			},
 
 			seek:    2,
 			success: true,
-			exp: []Sample{
+			exp: []tsdbutil.Sample{
 				sample{3, 5}, sample{6, 1}, sample{7, 89}, sample{9, 8},
 			},
 			mint: 5,
 			maxt: 8,
 		},
 		{
-			a: []Sample{
+			a: []tsdbutil.Sample{
 				sample{6, 1},
 			},
-			b: []Sample{
+			b: []tsdbutil.Sample{
 				sample{9, 8},
 			},
-			c: []Sample{
+			c: []tsdbutil.Sample{
 				sample{10, 22}, sample{203, 3493},
 			},
 
 			seek:    10,
 			success: true,
-			exp: []Sample{
+			exp: []tsdbutil.Sample{
 				sample{10, 22}, sample{203, 3493},
 			},
 			mint: 10,
 			maxt: 203,
 		},
 		{
-			a: []Sample{
+			a: []tsdbutil.Sample{
 				sample{6, 1},
 			},
-			b: []Sample{
+			b: []tsdbutil.Sample{
 				sample{9, 8},
 			},
-			c: []Sample{
+			c: []tsdbutil.Sample{
 				sample{10, 22}, sample{203, 3493},
 			},
 
 			seek:    203,
 			success: true,
-			exp: []Sample{
+			exp: []tsdbutil.Sample{
 				sample{203, 3493},
 			},
 			mint: 7,
@@ -893,10 +841,10 @@ func TestSeriesIterator(t *testing.T) {
 			}
 			res := newChunkSeriesIterator(chkMetas, nil, tc.mint, tc.maxt)
 
-			smplValid := make([]Sample, 0)
+			smplValid := make([]tsdbutil.Sample, 0)
 			for _, s := range tc.exp {
 				if s.T() >= tc.mint && s.T() <= tc.maxt {
-					smplValid = append(smplValid, Sample(s))
+					smplValid = append(smplValid, tsdbutil.Sample(s))
 				}
 			}
 			exp := newListSeriesIterator(smplValid)
@@ -910,22 +858,22 @@ func TestSeriesIterator(t *testing.T) {
 
 		t.Run("Seek", func(t *testing.T) {
 			extra := []struct {
-				a, b, c []Sample
+				a, b, c []tsdbutil.Sample
 
 				seek    int64
 				success bool
-				exp     []Sample
+				exp     []tsdbutil.Sample
 
 				mint, maxt int64
 			}{
 				{
-					a: []Sample{
+					a: []tsdbutil.Sample{
 						sample{6, 1},
 					},
-					b: []Sample{
+					b: []tsdbutil.Sample{
 						sample{9, 8},
 					},
-					c: []Sample{
+					c: []tsdbutil.Sample{
 						sample{10, 22}, sample{203, 3493},
 					},
 
@@ -936,19 +884,19 @@ func TestSeriesIterator(t *testing.T) {
 					maxt:    202,
 				},
 				{
-					a: []Sample{
+					a: []tsdbutil.Sample{
 						sample{6, 1},
 					},
-					b: []Sample{
+					b: []tsdbutil.Sample{
 						sample{9, 8},
 					},
-					c: []Sample{
+					c: []tsdbutil.Sample{
 						sample{10, 22}, sample{203, 3493},
 					},
 
 					seek:    5,
 					success: true,
-					exp:     []Sample{sample{10, 22}},
+					exp:     []tsdbutil.Sample{sample{10, 22}},
 					mint:    10,
 					maxt:    202,
 				},
@@ -964,10 +912,10 @@ func TestSeriesIterator(t *testing.T) {
 				}
 				res := newChunkSeriesIterator(chkMetas, nil, tc.mint, tc.maxt)
 
-				smplValid := make([]Sample, 0)
+				smplValid := make([]tsdbutil.Sample, 0)
 				for _, s := range tc.exp {
 					if s.T() >= tc.mint && s.T() <= tc.maxt {
-						smplValid = append(smplValid, Sample(s))
+						smplValid = append(smplValid, tsdbutil.Sample(s))
 					}
 				}
 				exp := newListSeriesIterator(smplValid)
@@ -1000,7 +948,7 @@ func TestSeriesIterator(t *testing.T) {
 				itSeries{newListSeriesIterator(tc.c)}
 
 			res := newChainedSeriesIterator(a, b, c)
-			exp := newListSeriesIterator([]Sample(tc.exp))
+			exp := newListSeriesIterator([]tsdbutil.Sample(tc.exp))
 
 			smplExp, errExp := expandSeriesIterator(exp)
 			smplRes, errRes := expandSeriesIterator(res)
@@ -1045,9 +993,9 @@ func TestSeriesIterator(t *testing.T) {
 // Regression for: https://github.com/prometheus/tsdb/pull/97
 func TestChunkSeriesIterator_DoubleSeek(t *testing.T) {
 	chkMetas := []chunks.Meta{
-		tsdbutil.ChunkFromSamples([]Sample{}),
-		tsdbutil.ChunkFromSamples([]Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}}),
-		tsdbutil.ChunkFromSamples([]Sample{sample{4, 4}, sample{5, 5}}),
+		tsdbutil.ChunkFromSamples([]tsdbutil.Sample{}),
+		tsdbutil.ChunkFromSamples([]tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}}),
+		tsdbutil.ChunkFromSamples([]tsdbutil.Sample{sample{4, 4}, sample{5, 5}}),
 	}
 
 	res := newChunkSeriesIterator(chkMetas, nil, 2, 8)
@@ -1062,9 +1010,9 @@ func TestChunkSeriesIterator_DoubleSeek(t *testing.T) {
 // skipped to the end when seeking a value in the current chunk.
 func TestChunkSeriesIterator_SeekInCurrentChunk(t *testing.T) {
 	metas := []chunks.Meta{
-		tsdbutil.ChunkFromSamples([]Sample{}),
-		tsdbutil.ChunkFromSamples([]Sample{sample{1, 2}, sample{3, 4}, sample{5, 6}, sample{7, 8}}),
-		tsdbutil.ChunkFromSamples([]Sample{}),
+		tsdbutil.ChunkFromSamples([]tsdbutil.Sample{}),
+		tsdbutil.ChunkFromSamples([]tsdbutil.Sample{sample{1, 2}, sample{3, 4}, sample{5, 6}, sample{7, 8}}),
+		tsdbutil.ChunkFromSamples([]tsdbutil.Sample{}),
 	}
 
 	it := newChunkSeriesIterator(metas, nil, 1, 7)
@@ -1084,7 +1032,7 @@ func TestChunkSeriesIterator_SeekInCurrentChunk(t *testing.T) {
 // Seek gets called and advances beyond the max time, which was just accepted as a valid sample.
 func TestChunkSeriesIterator_NextWithMinTime(t *testing.T) {
 	metas := []chunks.Meta{
-		tsdbutil.ChunkFromSamples([]Sample{sample{1, 6}, sample{5, 6}, sample{7, 8}}),
+		tsdbutil.ChunkFromSamples([]tsdbutil.Sample{sample{1, 6}, sample{5, 6}, sample{7, 8}}),
 	}
 
 	it := newChunkSeriesIterator(metas, nil, 2, 4)
@@ -1232,7 +1180,7 @@ func BenchmarkPersistedQueries(b *testing.B) {
 				dir, err := ioutil.TempDir("", "bench_persisted")
 				testutil.Ok(b, err)
 				defer os.RemoveAll(dir)
-				block, err := OpenBlock(nil, createBlock(b, dir, nSeries, 1, int64(nSamples)), nil)
+				block, err := OpenBlock(nil, createBlock(b, dir, genSeries(nSeries, 10, 1, int64(nSamples))), nil)
 				testutil.Ok(b, err)
 				defer block.Close()
 
@@ -1462,3 +1410,53 @@ func (m mockIndex) LabelNames() ([]string, error) {
 	sort.Strings(labelNames)
 	return labelNames, nil
 }
+
+type mockSeries struct {
+	labels   func() labels.Labels
+	iterator func() SeriesIterator
+}
+
+func newSeries(l map[string]string, s []tsdbutil.Sample) Series {
+	return &mockSeries{
+		labels:   func() labels.Labels { return labels.FromMap(l) },
+		iterator: func() SeriesIterator { return newListSeriesIterator(s) },
+	}
+}
+func (m *mockSeries) Labels() labels.Labels    { return m.labels() }
+func (m *mockSeries) Iterator() SeriesIterator { return m.iterator() }
+
+type listSeriesIterator struct {
+	list []tsdbutil.Sample
+	idx  int
+}
+
+func newListSeriesIterator(list []tsdbutil.Sample) *listSeriesIterator {
+	return &listSeriesIterator{list: list, idx: -1}
+}
+
+func (it *listSeriesIterator) At() (int64, float64) {
+	s := it.list[it.idx]
+	return s.T(), s.V()
+}
+
+func (it *listSeriesIterator) Next() bool {
+	it.idx++
+	return it.idx < len(it.list)
+}
+
+func (it *listSeriesIterator) Seek(t int64) bool {
+	if it.idx == -1 {
+		it.idx = 0
+	}
+	// Do binary search between current position and end.
+	it.idx = sort.Search(len(it.list)-it.idx, func(i int) bool {
+		s := it.list[i+it.idx]
+		return s.T() >= t
+	})
+
+	return it.idx < len(it.list)
+}
+
+func (it *listSeriesIterator) Err() error {
+	return nil
+}

From 6181d1f18fc8578a89ccbabe8cee907e536aa034 Mon Sep 17 00:00:00 2001
From: yeya24 <ben.ye@daocloud.io>
Date: Tue, 29 Jan 2019 10:25:12 +0800
Subject: [PATCH 13/23] fix two typos in db_test

Signed-off-by: yeya24 <ben.ye@daocloud.io>
---
 db_test.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/db_test.go b/db_test.go
index e3e90ce8b3..f727a79192 100644
--- a/db_test.go
+++ b/db_test.go
@@ -1604,9 +1604,9 @@ func TestCorrectNumTombstones(t *testing.T) {
 
 // TestBlockRanges checks the following use cases:
 //  - No samples can be added with timestamps lower than the last block maxt.
-//  - The compactor doesn't create overlaping blocks
+//  - The compactor doesn't create overlapping blocks
 // even when the last blocks is not within the default boundaries.
-//	- Lower bondary is based on the smallest sample in the head and
+//	- Lower boundary is based on the smallest sample in the head and
 // upper boundary is rounded to the configured block range.
 //
 // This ensures that a snapshot that includes the head and creates a block with a custom time range

From eb5034d5b0c54651beee345590b17a1040d67f9c Mon Sep 17 00:00:00 2001
From: radek_lesniewski <Radeklesniewski13@gmail.com>
Date: Tue, 29 Jan 2019 12:23:53 +0100
Subject: [PATCH 14/23] Additional logging in compact.go - logged time needed
 for writing blocks (#505)

* Additional logging in compact.go - logged time needed for writing blocks to disk

Signed-off-by: Radoslaw Lesniewski <Radoslaw.Lesniewski@sabre.com>

* Additional logging in compact.go - code formatted

Signed-off-by: Radoslaw Lesniewski <Radoslaw.Lesniewski@sabre.com>
---
 compact.go | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/compact.go b/compact.go
index 5d8155f51b..0358a80066 100644
--- a/compact.go
+++ b/compact.go
@@ -414,6 +414,8 @@ func (c *LeveledCompactor) Compact(dest string, dirs []string, open []*Block) (u
 }
 
 func (c *LeveledCompactor) Write(dest string, b BlockReader, mint, maxt int64, parent *BlockMeta) (ulid.ULID, error) {
+	start := time.Now()
+
 	entropy := rand.New(rand.NewSource(time.Now().UnixNano()))
 	uid := ulid.MustNew(ulid.Now(), entropy)
 
@@ -440,7 +442,13 @@ func (c *LeveledCompactor) Write(dest string, b BlockReader, mint, maxt int64, p
 		return ulid.ULID{}, nil
 	}
 
-	level.Info(c.logger).Log("msg", "write block", "mint", meta.MinTime, "maxt", meta.MaxTime, "ulid", meta.ULID)
+	level.Info(c.logger).Log(
+		"msg", "write block",
+		"mint", meta.MinTime,
+		"maxt", meta.MaxTime,
+		"ulid", meta.ULID,
+		"duration", time.Since(start),
+	)
 	return uid, nil
 }
 

From ee99718ff622f7b8b6ecdee3db6592e5184023ec Mon Sep 17 00:00:00 2001
From: Krasi Georgiev <krasi-georgiev@users.noreply.github.com>
Date: Tue, 29 Jan 2019 20:46:12 +0300
Subject: [PATCH 15/23] rename chunk reader vars to make it easier to follow.
 (#508)

* rename chunk reader vars to make it easyer to understand.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
---
 chunks/chunks.go | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/chunks/chunks.go b/chunks/chunks.go
index f35ad2cdd9..fe3e982e8d 100644
--- a/chunks/chunks.go
+++ b/chunks/chunks.go
@@ -352,30 +352,31 @@ func (s *Reader) Size() int64 {
 	return s.size
 }
 
+// Chunk returns a chunk from a given reference.
 func (s *Reader) Chunk(ref uint64) (chunkenc.Chunk, error) {
 	var (
-		seq = int(ref >> 32)
-		off = int((ref << 32) >> 32)
+		sgmSeq    = int(ref >> 32)
+		sgmOffset = int((ref << 32) >> 32)
 	)
-	if seq >= len(s.bs) {
-		return nil, errors.Errorf("reference sequence %d out of range", seq)
+	if sgmSeq >= len(s.bs) {
+		return nil, errors.Errorf("reference sequence %d out of range", sgmSeq)
 	}
-	b := s.bs[seq]
+	chkS := s.bs[sgmSeq]
 
-	if off >= b.Len() {
-		return nil, errors.Errorf("offset %d beyond data size %d", off, b.Len())
+	if sgmOffset >= chkS.Len() {
+		return nil, errors.Errorf("offset %d beyond data size %d", sgmOffset, chkS.Len())
 	}
 	// With the minimum chunk length this should never cause us reading
 	// over the end of the slice.
-	r := b.Range(off, off+binary.MaxVarintLen32)
+	chk := chkS.Range(sgmOffset, sgmOffset+binary.MaxVarintLen32)
 
-	l, n := binary.Uvarint(r)
+	chkLen, n := binary.Uvarint(chk)
 	if n <= 0 {
 		return nil, errors.Errorf("reading chunk length failed with %d", n)
 	}
-	r = b.Range(off+n, off+n+1+int(l))
+	chk = chkS.Range(sgmOffset+n, sgmOffset+n+1+int(chkLen))
 
-	return s.pool.Get(chunkenc.Encoding(r[0]), r[1:1+l])
+	return s.pool.Get(chunkenc.Encoding(chk[0]), chk[1:1+chkLen])
 }
 
 func nextSequenceFile(dir string) (string, int, error) {

From 1bcda9d23f9951f1333cd2a3ba2a97b6b6ec69c4 Mon Sep 17 00:00:00 2001
From: Alec <867245430@qq.com>
Date: Mon, 4 Feb 2019 17:30:47 +0800
Subject: [PATCH 16/23] Update tombstones.md (format of tombstone) (#511)

---
 docs/format/tombstones.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/format/tombstones.md b/docs/format/tombstones.md
index 308a458a48..058f5f1e2f 100644
--- a/docs/format/tombstones.md
+++ b/docs/format/tombstones.md
@@ -25,7 +25,7 @@ The stones section is 0 padded to a multiple of 4 for fast scans.
 # Tombstone 
 
 ```
-┌─────────────┬───────────────┬──────────────┐
-│ref <varint> │ mint <varint> │ maxt <varint>│
-└─────────────┴───────────────┴──────────────┘
+┌────────────────┬─────────────────┬────────────────┐
+│ref <uvarint64> │ mint <varint64> │ maxt <varint64>│
+└────────────────┴─────────────────┴────────────────┘
 ```

From 2b6bc9fb3250015cfc4d1041dbcea06fcd2a4222 Mon Sep 17 00:00:00 2001
From: Alec <867245430@qq.com>
Date: Tue, 5 Feb 2019 10:23:14 +0800
Subject: [PATCH 17/23] Update index.md

Signed-off-by: naivewong <867245430@qq.com>
---
 docs/format/index.md | 56 ++++++++++++++++++++++----------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/docs/format/index.md b/docs/format/index.md
index 18600c8351..bd8b8f1963 100644
--- a/docs/format/index.md
+++ b/docs/format/index.md
@@ -85,34 +85,34 @@ After the labels, the number of indexed chunks is encoded, followed by a sequenc
 `mint` of the first chunk is stored, it's `maxt` is stored as a delta and the `mint` and `maxt` are encoded as deltas to the previous time for subsequent chunks. Similarly, the reference of the first chunk is stored and the next ref is stored as a delta to the previous one.
 
 ```
-┌─────────────────────────────────────────────────────────────────────────┐
-│ len <uvarint>                                                           │
-├─────────────────────────────────────────────────────────────────────────┤
-│ ┌──────────────────┬──────────────────────────────────────────────────┐ │
-│ │                  │ ┌──────────────────────────────────────────┐     │ │
-│ │                  │ │ ref(l_i.name) <uvarint>                  │     │ │
-│ │     #labels      │ ├──────────────────────────────────────────┤ ... │ │
-│ │    <uvarint>     │ │ ref(l_i.value) <uvarint>                 │     │ │
-│ │                  │ └──────────────────────────────────────────┘     │ │
-│ ├──────────────────┼──────────────────────────────────────────────────┤ │
-│ │                  │ ┌──────────────────────────────────────────┐     │ │
-│ │                  │ │ c_0.mint <varint>                        │     │ │
-│ │                  │ ├──────────────────────────────────────────┤     │ │
-│ │                  │ │ c_0.maxt - c_0.mint <uvarint>            │     │ │
-│ │                  │ ├──────────────────────────────────────────┤     │ │
-│ │                  │ │ ref(c_0.data) <uvarint>                  │     │ │
-│ │      #chunks     │ └──────────────────────────────────────────┘     │ │
-│ │     <uvarint>    │ ┌──────────────────────────────────────────┐     │ │
-│ │                  │ │ c_i.mint - c_i-1.maxt <uvarint>          │     │ │
-│ │                  │ ├──────────────────────────────────────────┤     │ │
-│ │                  │ │ c_i.maxt - c_i.mint <uvarint>            │     │ │
-│ │                  │ ├──────────────────────────────────────────┤ ... │ │
-│ │                  │ │ ref(c_i.data) - ref(c_i-1.data) <varint> │     │ │
-│ │                  │ └──────────────────────────────────────────┘     │ │
-│ └──────────────────┴──────────────────────────────────────────────────┘ │
-├─────────────────────────────────────────────────────────────────────────┤
-│ CRC32 <4b>                                                              │
-└─────────────────────────────────────────────────────────────────────────┘
+┌───────────────────────────────────────────────────────────────────────────┐
+│ len <uvarint>                                                             │
+├───────────────────────────────────────────────────────────────────────────┤
+│ ┌──────────────────┬────────────────────────────────────────────────────┐ │
+│ │                  │ ┌────────────────────────────────────────────┐     │ │
+│ │                  │ │ ref(l_i.name) <uvarint32>                  │     │ │
+│ │     #labels      │ ├────────────────────────────────────────────┤ ... │ │
+│ │    <uvarint>     │ │ ref(l_i.value) <uvarint32>                 │     │ │
+│ │                  │ └────────────────────────────────────────────┘     │ │
+│ ├──────────────────┼────────────────────────────────────────────────────┤ │
+│ │                  │ ┌────────────────────────────────────────────┐     │ │
+│ │                  │ │ c_0.mint <varint64>                        │     │ │
+│ │                  │ ├────────────────────────────────────────────┤     │ │
+│ │                  │ │ c_0.maxt - c_0.mint <uvarint64>            │     │ │
+│ │                  │ ├────────────────────────────────────────────┤     │ │
+│ │                  │ │ ref(c_0.data) <uvarint64>                  │     │ │
+│ │      #chunks     │ └────────────────────────────────────────────┘     │ │
+│ │     <uvarint>    │ ┌────────────────────────────────────────────┐     │ │
+│ │                  │ │ c_i.mint - c_i-1.maxt <uvarint64>          │     │ │
+│ │                  │ ├────────────────────────────────────────────┤     │ │
+│ │                  │ │ c_i.maxt - c_i.mint <uvarint64>            │     │ │
+│ │                  │ ├────────────────────────────────────────────┤ ... │ │
+│ │                  │ │ ref(c_i.data) - ref(c_i-1.data) <varint64> │     │ │
+│ │                  │ └────────────────────────────────────────────┘     │ │
+│ └──────────────────┴────────────────────────────────────────────────────┘ │
+├───────────────────────────────────────────────────────────────────────────┤
+│ CRC32 <4b>                                                                │
+└───────────────────────────────────────────────────────────────────────────┘
 ```
 
 

From 8e589474c6d6c711e6e7a6d17fef534444162f82 Mon Sep 17 00:00:00 2001
From: Alec <867245430@qq.com>
Date: Tue, 5 Feb 2019 10:28:05 +0800
Subject: [PATCH 18/23] Update index.md

Signed-off-by: naivewong <867245430@qq.com>
---
 docs/format/index.md | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/docs/format/index.md b/docs/format/index.md
index bd8b8f1963..001f0b9ffe 100644
--- a/docs/format/index.md
+++ b/docs/format/index.md
@@ -176,24 +176,24 @@ The sequence of postings sections is finalized by an [offset table](#offset-tabl
 An offset table stores a sequence of entries that maps a list of strings to an offset. They are used to track label index and postings sections. They are read into memory when an index file is loaded.
 
 ```
-┌─────────────────────┬────────────────────┐
-│ len <4b>            │ #entries <4b>      │
-├─────────────────────┴────────────────────┤
-│ ┌──────────────────────────────────────┐ │
-│ │  n = #strs <uvarint>                 │ │
-│ ├──────────────────────┬───────────────┤ │
-│ │ len(str_1) <uvarint> │ str_1 <bytes> │ │
-│ ├──────────────────────┴───────────────┤ │
-│ │  ...                                 │ │
-│ ├──────────────────────┬───────────────┤ │
-│ │ len(str_n) <uvarint> │ str_n <bytes> │ │
-│ ├──────────────────────┴───────────────┤ │
-│ │  offset <uvarint>                    │ │
-│ └──────────────────────────────────────┘ │
-│                  . . .                   │
-├──────────────────────────────────────────┤
-│  CRC32 <4b>                              │
-└──────────────────────────────────────────┘
+┌─────────────────────┬──────────────────────┐
+│ len <4b>            │ #entries <4b>        │
+├─────────────────────┴──────────────────────┤
+│ ┌────────────────────────────────────────┐ │
+│ │  n = #strs <uvarint>                   │ │
+│ ├──────────────────────┬─────────────────┤ │
+│ │ len(str_1) <uvarint> │ str_1 <bytes>   │ │
+│ ├──────────────────────┴─────────────────┤ │
+│ │  ...                                   │ │
+│ ├──────────────────────┬─────────────────┤ │
+│ │ len(str_n) <uvarint> │ str_n <bytes>   │ │
+│ ├──────────────────────┴─────────────────┤ │
+│ │  offset <uvarint64>                    │ │
+│ └────────────────────────────────────────┘ │
+│                    . . .                   │
+├────────────────────────────────────────────┤
+│  CRC32 <4b>                                │
+└────────────────────────────────────────────┘
 ```
 
 

From 1f723a8eb5490f9988055cd2d6eab2458a61f410 Mon Sep 17 00:00:00 2001
From: naivewong <867245430@qq.com>
Date: Tue, 5 Feb 2019 23:20:43 +0800
Subject: [PATCH 19/23] update #labels and #chunks

Signed-off-by: naivewong <867245430@qq.com>
---
 docs/format/index.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/format/index.md b/docs/format/index.md
index 001f0b9ffe..17472a62d7 100644
--- a/docs/format/index.md
+++ b/docs/format/index.md
@@ -92,7 +92,7 @@ After the labels, the number of indexed chunks is encoded, followed by a sequenc
 │ │                  │ ┌────────────────────────────────────────────┐     │ │
 │ │                  │ │ ref(l_i.name) <uvarint32>                  │     │ │
 │ │     #labels      │ ├────────────────────────────────────────────┤ ... │ │
-│ │    <uvarint>     │ │ ref(l_i.value) <uvarint32>                 │     │ │
+│ │    <uvarint64>   │ │ ref(l_i.value) <uvarint32>                 │     │ │
 │ │                  │ └────────────────────────────────────────────┘     │ │
 │ ├──────────────────┼────────────────────────────────────────────────────┤ │
 │ │                  │ ┌────────────────────────────────────────────┐     │ │
@@ -102,7 +102,7 @@ After the labels, the number of indexed chunks is encoded, followed by a sequenc
 │ │                  │ ├────────────────────────────────────────────┤     │ │
 │ │                  │ │ ref(c_0.data) <uvarint64>                  │     │ │
 │ │      #chunks     │ └────────────────────────────────────────────┘     │ │
-│ │     <uvarint>    │ ┌────────────────────────────────────────────┐     │ │
+│ │     <uvarint64>  │ ┌────────────────────────────────────────────┐     │ │
 │ │                  │ │ c_i.mint - c_i-1.maxt <uvarint64>          │     │ │
 │ │                  │ ├────────────────────────────────────────────┤     │ │
 │ │                  │ │ c_i.maxt - c_i.mint <uvarint64>            │     │ │

From 0649dfddf0448f2280043d9461f3adab56290698 Mon Sep 17 00:00:00 2001
From: naivewong <867245430@qq.com>
Date: Wed, 6 Feb 2019 00:05:30 +0800
Subject: [PATCH 20/23] more clear format representation

Signed-off-by: naivewong <867245430@qq.com>
---
 docs/format/index.md | 60 +++++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 28 deletions(-)

diff --git a/docs/format/index.md b/docs/format/index.md
index 17472a62d7..c97b123bd1 100644
--- a/docs/format/index.md
+++ b/docs/format/index.md
@@ -85,34 +85,38 @@ After the labels, the number of indexed chunks is encoded, followed by a sequenc
 `mint` of the first chunk is stored, it's `maxt` is stored as a delta and the `mint` and `maxt` are encoded as deltas to the previous time for subsequent chunks. Similarly, the reference of the first chunk is stored and the next ref is stored as a delta to the previous one.
 
 ```
-┌───────────────────────────────────────────────────────────────────────────┐
-│ len <uvarint>                                                             │
-├───────────────────────────────────────────────────────────────────────────┤
-│ ┌──────────────────┬────────────────────────────────────────────────────┐ │
-│ │                  │ ┌────────────────────────────────────────────┐     │ │
-│ │                  │ │ ref(l_i.name) <uvarint32>                  │     │ │
-│ │     #labels      │ ├────────────────────────────────────────────┤ ... │ │
-│ │    <uvarint64>   │ │ ref(l_i.value) <uvarint32>                 │     │ │
-│ │                  │ └────────────────────────────────────────────┘     │ │
-│ ├──────────────────┼────────────────────────────────────────────────────┤ │
-│ │                  │ ┌────────────────────────────────────────────┐     │ │
-│ │                  │ │ c_0.mint <varint64>                        │     │ │
-│ │                  │ ├────────────────────────────────────────────┤     │ │
-│ │                  │ │ c_0.maxt - c_0.mint <uvarint64>            │     │ │
-│ │                  │ ├────────────────────────────────────────────┤     │ │
-│ │                  │ │ ref(c_0.data) <uvarint64>                  │     │ │
-│ │      #chunks     │ └────────────────────────────────────────────┘     │ │
-│ │     <uvarint64>  │ ┌────────────────────────────────────────────┐     │ │
-│ │                  │ │ c_i.mint - c_i-1.maxt <uvarint64>          │     │ │
-│ │                  │ ├────────────────────────────────────────────┤     │ │
-│ │                  │ │ c_i.maxt - c_i.mint <uvarint64>            │     │ │
-│ │                  │ ├────────────────────────────────────────────┤ ... │ │
-│ │                  │ │ ref(c_i.data) - ref(c_i-1.data) <varint64> │     │ │
-│ │                  │ └────────────────────────────────────────────┘     │ │
-│ └──────────────────┴────────────────────────────────────────────────────┘ │
-├───────────────────────────────────────────────────────────────────────────┤
-│ CRC32 <4b>                                                                │
-└───────────────────────────────────────────────────────────────────────────┘
+┌──────────────────────────────────────────────────────────────────────────┐
+│ len <uvarint>                                                            │
+├──────────────────────────────────────────────────────────────────────────┤
+│ ┌──────────────────────────────────────────────────────────────────────┐ │
+│ │                     #labels <uvarint64>                              │ │
+│ ├──────────────────────────────────────────────────────────────────────┤ │
+│ │              ┌────────────────────────────────────────────┐          │ │
+│ │              │ ref(l_i.name) <uvarint32>                  │          │ │
+│ │              ├────────────────────────────────────────────┤ ...      │ │
+│ │              │ ref(l_i.value) <uvarint32>                 │          │ │
+│ │              └────────────────────────────────────────────┘          │ │
+│ ├──────────────────────────────────────────────────────────────────────┤ │
+│ │                     #chunks <uvarint64>                              │ │
+│ ├──────────────────────────────────────────────────────────────────────┤ │
+│ │              ┌────────────────────────────────────────────┐          │ │
+│ │              │ c_0.mint <varint64>                        │          │ │
+│ │              ├────────────────────────────────────────────┤          │ │
+│ │              │ c_0.maxt - c_0.mint <uvarint64>            │          │ │
+│ │              ├────────────────────────────────────────────┤          │ │
+│ │              │ ref(c_0.data) <uvarint64>                  │          │ │
+│ │              └────────────────────────────────────────────┘          │ │
+│ │              ┌────────────────────────────────────────────┐          │ │
+│ │              │ c_i.mint - c_i-1.maxt <uvarint64>          │          │ │
+│ │              ├────────────────────────────────────────────┤          │ │
+│ │              │ c_i.maxt - c_i.mint <uvarint64>            │          │ │
+│ │              ├────────────────────────────────────────────┤ ...      │ │
+│ │              │ ref(c_i.data) - ref(c_i-1.data) <varint64> │          │ │
+│ │              └────────────────────────────────────────────┘          │ │
+│ └──────────────────────────────────────────────────────────────────────┘ │
+├──────────────────────────────────────────────────────────────────────────┤
+│ CRC32 <4b>                                                               │
+└──────────────────────────────────────────────────────────────────────────┘
 ```
 
 

From 42ee59689b8d8a44cf79886fe01d5b94ab96ceb5 Mon Sep 17 00:00:00 2001
From: naivewong <867245430@qq.com>
Date: Wed, 6 Feb 2019 00:34:15 +0800
Subject: [PATCH 21/23] move dots to the bottom

Signed-off-by: naivewong <867245430@qq.com>
---
 docs/format/index.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/format/index.md b/docs/format/index.md
index c97b123bd1..4998d2b65c 100644
--- a/docs/format/index.md
+++ b/docs/format/index.md
@@ -93,9 +93,10 @@ After the labels, the number of indexed chunks is encoded, followed by a sequenc
 │ ├──────────────────────────────────────────────────────────────────────┤ │
 │ │              ┌────────────────────────────────────────────┐          │ │
 │ │              │ ref(l_i.name) <uvarint32>                  │          │ │
-│ │              ├────────────────────────────────────────────┤ ...      │ │
+│ │              ├────────────────────────────────────────────┤          │ │
 │ │              │ ref(l_i.value) <uvarint32>                 │          │ │
 │ │              └────────────────────────────────────────────┘          │ │
+│ │                             ...                                      │ │
 │ ├──────────────────────────────────────────────────────────────────────┤ │
 │ │                     #chunks <uvarint64>                              │ │
 │ ├──────────────────────────────────────────────────────────────────────┤ │
@@ -110,9 +111,10 @@ After the labels, the number of indexed chunks is encoded, followed by a sequenc
 │ │              │ c_i.mint - c_i-1.maxt <uvarint64>          │          │ │
 │ │              ├────────────────────────────────────────────┤          │ │
 │ │              │ c_i.maxt - c_i.mint <uvarint64>            │          │ │
-│ │              ├────────────────────────────────────────────┤ ...      │ │
+│ │              ├────────────────────────────────────────────┤          │ │
 │ │              │ ref(c_i.data) - ref(c_i-1.data) <varint64> │          │ │
 │ │              └────────────────────────────────────────────┘          │ │
+│ │                             ...                                      │ │
 │ └──────────────────────────────────────────────────────────────────────┘ │
 ├──────────────────────────────────────────────────────────────────────────┤
 │ CRC32 <4b>                                                               │

From 8e7e2041d39e91989e23d245045172b357dfa98b Mon Sep 17 00:00:00 2001
From: naivewong <867245430@qq.com>
Date: Wed, 6 Feb 2019 00:48:25 +0800
Subject: [PATCH 22/23] update #labels count, #chunks count

Signed-off-by: naivewong <867245430@qq.com>
---
 docs/format/index.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/format/index.md b/docs/format/index.md
index 4998d2b65c..5bcacbdaa6 100644
--- a/docs/format/index.md
+++ b/docs/format/index.md
@@ -89,7 +89,7 @@ After the labels, the number of indexed chunks is encoded, followed by a sequenc
 │ len <uvarint>                                                            │
 ├──────────────────────────────────────────────────────────────────────────┤
 │ ┌──────────────────────────────────────────────────────────────────────┐ │
-│ │                     #labels <uvarint64>                              │ │
+│ │                     #labels count <uvarint64>                        │ │
 │ ├──────────────────────────────────────────────────────────────────────┤ │
 │ │              ┌────────────────────────────────────────────┐          │ │
 │ │              │ ref(l_i.name) <uvarint32>                  │          │ │
@@ -98,7 +98,7 @@ After the labels, the number of indexed chunks is encoded, followed by a sequenc
 │ │              └────────────────────────────────────────────┘          │ │
 │ │                             ...                                      │ │
 │ ├──────────────────────────────────────────────────────────────────────┤ │
-│ │                     #chunks <uvarint64>                              │ │
+│ │                     #chunks count <uvarint64>                        │ │
 │ ├──────────────────────────────────────────────────────────────────────┤ │
 │ │              ┌────────────────────────────────────────────┐          │ │
 │ │              │ c_0.mint <varint64>                        │          │ │

From 9b227d27e75a70266d72c71107892e55ab68b247 Mon Sep 17 00:00:00 2001
From: naivewong <867245430@qq.com>
Date: Wed, 6 Feb 2019 10:00:18 +0800
Subject: [PATCH 23/23] update labels count, chunks count

Signed-off-by: naivewong <867245430@qq.com>
---
 docs/format/index.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/format/index.md b/docs/format/index.md
index 5bcacbdaa6..1ec3c21b44 100644
--- a/docs/format/index.md
+++ b/docs/format/index.md
@@ -89,7 +89,7 @@ After the labels, the number of indexed chunks is encoded, followed by a sequenc
 │ len <uvarint>                                                            │
 ├──────────────────────────────────────────────────────────────────────────┤
 │ ┌──────────────────────────────────────────────────────────────────────┐ │
-│ │                     #labels count <uvarint64>                        │ │
+│ │                     labels count <uvarint64>                         │ │
 │ ├──────────────────────────────────────────────────────────────────────┤ │
 │ │              ┌────────────────────────────────────────────┐          │ │
 │ │              │ ref(l_i.name) <uvarint32>                  │          │ │
@@ -98,7 +98,7 @@ After the labels, the number of indexed chunks is encoded, followed by a sequenc
 │ │              └────────────────────────────────────────────┘          │ │
 │ │                             ...                                      │ │
 │ ├──────────────────────────────────────────────────────────────────────┤ │
-│ │                     #chunks count <uvarint64>                        │ │
+│ │                     chunks count <uvarint64>                         │ │
 │ ├──────────────────────────────────────────────────────────────────────┤ │
 │ │              ┌────────────────────────────────────────────┐          │ │
 │ │              │ c_0.mint <varint64>                        │          │ │