From 78780cd2baa8b658bb77b85a0f96261953904f49 Mon Sep 17 00:00:00 2001
From: Fabian Reinartz
Date: Thu, 23 Feb 2017 10:50:22 +0100
Subject: [PATCH] Segment chunk file

This adds write path support for segmented chunk data files. Files of
512MB are pre-allocated and written to. When the segment size would be
exceeded, the next file is started. On completion, files are truncated
to their final size.
---
 block.go   |  43 +++++++-------
 compact.go |  30 ++++------
 db.go      |  10 +++-
 head.go    |  12 ++--
 querier.go |  12 ++--
 reader.go  |  42 +++++++++++---
 writer.go  | 167 ++++++++++++++++++++++++++++++++++++++++-------
 7 files changed, 209 insertions(+), 107 deletions(-)

diff --git a/block.go b/block.go
index db63e14f2..14275247d 100644
--- a/block.go
+++ b/block.go
@@ -22,7 +22,7 @@ type Block interface {
 	Index() IndexReader

 	// Series returns a SeriesReader over the block's data.
-	Series() SeriesReader
+	Chunks() ChunkReader

 	// Persisted returns whether the block is already persisted,
 	// and no longer being appended to.
@@ -64,9 +64,9 @@ type persistedBlock struct {
 	dir  string
 	meta BlockMeta

-	chunksf, indexf *mmapFile
+	indexf *mmapFile

-	chunkr *seriesReader
+	chunkr *chunkReader
 	indexr *indexReader
 }

@@ -120,37 +120,36 @@ func newPersistedBlock(dir string) (*persistedBlock, error) {
 		return nil, err
 	}

-	chunksf, err := openMmapFile(chunksFileName(dir))
+	cr, err := newChunkReader(filepath.Join(dir, "chunks"))
 	if err != nil {
-		return nil, errors.Wrap(err, "open chunk file")
+		return nil, err
 	}

+	// ir, err := newIndexReader(dir)
+	// if err != nil {
+	// 	return nil, err
+	// }
+
 	indexf, err := openMmapFile(indexFileName(dir))
 	if err != nil {
 		return nil, errors.Wrap(err, "open index file")
 	}
-
-	sr, err := newSeriesReader([][]byte{chunksf.b})
-	if err != nil {
-		return nil, errors.Wrap(err, "create series reader")
-	}
 	ir, err := newIndexReader(indexf.b)
 	if err != nil {
 		return nil, errors.Wrap(err, "create index reader")
 	}

 	pb := &persistedBlock{
-		dir:     dir,
-		meta:    *meta,
-		chunksf: chunksf,
-		indexf:  indexf,
-		chunkr:  sr,
-		indexr:  ir,
+		dir:    dir,
+		meta:   *meta,
+		indexf: indexf,
+		chunkr: cr,
+		indexr: ir,
 	}
 	return pb, nil
 }

 func (pb *persistedBlock) Close() error {
-	err0 := pb.chunksf.Close()
+	err0 := pb.chunkr.Close()
 	err1 := pb.indexf.Close()
 	if err0 != nil {
@@ -159,11 +158,11 @@
 	return err1
 }

-func (pb *persistedBlock) Dir() string          { return pb.dir }
-func (pb *persistedBlock) Persisted() bool      { return true }
-func (pb *persistedBlock) Index() IndexReader   { return pb.indexr }
-func (pb *persistedBlock) Series() SeriesReader { return pb.chunkr }
-func (pb *persistedBlock) Meta() BlockMeta      { return pb.meta }
+func (pb *persistedBlock) Dir() string         { return pb.dir }
+func (pb *persistedBlock) Persisted() bool     { return true }
+func (pb *persistedBlock) Index() IndexReader  { return pb.indexr }
+func (pb *persistedBlock) Chunks() ChunkReader { return pb.chunkr }
+func (pb *persistedBlock) Meta() BlockMeta     { return pb.meta }

 func chunksFileName(path string) string {
 	return filepath.Join(path, "chunks-000")
diff --git a/compact.go b/compact.go
index 6a0d67499..67d536b74 100644
--- a/compact.go
+++ b/compact.go
@@ -64,7 +64,7 @@ type compactionInfo struct {
 	mint, maxt int64
 }

-const compactionBlocksLen = 4
+const compactionBlocksLen = 3

 // pick returns a range [i, j) in the blocks that are suitable to be compacted
 // into a single block at position i.
@@ -114,9 +114,6 @@ func (c *compactor) pick(bs []compactionInfo) (i, j int, ok bool) {

 func (c *compactor) match(bs []compactionInfo) bool {
 	g := bs[0].generation
-	if g >= 5 {
-		return false
-	}

 	for _, b := range bs {
 		if b.generation == 0 {
@@ -166,17 +163,16 @@ func (c *compactor) compact(dir string, blocks ...Block) (err error) {
 		return err
 	}

-	chunkf, err := os.OpenFile(chunksFileName(dir), os.O_WRONLY|os.O_CREATE, 0666)
-	if err != nil {
-		return errors.Wrap(err, "create chunk file")
-	}
 	indexf, err := os.OpenFile(indexFileName(dir), os.O_WRONLY|os.O_CREATE, 0666)
 	if err != nil {
 		return errors.Wrap(err, "create index file")
 	}

 	indexw := newIndexWriter(indexf)
-	chunkw := newChunkWriter(chunkf)
+	chunkw, err := newChunkWriter(filepath.Join(dir, "chunks"))
+	if err != nil {
+		return errors.Wrap(err, "open chunk writer")
+	}

 	if err = c.write(dir, blocks, indexw, chunkw); err != nil {
 		return errors.Wrap(err, "write compaction")
@@ -188,15 +184,9 @@ func (c *compactor) compact(dir string, blocks ...Block) (err error) {
 	if err = indexw.Close(); err != nil {
 		return errors.Wrap(err, "close index writer")
 	}
-	if err = fileutil.Fsync(chunkf); err != nil {
-		return errors.Wrap(err, "fsync chunk file")
-	}
 	if err = fileutil.Fsync(indexf); err != nil {
 		return errors.Wrap(err, "fsync index file")
 	}
-	if err = chunkf.Close(); err != nil {
-		return errors.Wrap(err, "close chunk file")
-	}
 	if err = indexf.Close(); err != nil {
 		return errors.Wrap(err, "close index file")
 	}
@@ -215,7 +205,7 @@ func (c *compactor) write(dir string, blocks []Block, indexw IndexWriter, chunkw
 		if hb, ok := b.(*headBlock); ok {
 			all = hb.remapPostings(all)
 		}
-		s := newCompactionSeriesSet(b.Index(), b.Series(), all)
+		s := newCompactionSeriesSet(b.Index(), b.Chunks(), all)

 		if i == 0 {
 			set = s
@@ -300,17 +290,17 @@ type compactionSet interface {
 type compactionSeriesSet struct {
 	p      Postings
 	index  IndexReader
-	series SeriesReader
+	chunks ChunkReader
 	l      labels.Labels
 	c      []ChunkMeta
 	err    error
 }

-func newCompactionSeriesSet(i IndexReader, s SeriesReader, p Postings) *compactionSeriesSet {
+func newCompactionSeriesSet(i IndexReader, c ChunkReader, p Postings) *compactionSeriesSet {
 	return &compactionSeriesSet{
 		index:  i,
-		series: s,
+		chunks: c,
 		p:      p,
 	}
 }

@@ -327,7 +317,7 @@ func (c *compactionSeriesSet) Next() bool {
 	for i := range c.c {
 		chk := &c.c[i]

-		chk.Chunk, c.err = c.series.Chunk(chk.Ref)
+		chk.Chunk, c.err = c.chunks.Chunk(chk.Ref)
 		if c.err != nil {
 			return false
 		}
diff --git a/db.go b/db.go
index 76794a98c..9fc8dde3c 100644
--- a/db.go
+++ b/db.go
@@ -153,8 +153,8 @@ func Open(dir string, l log.Logger, opts *Options) (db *DB, err error) {
 		l = log.NewContext(l).With("ts", log.DefaultTimestampUTC, "caller", log.DefaultCaller)
 	}

-	var r prometheus.Registerer
-	// r := prometheus.DefaultRegisterer
+	// var r prometheus.Registerer
+	r := prometheus.DefaultRegisterer

 	if opts == nil {
 		opts = DefaultOptions
@@ -307,7 +307,11 @@ func (db *DB) compact(i, j int) error {
 			return errors.Wrap(err, "removing old block")
 		}
 	}
-	return db.retentionCutoff()
+	if err := db.retentionCutoff(); err != nil {
+		return err
+	}
+
+	return nil
 }

 func (db *DB) retentionCutoff() error {
diff --git a/head.go b/head.go
index d9ab2944c..cd0167ffb 100644
--- a/head.go
+++ b/head.go
@@ -146,10 +146,10 @@ func (h *headBlock) Meta() BlockMeta {
 	return h.meta
 }

-func (h *headBlock) Dir() string          { return h.dir }
-func (h *headBlock) Persisted() bool      { return false }
-func (h *headBlock) Index() IndexReader   { return &headIndexReader{h} }
-func (h *headBlock) Series() SeriesReader { return &headSeriesReader{h} }
+func (h *headBlock) Dir() string         { return h.dir }
+func (h *headBlock) Persisted() bool     { return false }
+func (h *headBlock) Index() IndexReader  { return &headIndexReader{h} }
+func (h *headBlock) Chunks() ChunkReader { return &headChunkReader{h} }

 func (h *headBlock) Appender() Appender {
 	atomic.AddUint64(&h.activeWriters, 1)
@@ -359,12 +359,12 @@ func (a *headAppender) Rollback() error {
 	return nil
 }

-type headSeriesReader struct {
+type headChunkReader struct {
 	*headBlock
 }

 // Chunk returns the chunk for the reference number.
-func (h *headSeriesReader) Chunk(ref uint64) (chunks.Chunk, error) {
+func (h *headChunkReader) Chunk(ref uint64) (chunks.Chunk, error) {
 	h.mtx.RLock()
 	defer h.mtx.RUnlock()

diff --git a/querier.go b/querier.go
index c09bab885..5f02c77f7 100644
--- a/querier.go
+++ b/querier.go
@@ -59,7 +59,7 @@ func (s *DB) Querier(mint, maxt int64) Querier {
 			mint:  mint,
 			maxt:  maxt,
 			index: b.Index(),
-			series: b.Series(),
+			chunks: b.Chunks(),
 		}

 		// TODO(fabxc): find nicer solution.
@@ -123,19 +123,19 @@ func (q *querier) Close() error {
 // blockQuerier provides querying access to a single block database.
 type blockQuerier struct {
 	index  IndexReader
-	series SeriesReader
+	chunks ChunkReader

 	postingsMapper func(Postings) Postings

 	mint, maxt int64
 }

-func newBlockQuerier(ix IndexReader, s SeriesReader, mint, maxt int64) *blockQuerier {
+func newBlockQuerier(ix IndexReader, c ChunkReader, mint, maxt int64) *blockQuerier {
 	return &blockQuerier{
 		mint:  mint,
 		maxt:  maxt,
 		index: ix,
-		series: s,
+		chunks: c,
 	}
 }

@@ -162,7 +162,7 @@ func (q *blockQuerier) Select(ms ...labels.Matcher) SeriesSet {

 	return &blockSeriesSet{
 		index:  q.index,
-		chunks: q.series,
+		chunks: q.chunks,
 		it:     p,
 		absent: absent,
 		mint:   q.mint,
@@ -425,7 +425,7 @@ func (s *partitionSeriesSet) Next() bool {
 // blockSeriesSet is a set of series from an inverted index query.
 type blockSeriesSet struct {
 	index  IndexReader
-	chunks SeriesReader
+	chunks ChunkReader
 	it     Postings // postings list referencing series
 	absent []string // labels that must not be set for result series
 	mint, maxt int64 // considered time range
diff --git a/reader.go b/reader.go
index 630c11a99..ee3738b89 100644
--- a/reader.go
+++ b/reader.go
@@ -3,6 +3,7 @@ package tsdb
 import (
 	"encoding/binary"
 	"fmt"
+	"io"
 	"strings"

 	"github.com/fabxc/tsdb/chunks"
@@ -10,23 +11,42 @@ import (
 	"github.com/pkg/errors"
 )

-// SeriesReader provides reading access of serialized time series data.
-type SeriesReader interface {
+// ChunkReader provides reading access of serialized time series data.
+type ChunkReader interface {
 	// Chunk returns the series data chunk with the given reference.
 	Chunk(ref uint64) (chunks.Chunk, error)
+
+	// Close releases all underlying resources of the reader.
+	Close() error
 }

-// seriesReader implements a SeriesReader for a serialized byte stream
+// chunkReader implements a ChunkReader for a serialized byte stream
 // of series data.
-type seriesReader struct {
+type chunkReader struct {
 	// The underlying bytes holding the encoded series data.
 	bs [][]byte
+
+	cs []io.Closer
 }

-func newSeriesReader(bs [][]byte) (*seriesReader, error) {
-	s := &seriesReader{bs: bs}
+// newChunkReader returns a new chunkReader based on mmaped files found in dir.
+func newChunkReader(dir string) (*chunkReader, error) {
+	files, err := sequenceFiles(dir, "")
+	if err != nil {
+		return nil, err
+	}
+	var cr chunkReader

-	for i, b := range bs {
+	for _, fn := range files {
+		f, err := openMmapFile(fn)
+		if err != nil {
+			return nil, errors.Wrapf(err, "mmap files")
+		}
+		cr.cs = append(cr.cs, f)
+		cr.bs = append(cr.bs, f.b)
+	}
+
+	for i, b := range cr.bs {
 		if len(b) < 4 {
 			return nil, errors.Wrapf(errInvalidSize, "validate magic in segment %d", i)
 		}
@@ -35,10 +55,14 @@
 			return nil, fmt.Errorf("invalid magic number %x", m)
 		}
 	}
-	return s, nil
+	return &cr, nil
 }

-func (s *seriesReader) Chunk(ref uint64) (chunks.Chunk, error) {
+func (s *chunkReader) Close() error {
+	return closeAll(s.cs...)
+}
+
+func (s *chunkReader) Chunk(ref uint64) (chunks.Chunk, error) {
 	var (
 		seq = int(ref >> 32)
 		off = int((ref << 32) >> 32)
diff --git a/writer.go b/writer.go
index 6b4e31deb..6b98b2b95 100644
--- a/writer.go
+++ b/writer.go
@@ -6,10 +6,12 @@ import (
 	"hash"
 	"hash/crc32"
 	"io"
+	"os"
 	"sort"
 	"strings"

 	"github.com/bradfitz/slice"
+	"github.com/coreos/etcd/pkg/fileutil"
 	"github.com/fabxc/tsdb/chunks"
 	"github.com/fabxc/tsdb/labels"
 	"github.com/pkg/errors"
@@ -44,20 +46,109 @@ type ChunkWriter interface {
 // chunkWriter implements the ChunkWriter interface for the standard
 // serialization format.
 type chunkWriter struct {
-	ow    io.Writer
-	w     *bufio.Writer
-	n     int64
-	c     int
-	crc32 hash.Hash
+	dirFile *os.File
+	files   []*os.File
+	wbuf    *bufio.Writer
+	n       int64
+	crc32   hash.Hash
+
+	segmentSize int64
 }

-func newChunkWriter(w io.Writer) *chunkWriter {
-	return &chunkWriter{
-		ow:    w,
-		w:     bufio.NewWriterSize(w, 1*1024*1024),
-		n:     0,
-		crc32: crc32.New(crc32.MakeTable(crc32.Castagnoli)),
+const (
+	defaultChunkSegmentSize = 512 * 1024 * 1024
+
+	chunksFormatV1 = 1
+	indexFormatV1  = 1
+)
+
+func newChunkWriter(dir string) (*chunkWriter, error) {
+	if err := os.MkdirAll(dir, 0777); err != nil {
+		return nil, err
 	}
+	dirFile, err := fileutil.OpenDir(dir)
+	if err != nil {
+		return nil, err
+	}
+	cw := &chunkWriter{
+		dirFile:     dirFile,
+		n:           0,
+		crc32:       crc32.New(crc32.MakeTable(crc32.Castagnoli)),
+		segmentSize: defaultChunkSegmentSize,
+	}
+	return cw, nil
+}
+
+func (w *chunkWriter) tail() *os.File {
+	if len(w.files) == 0 {
+		return nil
+	}
+	return w.files[len(w.files)-1]
+}
+
+// finalizeTail writes all pending data to the current tail file,
+// truncates its size, and closes it.
+func (w *chunkWriter) finalizeTail() error {
+	tf := w.tail()
+	if tf == nil {
+		return nil
+	}
+
+	if err := w.wbuf.Flush(); err != nil {
+		return err
+	}
+	if err := fileutil.Fsync(tf); err != nil {
+		return err
+	}
+	// As the file was pre-allocated, we truncate any superfluous zero bytes.
+	off, err := tf.Seek(0, os.SEEK_CUR)
+	if err != nil {
+		return err
+	}
+	if err := tf.Truncate(off); err != nil {
+		return err
+	}
+	return tf.Close()
+}
+
+func (w *chunkWriter) cut() error {
+	// Sync current tail to disk and close.
+	w.finalizeTail()
+
+	p, _, err := nextSequenceFile(w.dirFile.Name(), "")
+	if err != nil {
+		return err
+	}
+	f, err := os.OpenFile(p, os.O_WRONLY|os.O_CREATE, 0666)
+	if err != nil {
+		return err
+	}
+	if err = fileutil.Preallocate(f, w.segmentSize, true); err != nil {
+		return err
+	}
+	if err = w.dirFile.Sync(); err != nil {
+		return err
+	}
+
+	// Write header metadata for new file.
+
+	metab := make([]byte, 8)
+	binary.BigEndian.PutUint32(metab[:4], MagicSeries)
+	metab[4] = chunksFormatV1
+
+	if _, err := f.Write(metab); err != nil {
+		return err
+	}
+
+	w.files = append(w.files, f)
+	if w.wbuf != nil {
+		w.wbuf.Reset(f)
+	} else {
+		w.wbuf = bufio.NewWriterSize(f, 8*1024*1024)
+	}
+	w.n = 8
+
+	return nil
 }

 func (w *chunkWriter) write(wr io.Writer, b []byte) error {
@@ -66,44 +157,40 @@ func (w *chunkWriter) write(wr io.Writer, b []byte) error {
 	return err
 }

-func (w *chunkWriter) writeMeta() error {
-	b := [8]byte{}
-
-	binary.BigEndian.PutUint32(b[:4], MagicSeries)
-	b[4] = flagStd
-
-	return w.write(w.w, b[:])
-}
-
 func (w *chunkWriter) WriteChunks(chks ...ChunkMeta) error {
-	// Initialize with meta data.
-	if w.n == 0 {
-		if err := w.writeMeta(); err != nil {
+	// Calculate maximum space we need and cut a new segment in case
+	// we don't fit into the current one.
+	maxLen := int64(binary.MaxVarintLen32)
+	for _, c := range chks {
+		maxLen += binary.MaxVarintLen32 + 1
+		maxLen += int64(len(c.Chunk.Bytes()))
+	}
+	newsz := w.n + maxLen
+
+	if w.wbuf == nil || w.n > w.segmentSize || newsz > w.segmentSize && maxLen <= w.segmentSize {
+		if err := w.cut(); err != nil {
 			return err
 		}
 	}

+	// Write chunks sequentially and set the reference field in the ChunkMeta.
 	w.crc32.Reset()
-	wr := io.MultiWriter(w.crc32, w.w)
+	wr := io.MultiWriter(w.crc32, w.wbuf)

-	// For normal reads we don't need the number of the chunk section but
-	// it allows us to verify checksums without reading the index file.
-	// The offsets are also technically enough to calculate chunk size. but
-	// holding the length of each chunk could later allow for adding padding
-	// between chunks.
-	b := [binary.MaxVarintLen32]byte{}
-	n := binary.PutUvarint(b[:], uint64(len(chks)))
+	b := make([]byte, binary.MaxVarintLen32)
+	n := binary.PutUvarint(b, uint64(len(chks)))

 	if err := w.write(wr, b[:n]); err != nil {
 		return err
 	}
+	seq := uint64(w.seq()) << 32

 	for i := range chks {
 		chk := &chks[i]

-		chk.Ref = uint64(w.n)
+		chk.Ref = seq | uint64(w.n)

-		n = binary.PutUvarint(b[:], uint64(len(chk.Chunk.Bytes())))
+		n = binary.PutUvarint(b, uint64(len(chk.Chunk.Bytes())))

 		if err := w.write(wr, b[:n]); err != nil {
 			return err
@@ -117,24 +204,22 @@ func (w *chunkWriter) WriteChunks(chks ...ChunkMeta) error {
 		chk.Chunk = nil
 	}

-	if err := w.write(w.w, w.crc32.Sum(nil)); err != nil {
+	if err := w.write(w.wbuf, w.crc32.Sum(nil)); err != nil {
 		return err
 	}
 	return nil
 }

+func (w *chunkWriter) seq() int {
+	return len(w.files) - 1
+}
+
 func (w *chunkWriter) Size() int64 {
 	return w.n
 }

 func (w *chunkWriter) Close() error {
-	// Initialize block in case no data was written to it.
-	if w.n == 0 {
-		if err := w.writeMeta(); err != nil {
-			return err
-		}
-	}
-	return w.w.Flush()
+	return w.finalizeTail()
 }

 // ChunkMeta holds information about a chunk of data.
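
Notes on the format introduced above. The sketches below are standalone
illustrations, not part of the patch.

Chunk references returned by WriteChunks pack the segment sequence number
into the upper 32 bits and the byte offset within that segment into the
lower 32 bits; chunkReader.Chunk reverses this. A minimal sketch of that
encoding (helper names are illustrative, not from the patch):

package main

import "fmt"

// packRef mirrors WriteChunks: the upper 32 bits hold the segment
// sequence, the lower 32 bits the offset inside that segment file.
func packRef(seq, offset uint64) uint64 {
	return seq<<32 | offset
}

// unpackRef mirrors chunkReader.Chunk: shifting left then right
// clears the sequence bits and recovers the plain offset.
func unpackRef(ref uint64) (seq, offset int) {
	return int(ref >> 32), int((ref << 32) >> 32)
}

func main() {
	ref := packRef(2, 1337) // third segment file, offset 1337
	seq, off := unpackRef(ref)
	fmt.Println(seq, off) // 2 1337
}

Because the offset includes the 8-byte file header, a reference can be used
directly as an index into the mmaped segment.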
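
The cut condition in WriteChunks is worth spelling out: Go's && binds
tighter than ||, so a new segment is started when no file is open yet,
when the current offset already exceeds the segment size, or when the
batch would cross the segment boundary while still fitting into a fresh
segment. A batch larger than a whole segment is deliberately written into
the current file; the w.n > w.segmentSize clause then forces a cut on the
next call. A sketch of the predicate with the operands named (an assumed
helper, not code from the patch):

package main

import "fmt"

// shouldCut restates the condition guarding w.cut() in WriteChunks.
// n is the current write offset, maxLen an upper bound on the size of
// the batch, and open reports whether a segment file is already open.
func shouldCut(n, maxLen, segmentSize int64, open bool) bool {
	newsz := n + maxLen
	// Parenthesized explicitly; && binds tighter than ||.
	return !open || n > segmentSize || (newsz > segmentSize && maxLen <= segmentSize)
}

func main() {
	const seg = 512 * 1024 * 1024
	fmt.Println(shouldCut(0, 100, seg, false))     // true: no segment open yet
	fmt.Println(shouldCut(seg-50, 100, seg, true)) // true: batch would cross the boundary
	fmt.Println(shouldCut(8, 2*seg, seg, true))    // false: oversized batch stays in the current file
}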
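
Each segment file starts with an 8-byte header: the 4-byte big-endian
magic, one format byte (chunksFormatV1), and three zero bytes of padding;
cut() accordingly resets w.n to 8, and newChunkReader checks the magic of
every mapped segment. A sketch of writing and validating such a header
(the magic value below is an assumption; the patch reuses the existing
MagicSeries constant):

package main

import (
	"encoding/binary"
	"fmt"
)

const (
	magicSeries    = 0x85BD40DD // assumed value of MagicSeries
	chunksFormatV1 = 1
)

// segmentHeader mirrors cut(): 4 bytes of magic, one format byte,
// three zero bytes of padding.
func segmentHeader() []byte {
	b := make([]byte, 8)
	binary.BigEndian.PutUint32(b[:4], magicSeries)
	b[4] = chunksFormatV1
	return b
}

// validate mirrors the per-segment magic check in newChunkReader.
func validate(b []byte) error {
	if len(b) < 4 {
		return fmt.Errorf("invalid size")
	}
	if m := binary.BigEndian.Uint32(b[:4]); m != magicSeries {
		return fmt.Errorf("invalid magic number %x", m)
	}
	return nil
}

func main() {
	fmt.Println(validate(segmentHeader())) // <nil>
}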