Fix bug of unsorted postings lists being created
The former approach created unordered postings lists, either through unsorted map iteration over new series (fixable) or through concurrent writers creating new series in interleaved order. We switch back to generating ephemeral references that are only valid for a single batch. Newly created series have to be re-set upon the next insert.
commit 1c80c33e72
parent c7f5590a71
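For illustration, a minimal standalone sketch (invented names, not tsdb code) of the first failure mode the message describes: Go map iteration order is unspecified, so building a postings list by iterating a map of new series appends IDs out of order.

	package main

	import (
		"fmt"
		"sort"
	)

	func main() {
		// Hypothetical batch of new series, staged in a map keyed by their ref.
		newSeries := map[uint32]string{
			3: `{job="a"}`, 1: `{job="b"}`, 4: `{job="c"}`, 2: `{job="d"}`,
		}

		// Building a postings list by map iteration: order is unspecified.
		var postings []uint32
		for ref := range newSeries {
			postings = append(postings, ref)
		}

		sorted := sort.SliceIsSorted(postings, func(i, j int) bool {
			return postings[i] < postings[j]
		})
		fmt.Println(postings, "sorted:", sorted) // frequently false
	}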
db.go (10 changed lines)

@@ -352,7 +352,7 @@ func (a *dbAppender) SetSeries(lset labels.Labels) (uint64, error) {
 	if err != nil {
 		return 0, err
 	}
-	return ref | (uint64(a.gen) << 32), nil
+	return ref | (uint64(a.gen) << 40), nil
 }
 
 func (a *dbAppender) setSeries(hash uint64, lset labels.Labels) (uint64, error) {
@@ -360,13 +360,13 @@ func (a *dbAppender) setSeries(hash uint64, lset labels.Labels) (uint64, error)
 	if err != nil {
 		return 0, err
 	}
-	return ref | (uint64(a.gen) << 32), nil
+	return ref | (uint64(a.gen) << 40), nil
 }
 
 func (a *dbAppender) Add(ref uint64, t int64, v float64) error {
 	// We store the head generation in the 4th byte and use it to reject
 	// stale references.
-	gen := uint8((ref << 24) >> 56)
+	gen := uint8((ref << 16) >> 56)
 
 	if gen != a.gen {
 		return errNotFound
@@ -647,11 +647,11 @@ func (a *partitionedAppender) SetSeries(lset labels.Labels) (uint64, error) {
 	if err != nil {
 		return 0, err
 	}
-	return ref | (p << 40), nil
+	return ref | (p << 48), nil
 }
 
 func (a *partitionedAppender) Add(ref uint64, t int64, v float64) error {
-	p := uint8((ref << 16) >> 56)
+	p := uint8((ref << 8) >> 56)
	return a.partitions[p].Add(ref, t, v)
 }
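The new shift values imply a wider reference layout. As best one can infer from the diff (the commit does not document it), the head appender now owns the low 5 bytes, the head generation moves to the 6th byte, and the partition to the 7th. A hypothetical sketch of packing and unpacking under that assumption:

	package main

	import "fmt"

	// pack composes a full reference under the assumed layout: low 5 bytes
	// head-local, byte 6 the head generation, byte 7 the partition.
	func pack(head uint64, gen, part uint8) uint64 {
		return head | uint64(gen)<<40 | uint64(part)<<48
	}

	func main() {
		ref := pack(0x12345, 7, 3)

		gen := uint8((ref << 16) >> 56) // extraction used by dbAppender.Add above
		part := uint8((ref << 8) >> 56) // extraction used by partitionedAppender.Add
		head := (ref << 24) >> 24       // the 5 bytes the head appender owns

		fmt.Printf("head=%#x gen=%d part=%d\n", head, gen, part)
	}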
head.go (103 changed lines)

@@ -3,9 +3,9 @@ package tsdb
 import (
 	"errors"
 	"math"
+	"math/rand"
 	"sort"
 	"sync"
-	"sync/atomic"
 	"time"
 
 	"github.com/bradfitz/slice"
@@ -29,8 +29,6 @@ type headBlock struct {
 	// to their chunk descs.
 	hashes map[uint64][]*memSeries
 
-	nextSeriesID uint64
-
 	values   map[string]stringset // label names to possible values
 	postings *memPostings         // postings lists for terms
 
@@ -62,11 +60,10 @@ func openHeadBlock(dir string, l log.Logger) (*headBlock, error) {
 
 	err = wal.ReadAll(&walHandler{
 		series: func(lset labels.Labels) {
-			b.create(uint32(b.nextSeriesID), lset.Hash(), lset)
-			b.nextSeriesID++
+			b.create(lset.Hash(), lset)
 			b.stats.SeriesCount++
 		},
-		sample: func(s hashedSample) {
+		sample: func(s refdSample) {
 			si := s.ref
 
 			cd := b.series[si]
@@ -112,27 +109,27 @@ func (h *headBlock) Appender() Appender {
 
 var headPool = sync.Pool{}
 
-func getHeadAppendBuffer() []hashedSample {
+func getHeadAppendBuffer() []refdSample {
 	b := headPool.Get()
 	if b == nil {
-		return make([]hashedSample, 0, 512)
+		return make([]refdSample, 0, 512)
 	}
-	return b.([]hashedSample)
+	return b.([]refdSample)
 }
 
-func putHeadAppendBuffer(b []hashedSample) {
+func putHeadAppendBuffer(b []refdSample) {
 	headPool.Put(b[:0])
 }
 
 type headAppender struct {
 	*headBlock
 
-	newSeries map[uint32]hashedLabels
-	newHashes map[uint64]uint32
+	newSeries map[uint64]hashedLabels
+	newHashes map[uint64]uint64
+	refmap    map[uint64]uint64
 	newLabels []labels.Labels
-	newRefs   []uint32
 
-	samples []hashedSample
+	samples []refdSample
 }
 
 type hashedLabels struct {
@@ -140,6 +137,12 @@ type hashedLabels struct {
 	labels labels.Labels
 }
 
+type refdSample struct {
+	ref uint64
+	t   int64
+	v   float64
+}
+
 func (a *headAppender) SetSeries(lset labels.Labels) (uint64, error) {
 	return a.setSeries(lset.Hash(), lset)
 }
 
@@ -152,35 +155,41 @@ func (a *headAppender) setSeries(hash uint64, lset labels.Labels) (uint64, error
 		return uint64(ref), nil
 	}
 
-	id := atomic.AddUint64(&a.nextSeriesID, 1) - 1
-	if a.newSeries == nil {
-		a.newSeries = map[uint32]hashedLabels{}
-		a.newHashes = map[uint64]uint32{}
-	}
-	a.newSeries[uint32(id)] = hashedLabels{hash: hash, labels: lset}
-	a.newHashes[hash] = uint32(id)
-	a.newRefs = append(a.newRefs, uint32(id))
+	// We only know the actual reference after committing. We generate an
+	// intermediate reference only valid for this batch.
+	// It is indicated by the the LSB of the 4th byte being set to 1.
+	// We use a random ID to avoid collisions when new series are created
+	// in two subsequent batches. (TODO(fabxc): safe enough?)
+	ref := uint64(rand.Int31()) | (1 << 32)
 
-	return id, nil
+	if a.newSeries == nil {
+		a.newSeries = map[uint64]hashedLabels{}
+		a.newHashes = map[uint64]uint64{}
+		a.refmap = map[uint64]uint64{}
+	}
+	a.newSeries[ref] = hashedLabels{hash: hash, labels: lset}
+	a.newHashes[hash] = ref
+
+	return ref, nil
 }
 
 func (a *headAppender) Add(ref uint64, t int64, v float64) error {
-	// We only act on the last 4 bytes. Anything before is used by higher-order
-	// appenders. We erase it to avoid issues.
-	ref = (ref << 32) >> 32
+	// We only own the first 5 bytes of the reference. Anything before is
+	// used by higher-order appenders. We erase it to avoid issues.
+	ref = (ref << 31) >> 31
 
 	// Distinguish between existing series and series created in
 	// this transaction.
-	if int(ref) >= len(a.series) {
-		if _, ok := a.newSeries[uint32(ref)]; !ok {
+	if ref&(1<<32) > 0 {
+		if _, ok := a.newSeries[ref]; !ok {
 			return errNotFound
 		}
 		// TODO(fabxc): we also have to validate here that the
 		// sample sequence is valid.
 		// We also have to revalidate it as we switch locks an create
 		// the new series.
-		a.samples = append(a.samples, hashedSample{
-			ref: uint32(ref),
+		a.samples = append(a.samples, refdSample{
+			ref: ref,
 			t:   t,
 			v:   v,
 		})
@@ -202,8 +211,8 @@ func (a *headAppender) Add(ref uint64, t int64, v float64) error {
 		return ErrAmendSample
 	}
 
-	a.samples = append(a.samples, hashedSample{
-		ref: uint32(ref),
+	a.samples = append(a.samples, refdSample{
+		ref: ref,
 		t:   t,
 		v:   v,
 	})
@@ -215,21 +224,27 @@ func (a *headAppender) createSeries() {
 		return
 	}
 	a.newLabels = make([]labels.Labels, 0, len(a.newSeries))
+	base0 := len(a.series)
 
 	a.mtx.RUnlock()
 	a.mtx.Lock()
 
-	for _, ref := range a.newRefs {
-		l := a.newSeries[ref]
+	base1 := len(a.series)
+
+	for ref, l := range a.newSeries {
 		// We switched locks and have to re-validate that the series were not
 		// created by another goroutine in the meantime.
-		if int(ref) < len(a.series) && a.series[ref] != nil {
-			continue
-		}
+		if base1 > base0 {
+			if ms := a.get(l.hash, l.labels); ms != nil {
+				a.refmap[ref] = uint64(ms.ref)
+				continue
+			}
+		}
 		// Series is still new.
 		a.newLabels = append(a.newLabels, l.labels)
+		a.refmap[ref] = uint64(len(a.series))
 
-		a.create(ref, l.hash, l.labels)
+		a.create(l.hash, l.labels)
 	}
 
 	a.mtx.Unlock()
@@ -253,7 +268,12 @@ func (a *headAppender) Commit() error {
 		maxt = int64(math.MinInt64)
 	)
 
-	for _, s := range a.samples {
+	for i := range a.samples {
+		s := &a.samples[i]
+
+		if s.ref&(1<<32) > 0 {
+			s.ref = a.refmap[s.ref]
+		}
 		if !a.series[s.ref].append(s.t, s.v) {
 			total--
 		}
@@ -401,17 +421,14 @@ func (h *headBlock) get(hash uint64, lset labels.Labels) *memSeries {
 	return nil
 }
 
-func (h *headBlock) create(ref uint32, hash uint64, lset labels.Labels) *memSeries {
+func (h *headBlock) create(hash uint64, lset labels.Labels) *memSeries {
 	s := &memSeries{
-		ref:  ref,
 		lset: lset,
+		ref:  uint32(len(h.series)),
 	}
 
-	// Allocate empty space until we can insert at the given index.
-	for int(ref) >= len(h.series) {
-		h.series = append(h.series, nil)
-	}
-	h.series[ref] = s
+	h.series = append(h.series, s)
 
 	h.hashes[hash] = append(h.hashes[hash], s)
 
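A condensed, self-contained sketch of the ephemeral-reference scheme above (stub types, not the real headAppender): setSeries hands out a random reference with bit 32 set, commit assigns final slots in creation order and records them in refmap, then rewrites staged sample references. The real patch splits the last two steps across createSeries and Commit.

	package main

	import (
		"fmt"
		"math/rand"
	)

	const ephemeral = 1 << 32 // LSB of the 5th byte marks batch-local refs

	type sample struct {
		ref uint64
		t   int64
		v   float64
	}

	type appender struct {
		series  []string          // index is the final reference
		staged  map[uint64]string // ephemeral ref -> label set
		refmap  map[uint64]uint64 // ephemeral ref -> final ref
		samples []sample
	}

	func (a *appender) setSeries(lset string) uint64 {
		ref := uint64(rand.Int31()) | ephemeral
		a.staged[ref] = lset
		return ref
	}

	func (a *appender) add(ref uint64, t int64, v float64) {
		a.samples = append(a.samples, sample{ref, t, v})
	}

	func (a *appender) commit() {
		// Final refs increase monotonically as series are appended.
		for ref, lset := range a.staged {
			a.refmap[ref] = uint64(len(a.series))
			a.series = append(a.series, lset)
		}
		for i := range a.samples {
			if s := &a.samples[i]; s.ref&ephemeral > 0 {
				s.ref = a.refmap[s.ref] // rewrite to the real reference
			}
		}
	}

	func main() {
		a := &appender{staged: map[uint64]string{}, refmap: map[uint64]uint64{}}
		r := a.setSeries(`{job="api"}`)
		a.add(r, 1000, 0.5)
		a.commit()
		fmt.Println(a.samples[0].ref, a.series[a.samples[0].ref]) // 0 {job="api"}
	}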
postings.go (18 changed lines)

@@ -26,23 +26,7 @@ func (p *memPostings) get(t term) Postings {
 // term argument appears twice.
 func (p *memPostings) add(id uint32, terms ...term) {
 	for _, t := range terms {
-		// We expect IDs to roughly be appended in order but some concurrency
-		// related out of order at the end. We do insertion sort from the end
-		// to account for it.
-		l := p.m[t]
-		i := len(l) - 1
-
-		for ; i >= 0; i-- {
-			if id > l[i] {
-				break
-			}
-		}
-		l = append(l, 0)
-
-		copy(l[i+2:], l[i+1:])
-		l[i+1] = id
-
-		p.m[t] = l
+		p.m[t] = append(p.m[t], id)
 	}
 }
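With series creation now serialized under the write lock and final references assigned as len(h.series), each newly created ID is the largest seen so far, so the plain append preserves sortedness and the insertion sort can go. A sketch of that invariant (the term type is a stub with assumed fields):

	package main

	import "fmt"

	type term struct{ name, value string } // stub; fields assumed

	type memPostings struct {
		m map[term][]uint32
	}

	// add appends id to each term's postings list. Because IDs are handed
	// out in increasing order while the lock is held, every list stays
	// sorted without any insertion sort.
	func (p *memPostings) add(id uint32, terms ...term) {
		for _, t := range terms {
			p.m[t] = append(p.m[t], id)
		}
	}

	func main() {
		p := memPostings{m: map[term][]uint32{}}
		for id := uint32(0); id < 4; id++ {
			p.add(id, term{"job", "api"})
		}
		fmt.Println(p.m[term{"job", "api"}]) // [0 1 2 3]
	}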
wal.go (15 changed lines)

@@ -88,7 +88,7 @@ func OpenWAL(dir string, l log.Logger, flushInterval time.Duration) (*WAL, error
 }
 
 type walHandler struct {
-	sample func(hashedSample)
+	sample func(refdSample)
 	series func(labels.Labels)
 }
 
@@ -110,7 +110,7 @@ func (w *WAL) ReadAll(h *walHandler) error {
 }
 
 // Log writes a batch of new series labels and samples to the log.
-func (w *WAL) Log(series []labels.Labels, samples []hashedSample) error {
+func (w *WAL) Log(series []labels.Labels, samples []refdSample) error {
 	if err := w.enc.encodeSeries(series); err != nil {
 		return err
 	}
@@ -268,7 +268,7 @@ func (e *walEncoder) encodeSeries(series []labels.Labels) error {
 	return e.entry(WALEntrySeries, walSeriesSimple, buf)
 }
 
-func (e *walEncoder) encodeSamples(samples []hashedSample) error {
+func (e *walEncoder) encodeSamples(samples []refdSample) error {
 	if len(samples) == 0 {
 		return nil
 	}
@@ -282,7 +282,7 @@ func (e *walEncoder) encodeSamples(samples []hashedSample) error {
 	// TODO(fabxc): optimize for all samples having the same timestamp.
 	first := samples[0]
 
-	binary.BigEndian.PutUint32(b, first.ref)
+	binary.BigEndian.PutUint64(b, first.ref)
 	buf = append(buf, b[:4]...)
 	binary.BigEndian.PutUint64(b, uint64(first.t))
 	buf = append(buf, b[:8]...)
@@ -351,20 +351,21 @@ func (d *walDecoder) decodeSamples(flag byte, b []byte) error {
 		return errors.Wrap(errInvalidSize, "header length")
 	}
 	var (
-		baseRef  = binary.BigEndian.Uint32(b)
+		baseRef  = binary.BigEndian.Uint64(b)
 		baseTime = int64(binary.BigEndian.Uint64(b[4:]))
 	)
 	b = b[12:]
 
 	for len(b) > 0 {
-		var smpl hashedSample
+		var smpl refdSample
 
 		dref, n := binary.Varint(b)
 		if n < 1 {
 			return errors.Wrap(errInvalidSize, "sample ref delta")
 		}
 		b = b[n:]
-		smpl.ref = uint32(int64(baseRef) + dref)
+
+		smpl.ref = uint64(int64(baseRef) + dref)
 
 		dtime, n := binary.Varint(b)
 		if n < 1 {
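The WAL sample encoding stores the first sample's reference at full width and every sample's reference as a signed varint delta against that base; this commit widens the base from 4 to 8 bytes. A simplified, hypothetical round-trip of just the reference stream (the real encoder interleaves timestamps and values):

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	// encodeRefs writes refs[0] at full width, then one varint delta per sample.
	func encodeRefs(refs []uint64) []byte {
		var (
			buf []byte
			b   [binary.MaxVarintLen64]byte
		)
		binary.BigEndian.PutUint64(b[:8], refs[0]) // 8-byte base ref (was 4 before)
		buf = append(buf, b[:8]...)

		for _, r := range refs {
			n := binary.PutVarint(b[:], int64(r)-int64(refs[0])) // signed delta vs. base
			buf = append(buf, b[:n]...)
		}
		return buf
	}

	// decodeRefs reverses encodeRefs.
	func decodeRefs(b []byte) []uint64 {
		base := binary.BigEndian.Uint64(b)
		b = b[8:]

		var refs []uint64
		for len(b) > 0 {
			d, n := binary.Varint(b)
			b = b[n:]
			refs = append(refs, uint64(int64(base)+d))
		}
		return refs
	}

	func main() {
		in := []uint64{500, 498, 503, 500}
		fmt.Println(decodeRefs(encodeRefs(in))) // [500 498 503 500]
	}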