prometheus/tsdb/head.go

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
	"fmt"
	"math"
	"path/filepath"
	"runtime"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/go-kit/kit/log"
	"github.com/go-kit/kit/log/level"
	"github.com/oklog/ulid"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/prometheus/pkg/labels"
	"github.com/prometheus/prometheus/storage"
	"github.com/prometheus/prometheus/tsdb/chunkenc"
	"github.com/prometheus/prometheus/tsdb/chunks"
	tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
	"github.com/prometheus/prometheus/tsdb/index"
	"github.com/prometheus/prometheus/tsdb/record"
	"github.com/prometheus/prometheus/tsdb/tombstones"
	"github.com/prometheus/prometheus/tsdb/wal"
)

// ErrInvalidSample is the sentinel error reported when an appended sample
// fails validation and therefore cannot be ingested.
var ErrInvalidSample = errors.New("invalid sample")

// Head handles reads and writes of time series data within a time window.
// It holds the in-memory series together with their postings, symbols and
// tombstones, and replays/records data through a write ahead log when one is
// configured (wal may be nil, see Init and Truncate).
type Head struct {
	// Keep all 64bit atomically accessed variables at the top of this struct.
	// See https://golang.org/pkg/sync/atomic/#pkg-note-BUG for more info.
	chunkRange       int64
	numSeries        uint64
	minTime, maxTime int64 // Current min and max of the samples included in the head.
	minValidTime     int64 // Mint allowed to be added to the head. It shouldn't be lower than the maxt of the last persisted block.
	lastSeriesID     uint64

	// metrics is created by newHeadMetrics in NewHead. The pools below hand
	// out reusable buffers (see getAppendBuffer, getSeriesBuffer and
	// getBytesBuffer); memChunkPool produces *memChunk values.
	metrics      *headMetrics
	wal          *wal.WAL
	logger       log.Logger
	appendPool   sync.Pool
	seriesPool   sync.Pool
	bytesPool    sync.Pool
	memChunkPool sync.Pool

	// All series addressable by their ID or hash.
	series         *stripeSeries
	seriesCallback SeriesLifecycleCallback

	// NOTE(review): symMtx presumably guards symbols and values; their users
	// are outside this view - confirm.
	symMtx  sync.RWMutex
	symbols map[string]struct{}
	values  map[string]stringset // Label names to possible values.

	// deletedMtx is held around accesses to deleted (see Truncate).
	deletedMtx sync.Mutex
	deleted    map[uint64]int // Deleted series, and what WAL segment they must be kept until.

	postings *index.MemPostings // Postings lists for terms.

	tombstones *tombstones.MemTombstones

	// iso hands out append IDs and exposes the isolation watermarks
	// (newAppendID, lowWatermark, lastAppendID).
	iso *isolation

	// cardinalityMutex is held for the whole of PostingsCardinalityStats.
	cardinalityMutex      sync.Mutex
	cardinalityCache      *index.PostingsStats // Posting stats cache which will expire after 30sec.
	lastPostingsStatsCall time.Duration        // Last posting stats call (PostingsCardinalityStats()) time for caching.

	// chunkDiskMapper is used to write and read Head chunks to/from disk.
	chunkDiskMapper *chunks.ChunkDiskMapper
	// chunkDirRoot is the parent directory of the chunks directory.
	chunkDirRoot string

	// NOTE(review): closedMtx presumably guards closed; neither is used in
	// the visible part of the file - confirm.
	closedMtx sync.Mutex
	closed    bool
}

// headMetrics groups the Prometheus collectors reported by a Head. All of
// them are constructed, and optionally registered, by newHeadMetrics.
type headMetrics struct {
	activeAppenders          prometheus.Gauge
	series                   prometheus.GaugeFunc
	seriesCreated            prometheus.Counter
	seriesRemoved            prometheus.Counter
	seriesNotFound           prometheus.Counter
	chunks                   prometheus.Gauge
	chunksCreated            prometheus.Counter
	chunksRemoved            prometheus.Counter
	gcDuration               prometheus.Summary
	samplesAppended          prometheus.Counter
	outOfBoundSamples        prometheus.Counter
	outOfOrderSamples        prometheus.Counter
	walTruncateDuration      prometheus.Summary
	walCorruptionsTotal      prometheus.Counter
	headTruncateFail         prometheus.Counter
	headTruncateTotal        prometheus.Counter
	checkpointDeleteFail     prometheus.Counter
	checkpointDeleteTotal    prometheus.Counter
	checkpointCreationFail   prometheus.Counter
	checkpointCreationTotal  prometheus.Counter
	mmapChunkCorruptionTotal prometheus.Counter
}

// newHeadMetrics constructs every collector of headMetrics for the head h.
// When r is non-nil all collectors are registered on it, together with four
// additional function-backed gauges (head min/max time and the isolation
// watermarks) that are not stored in the returned struct. Registration uses
// MustRegister and therefore panics on failure.
func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
	m := &headMetrics{}

	m.activeAppenders = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "prometheus_tsdb_head_active_appenders",
		Help: "Number of currently active appender transactions",
	})
	m.series = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
		Name: "prometheus_tsdb_head_series",
		Help: "Total number of series in the head block.",
	}, func() float64 {
		return float64(h.NumSeries())
	})
	m.seriesCreated = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_head_series_created_total",
		Help: "Total number of series created in the head",
	})
	m.seriesRemoved = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_head_series_removed_total",
		Help: "Total number of series removed in the head",
	})
	m.seriesNotFound = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_head_series_not_found_total",
		Help: "Total number of requests for series that were not found.",
	})
	m.chunks = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "prometheus_tsdb_head_chunks",
		Help: "Total number of chunks in the head block.",
	})
	m.chunksCreated = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_head_chunks_created_total",
		Help: "Total number of chunks created in the head",
	})
	m.chunksRemoved = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_head_chunks_removed_total",
		Help: "Total number of chunks removed in the head",
	})
	m.gcDuration = prometheus.NewSummary(prometheus.SummaryOpts{
		Name: "prometheus_tsdb_head_gc_duration_seconds",
		Help: "Runtime of garbage collection in the head block.",
	})
	m.walTruncateDuration = prometheus.NewSummary(prometheus.SummaryOpts{
		Name: "prometheus_tsdb_wal_truncate_duration_seconds",
		Help: "Duration of WAL truncation.",
	})
	m.walCorruptionsTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_wal_corruptions_total",
		Help: "Total number of WAL corruptions.",
	})
	m.samplesAppended = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_head_samples_appended_total",
		Help: "Total number of appended samples.",
	})
	m.outOfBoundSamples = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_out_of_bound_samples_total",
		Help: "Total number of out of bound samples ingestion failed attempts.",
	})
	m.outOfOrderSamples = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_out_of_order_samples_total",
		Help: "Total number of out of order samples ingestion failed attempts.",
	})
	m.headTruncateFail = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_head_truncations_failed_total",
		Help: "Total number of head truncations that failed.",
	})
	m.headTruncateTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_head_truncations_total",
		Help: "Total number of head truncations attempted.",
	})
	m.checkpointDeleteFail = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_checkpoint_deletions_failed_total",
		Help: "Total number of checkpoint deletions that failed.",
	})
	m.checkpointDeleteTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_checkpoint_deletions_total",
		Help: "Total number of checkpoint deletions attempted.",
	})
	m.checkpointCreationFail = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_checkpoint_creations_failed_total",
		Help: "Total number of checkpoint creations that failed.",
	})
	m.checkpointCreationTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_checkpoint_creations_total",
		Help: "Total number of checkpoint creations attempted.",
	})
	m.mmapChunkCorruptionTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_mmap_chunk_corruptions_total",
		Help: "Total number of memory-mapped chunk corruptions.",
	})

	// Without a registerer there is nothing more to do.
	if r == nil {
		return m
	}

	// Gauges that are only backed by a function and never read through the
	// headMetrics struct are created right here for registration.
	headMaxTime := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
		Name: "prometheus_tsdb_head_max_time",
		Help: "Maximum timestamp of the head block. The unit is decided by the library consumer.",
	}, func() float64 {
		return float64(h.MaxTime())
	})
	headMinTime := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
		Name: "prometheus_tsdb_head_min_time",
		Help: "Minimum time bound of the head block. The unit is decided by the library consumer.",
	}, func() float64 {
		return float64(h.MinTime())
	})
	isoLowWatermark := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
		Name: "prometheus_tsdb_isolation_low_watermark",
		Help: "The lowest TSDB append ID that is still referenced.",
	}, func() float64 {
		return float64(h.iso.lowWatermark())
	})
	isoHighWatermark := prometheus.NewGaugeFunc(prometheus.GaugeOpts{
		Name: "prometheus_tsdb_isolation_high_watermark",
		Help: "The highest TSDB append ID that has been given out.",
	}, func() float64 {
		return float64(h.iso.lastAppendID())
	})

	r.MustRegister(
		m.activeAppenders,
		m.series,
		m.chunks,
		m.chunksCreated,
		m.chunksRemoved,
		m.seriesCreated,
		m.seriesRemoved,
		m.seriesNotFound,
		m.gcDuration,
		m.walTruncateDuration,
		m.walCorruptionsTotal,
		m.samplesAppended,
		m.outOfBoundSamples,
		m.outOfOrderSamples,
		m.headTruncateFail,
		m.headTruncateTotal,
		m.checkpointDeleteFail,
		m.checkpointDeleteTotal,
		m.checkpointCreationFail,
		m.checkpointCreationTotal,
		m.mmapChunkCorruptionTotal,
		headMaxTime,
		headMinTime,
		isoLowWatermark,
		isoHighWatermark,
	)
	return m
}

// cardinalityCacheExpirationTime is how long a result cached by
// PostingsCardinalityStats stays valid before it is recomputed.
const cardinalityCacheExpirationTime = 30 * time.Second

// PostingsCardinalityStats returns top 10 highest cardinality stats By label and value names.
//
// The result is cached: a cached value younger than
// cardinalityCacheExpirationTime (measured with one-second resolution) is
// returned as is, otherwise the stats are recomputed from the postings.
// NOTE(review): the cache is not keyed by statsByLabelName, so a call made
// with a different label name inside the expiration window receives the
// previously cached result.
func (h *Head) PostingsCardinalityStats(statsByLabelName string) *index.PostingsStats {
	h.cardinalityMutex.Lock()
	defer h.cardinalityMutex.Unlock()

	now := time.Duration(time.Now().Unix()) * time.Second
	stillFresh := now-h.lastPostingsStatsCall <= cardinalityCacheExpirationTime
	if stillFresh && h.cardinalityCache != nil {
		return h.cardinalityCache
	}

	// Expired or never computed: rebuild and stamp the cache.
	h.cardinalityCache = h.postings.Stats(statsByLabelName)
	h.lastPostingsStatsCall = time.Duration(time.Now().Unix()) * time.Second
	return h.cardinalityCache
}

// NewHead opens the head block in dir.
// stripeSize sets the number of entries in the hash map, it must be a power of 2.
// A larger stripeSize will allocate more memory up-front, but will increase performance when handling a large number of series.
// A smaller stripeSize reduces the memory allocated, but can decrease performance with large number of series.
//
// A nil logger, chunk pool or seriesCallback is replaced by a no-op logger, a
// fresh chunkenc pool and a no-op callback respectively. An error is returned
// when chunkRange is smaller than 1 or when the chunk disk mapper for
// chkDirRoot cannot be created.
func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, chunkRange int64, chkDirRoot string, pool chunkenc.Pool, stripeSize int, seriesCallback SeriesLifecycleCallback) (*Head, error) {
	if chunkRange < 1 {
		return nil, errors.Errorf("invalid chunk range %d", chunkRange)
	}

	// Fill in defaults for the optional collaborators.
	if l == nil {
		l = log.NewNopLogger()
	}
	if seriesCallback == nil {
		seriesCallback = &noopSeriesLifecycleCallback{}
	}
	if pool == nil {
		pool = chunkenc.NewPool()
	}

	head := &Head{
		wal:            wal,
		logger:         l,
		chunkRange:     chunkRange,
		chunkDirRoot:   chkDirRoot,
		seriesCallback: seriesCallback,
		// Sentinel bounds mark a head that has not seen any sample yet.
		minTime:    math.MaxInt64,
		maxTime:    math.MinInt64,
		series:     newStripeSeries(stripeSize, seriesCallback),
		values:     map[string]stringset{},
		symbols:    map[string]struct{}{},
		postings:   index.NewUnorderedMemPostings(),
		tombstones: tombstones.NewMemTombstones(),
		iso:        newIsolation(),
		deleted:    map[uint64]int{},
		memChunkPool: sync.Pool{
			New: func() interface{} { return &memChunk{} },
		},
	}
	head.metrics = newHeadMetrics(head, r)

	mapper, err := chunks.NewChunkDiskMapper(mmappedChunksDir(chkDirRoot), pool)
	if err != nil {
		return nil, err
	}
	head.chunkDiskMapper = mapper

	return head, nil
}

// mmappedChunksDir returns the path of the "chunks_head" directory located
// directly under dir.
func mmappedChunksDir(dir string) string {
	const headChunksDirName = "chunks_head"
	return filepath.Join(dir, headChunksDirName)
}

// processWALSamples appends every sample batch received on input to the head
// and then hands the batch on through output. It runs until input is closed
// and closes output when it returns.
// Samples older than minValidTime are dropped. Samples whose series reference
// is unknown to the head are skipped and counted in the returned unknownRefs.
// Once input is drained, the head's min/max time is widened to cover the
// appended samples.
func (h *Head) processWALSamples(
	minValidTime int64,
	input <-chan []record.RefSample, output chan<- []record.RefSample,
) (unknownRefs uint64) {
	defer close(output)

	// Remember resolved series locally to mitigate lock contention in getByID.
	resolved := make(map[uint64]*memSeries)

	var (
		lowest  = int64(math.MaxInt64)
		highest = int64(math.MinInt64)
	)

	for batch := range input {
		for _, smpl := range batch {
			if smpl.T < minValidTime {
				continue
			}

			series, cached := resolved[smpl.Ref]
			if !cached {
				series = h.series.getByID(smpl.Ref)
				if series == nil {
					unknownRefs++
					continue
				}
				resolved[smpl.Ref] = series
			}

			_, chunkCreated := series.append(smpl.T, smpl.V, 0, h.chunkDiskMapper)
			if chunkCreated {
				h.metrics.chunksCreated.Inc()
				h.metrics.chunks.Inc()
			}

			if smpl.T < lowest {
				lowest = smpl.T
			}
			if smpl.T > highest {
				highest = smpl.T
			}
		}
		output <- batch
	}

	h.updateMinMaxTime(lowest, highest)
	return unknownRefs
}

// updateMinMaxTime widens the head's time range so that it includes
// [mint, maxt]: minTime is lowered to mint and maxTime raised to maxt when
// they lie outside the current bounds. Each bound is updated with a
// compare-and-swap retry loop so concurrent updates are not lost.
func (h *Head) updateMinMaxTime(mint, maxt int64) {
	for cur := h.MinTime(); mint < cur; cur = h.MinTime() {
		if atomic.CompareAndSwapInt64(&h.minTime, cur, mint) {
			break
		}
	}
	for cur := h.MaxTime(); maxt > cur; cur = h.MaxTime() {
		if atomic.CompareAndSwapInt64(&h.maxTime, cur, maxt) {
			break
		}
	}
}

// loadWAL replays all records available from r into the head.
//
// Series records create series in the head; for a newly created series the
// m-mapped chunks found for its ref in mmappedChunks are attached. When a
// record describes a series that already exists under another ref, the
// mapping from the record's ref to the existing ref is stored in multiRef so
// later sample records can be redirected. Sample records are sharded by
// series ref over GOMAXPROCS workers (see processWALSamples), and tombstone
// records are added to the head's tombstones.
//
// A record that fails to decode, or has an unknown type, is reported as a
// *wal.CorruptionErr carrying the segment and offset. Errors from series
// creation and from the reader itself are returned as well.
func (h *Head) loadWAL(r *wal.Reader, multiRef map[uint64]uint64, mmappedChunks map[uint64][]*mmappedChunk) (err error) {
	// Track number of samples that referenced a series we don't know about
	// for error reporting.
	var unknownRefs uint64

	// Start workers that each process samples for a partition of the series ID space.
	// They are connected through a ring of channels which ensures that all sample batches
	// read from the WAL are processed in order.
	var (
		wg      sync.WaitGroup
		n       = runtime.GOMAXPROCS(0)
		inputs  = make([]chan []record.RefSample, n)
		outputs = make([]chan []record.RefSample, n)

		dec    record.Decoder
		shards = make([][]record.RefSample, n)

		decoded                      = make(chan interface{}, 10)
		decodeErr, seriesCreationErr error
		seriesPool                   = sync.Pool{
			New: func() interface{} {
				return []record.RefSeries{}
			},
		}
		samplesPool = sync.Pool{
			New: func() interface{} {
				return []record.RefSample{}
			},
		}
		tstonesPool = sync.Pool{
			New: func() interface{} {
				return []tombstones.Stone{}
			},
		}
	)

	defer func() {
		// On the early-return error paths (corruption or failed series
		// creation) the workers are still running: terminate them all
		// before exiting.
		_, ok := err.(*wal.CorruptionErr)
		if ok || seriesCreationErr != nil {
			for i := 0; i < n; i++ {
				close(inputs[i])
				for range outputs[i] {
				}
			}
			wg.Wait()
		}
	}()

	wg.Add(n)
	for i := 0; i < n; i++ {
		outputs[i] = make(chan []record.RefSample, 300)
		inputs[i] = make(chan []record.RefSample, 300)

		go func(input <-chan []record.RefSample, output chan<- []record.RefSample) {
			unknown := h.processWALSamples(h.minValidTime, input, output)
			atomic.AddUint64(&unknownRefs, unknown)
			wg.Done()
		}(inputs[i], outputs[i])
	}

	// Decode records in a separate goroutine and hand them over through the
	// decoded channel, which is closed once reading stops.
	go func() {
		defer close(decoded)
		// Use a goroutine-local error: decode failures are reported through
		// decodeErr and must not write to the named result of loadWAL.
		var err error
		for r.Next() {
			rec := r.Record()
			switch dec.Type(rec) {
			case record.Series:
				series := seriesPool.Get().([]record.RefSeries)[:0]
				series, err = dec.Series(rec, series)
				if err != nil {
					decodeErr = &wal.CorruptionErr{
						Err:     errors.Wrap(err, "decode series"),
						Segment: r.Segment(),
						Offset:  r.Offset(),
					}
					return
				}
				decoded <- series
			case record.Samples:
				samples := samplesPool.Get().([]record.RefSample)[:0]
				samples, err = dec.Samples(rec, samples)
				if err != nil {
					decodeErr = &wal.CorruptionErr{
						Err:     errors.Wrap(err, "decode samples"),
						Segment: r.Segment(),
						Offset:  r.Offset(),
					}
					return
				}
				decoded <- samples
			case record.Tombstones:
				tstones := tstonesPool.Get().([]tombstones.Stone)[:0]
				tstones, err = dec.Tombstones(rec, tstones)
				if err != nil {
					decodeErr = &wal.CorruptionErr{
						Err:     errors.Wrap(err, "decode tombstones"),
						Segment: r.Segment(),
						Offset:  r.Offset(),
					}
					return
				}
				decoded <- tstones
			default:
				decodeErr = &wal.CorruptionErr{
					Err:     errors.Errorf("invalid record type %v", dec.Type(rec)),
					Segment: r.Segment(),
					Offset:  r.Offset(),
				}
				return
			}
		}
	}()

Outer:
	for d := range decoded {
		switch v := d.(type) {
		case []record.RefSeries:
			for _, s := range v {
				series, created, err := h.getOrCreateWithID(s.Ref, s.Labels.Hash(), s.Labels)
				if err != nil {
					seriesCreationErr = err
					break Outer
				}

				if created {
					// If this series gets a duplicate record, we don't restore its mmapped chunks,
					// and instead restore everything from WAL records.
					series.mmappedChunks = mmappedChunks[series.ref]

					h.metrics.chunks.Add(float64(len(series.mmappedChunks)))
					h.metrics.chunksCreated.Add(float64(len(series.mmappedChunks)))

					if len(series.mmappedChunks) > 0 {
						h.updateMinMaxTime(series.minTime(), series.maxTime())
					}
				} else {
					// TODO(codesome) Discard old samples and mmapped chunks and use mmap chunks for the new series ID.

					// There's already a different ref for this series.
					multiRef[s.Ref] = series.ref
				}

				if h.lastSeriesID < s.Ref {
					h.lastSeriesID = s.Ref
				}
			}
			//lint:ignore SA6002 relax staticcheck verification.
			seriesPool.Put(v)
		case []record.RefSample:
			samples := v
			// We split up the samples into chunks of 5000 samples or less.
			// With O(300 * #cores) in-flight sample batches, large scrapes could otherwise
			// cause thousands of very large in flight buffers occupying large amounts
			// of unused memory.
			for len(samples) > 0 {
				m := 5000
				if len(samples) < m {
					m = len(samples)
				}
				// Reuse a buffer a worker has already finished with, if one
				// is available; otherwise start from an empty shard.
				for i := 0; i < n; i++ {
					var buf []record.RefSample
					select {
					case buf = <-outputs[i]:
					default:
					}
					shards[i] = buf[:0]
				}
				for _, sam := range samples[:m] {
					// Redirect samples of duplicate series records to the
					// ref the series is actually stored under.
					if ref, ok := multiRef[sam.Ref]; ok {
						sam.Ref = ref
					}
					mod := sam.Ref % uint64(n)
					shards[mod] = append(shards[mod], sam)
				}
				for i := 0; i < n; i++ {
					inputs[i] <- shards[i]
				}
				samples = samples[m:]
			}
			//lint:ignore SA6002 relax staticcheck verification.
			samplesPool.Put(v)
		case []tombstones.Stone:
			for _, s := range v {
				for _, itv := range s.Intervals {
					if itv.Maxt < h.minValidTime {
						continue
					}
					if m := h.series.getByID(s.Ref); m == nil {
						unknownRefs++
						continue
					}
					h.tombstones.AddInterval(s.Ref, itv)
				}
			}
			//lint:ignore SA6002 relax staticcheck verification.
			tstonesPool.Put(v)
		default:
			panic(fmt.Errorf("unexpected decoded type: %T", d))
		}
	}

	if decodeErr != nil {
		return decodeErr
	}
	if seriesCreationErr != nil {
		// Drain the channel to unblock the goroutine.
		for range decoded {
		}
		return seriesCreationErr
	}

	// Signal termination to each worker and wait for it to close its output channel.
	for i := 0; i < n; i++ {
		close(inputs[i])
		for range outputs[i] {
		}
	}
	wg.Wait()

	if r.Err() != nil {
		return errors.Wrap(r.Err(), "read records")
	}

	if unknownRefs > 0 {
		level.Warn(h.logger).Log("msg", "Unknown series references", "count", unknownRefs)
	}
	return nil
}

// Init loads data from the write ahead log and prepares the head for writes.
// It should be called before using an appender so that it
// limits the ingested samples to the head min valid time.
//
// The on-disk m-mapped chunks are loaded first. If that fails, the corrupted
// chunk files are removed and whatever can be re-mapped afterwards is used;
// the remaining data is recovered from the WAL. The last checkpoint (if any)
// is replayed next, followed by all WAL segments after it. When the head has
// no WAL, Init only sets the min valid time and runs the deferred gc and
// postings ordering.
func (h *Head) Init(minValidTime int64) error {
	h.minValidTime = minValidTime
	defer h.postings.EnsureOrder()
	defer h.gc() // After loading the wal remove the obsolete data from the head.

	if h.wal == nil {
		return nil
	}

	level.Info(h.logger).Log("msg", "Replaying WAL and on-disk memory mappable chunks if any, this may take a while")
	start := time.Now()

	mmappedChunks, err := h.loadMmappedChunks()
	if err != nil {
		level.Error(h.logger).Log("msg", "Loading on-disk chunks failed", "err", err)
		if _, ok := errors.Cause(err).(*chunks.CorruptionErr); ok {
			h.metrics.mmapChunkCorruptionTotal.Inc()
		}
		// If this fails, data will be recovered from WAL.
		// Hence we won't lose any data (given WAL is not corrupt).
		// Keep the chunks that could be re-mapped after the repair so they
		// are attached to their series during WAL replay instead of being
		// silently dropped.
		mmappedChunks = h.removeCorruptedMmappedChunks(err)
	}

	// Backfill the checkpoint first if it exists.
	dir, startFrom, err := wal.LastCheckpoint(h.wal.Dir())
	if err != nil && err != record.ErrNotFound {
		return errors.Wrap(err, "find last checkpoint")
	}
	multiRef := map[uint64]uint64{}
	if err == nil {
		sr, err := wal.NewSegmentsReader(dir)
		if err != nil {
			return errors.Wrap(err, "open checkpoint")
		}
		defer func() {
			if err := sr.Close(); err != nil {
				level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
			}
		}()

		// A corrupted checkpoint is a hard error for now and requires user
		// intervention. There's likely little data that can be recovered anyway.
		if err := h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks); err != nil {
			return errors.Wrap(err, "backfill checkpoint")
		}
		// The checkpoint covers segments up to startFrom; continue after it.
		startFrom++
		level.Info(h.logger).Log("msg", "WAL checkpoint loaded")
	}

	// Find the last segment.
	_, last, err := h.wal.Segments()
	if err != nil {
		return errors.Wrap(err, "finding WAL segments")
	}

	// Backfill segments from the most recent checkpoint onwards.
	for i := startFrom; i <= last; i++ {
		s, err := wal.OpenReadSegment(wal.SegmentName(h.wal.Dir(), i))
		if err != nil {
			return errors.Wrap(err, fmt.Sprintf("open WAL segment: %d", i))
		}

		sr := wal.NewSegmentBufReader(s)
		err = h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks)
		// Close the reader before looking at the replay error so the segment
		// is released on both paths.
		if err := sr.Close(); err != nil {
			level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
		}
		if err != nil {
			return err
		}
		level.Info(h.logger).Log("msg", "WAL segment loaded", "segment", i, "maxSegment", last)
	}

	level.Info(h.logger).Log("msg", "WAL replay completed", "duration", time.Since(start).String())

	return nil
}

// loadMmappedChunks iterates over all chunks known to the chunk disk mapper
// and groups them by series reference, in iteration order. Chunks whose maxt
// lies before the head's min valid time are ignored. An error is returned if
// a chunk of a series does not start strictly after the previous chunk of
// the same series ended, or if the iteration itself fails.
func (h *Head) loadMmappedChunks() (map[uint64][]*mmappedChunk, error) {
	bySeries := make(map[uint64][]*mmappedChunk)

	collect := func(seriesRef, chunkRef uint64, mint, maxt int64, numSamples uint16) error {
		if maxt < h.minValidTime {
			return nil
		}

		existing := bySeries[seriesRef]
		if n := len(existing); n > 0 && existing[n-1].maxTime >= mint {
			return errors.Errorf("out of sequence m-mapped chunk for series ref %d", seriesRef)
		}

		bySeries[seriesRef] = append(existing, &mmappedChunk{
			ref:        chunkRef,
			minTime:    mint,
			maxTime:    maxt,
			numSamples: numSamples,
		})
		return nil
	}

	if err := h.chunkDiskMapper.IterateAllChunks(collect); err != nil {
		return nil, errors.Wrap(err, "iterate on on-disk chunks")
	}
	return bySeries, nil
}

// removeCorruptedMmappedChunks asks the chunk disk mapper to delete the chunk
// files affected by err and then tries to load the m-mapped chunks again.
// It returns the reloaded chunks on success. If either the deletion or the
// reload fails, an empty map is returned so that no m-mapped chunks are used.
func (h *Head) removeCorruptedMmappedChunks(err error) map[uint64][]*mmappedChunk {
	level.Info(h.logger).Log("msg", "Deleting mmapped chunk files")

	if delErr := h.chunkDiskMapper.DeleteCorrupted(err); delErr != nil {
		level.Info(h.logger).Log("msg", "Deletion of mmap chunk files failed, discarding chunk files completely", "err", delErr)
		return map[uint64][]*mmappedChunk{}
	}

	level.Info(h.logger).Log("msg", "Deletion of mmap chunk files successful, reattempting m-mapping the on-disk chunks")
	reloaded, loadErr := h.loadMmappedChunks()
	if loadErr == nil {
		return reloaded
	}

	level.Error(h.logger).Log("msg", "Loading on-disk chunks failed, discarding chunk files completely", "err", loadErr)
	return map[uint64][]*mmappedChunk{}
}

// Truncate removes old data before mint from the head.
//
// It moves the head's min time and min valid time to mint, garbage collects
// the head, truncates the chunk disk mapper and, when a WAL is configured,
// writes a checkpoint over the lower two thirds of the WAL segments and
// truncates the segments covered by it. Any returned error also increments
// the failed-truncation counter.
func (h *Head) Truncate(mint int64) (err error) {
	defer func() {
		if err != nil {
			h.metrics.headTruncateFail.Inc()
		}
	}()

	// A min time still at its sentinel means the head has never been given
	// a time range.
	firstCall := h.MinTime() == math.MaxInt64
	if !firstCall && h.MinTime() >= mint {
		return nil
	}
	atomic.StoreInt64(&h.minTime, mint)
	atomic.StoreInt64(&h.minValidTime, mint)

	// Never leave max time below the new min time.
	for h.MaxTime() < mint {
		atomic.CompareAndSwapInt64(&h.maxTime, h.MaxTime(), mint)
	}

	// This was an initial call to Truncate after loading blocks on startup.
	// The WAL has not been read back yet, so do not attempt to truncate it.
	if firstCall {
		return nil
	}

	h.metrics.headTruncateTotal.Inc()
	start := time.Now()

	h.gc()
	level.Info(h.logger).Log("msg", "Head GC completed", "duration", time.Since(start))
	h.metrics.gcDuration.Observe(time.Since(start).Seconds())

	// Truncate the chunk m-mapper.
	if mapperErr := h.chunkDiskMapper.Truncate(mint); mapperErr != nil {
		return errors.Wrap(mapperErr, "truncate chunks.HeadReadWriter")
	}

	if h.wal == nil {
		return nil
	}
	start = time.Now()

	first, last, err := h.wal.Segments()
	if err != nil {
		return errors.Wrap(err, "get segment range")
	}
	// Start a new segment, so that a TSDB with low ingestion volume doesn't
	// keep more WAL than needed.
	if err = h.wal.NextSegment(); err != nil {
		return errors.Wrap(err, "next segment")
	}
	last-- // Never consider last segment for checkpoint.
	if last < 0 {
		return nil // no segments yet.
	}
	// The lower two thirds of segments should contain mostly obsolete samples.
	// If we have less than two segments, it's not worth checkpointing yet.
	// With the default 2h blocks, this will keep up to around 3h worth
	// of WAL segments.
	last = first + (last-first)*2/3
	if last <= first {
		return nil
	}

	// keep reports whether records of the series must survive the
	// checkpoint: either the series is still in the head or it is tracked
	// as deleted.
	keep := func(id uint64) bool {
		if h.series.getByID(id) != nil {
			return true
		}
		h.deletedMtx.Lock()
		defer h.deletedMtx.Unlock()
		_, tracked := h.deleted[id]
		return tracked
	}

	h.metrics.checkpointCreationTotal.Inc()
	if _, err = wal.Checkpoint(h.wal, first, last, keep, mint); err != nil {
		h.metrics.checkpointCreationFail.Inc()
		return errors.Wrap(err, "create checkpoint")
	}
	if truncErr := h.wal.Truncate(last + 1); truncErr != nil {
		// If truncating fails, we'll just try again at the next checkpoint.
		// Leftover segments will just be ignored in the future if there's a checkpoint
		// that supersedes them.
		level.Error(h.logger).Log("msg", "truncating segments failed", "err", truncErr)
	}

	// The checkpoint is written and the segments before it are truncated, so
	// deleted series that are before it no longer need to be tracked.
	h.deletedMtx.Lock()
	for ref, segment := range h.deleted {
		if segment < first {
			delete(h.deleted, ref)
		}
	}
	h.deletedMtx.Unlock()

	h.metrics.checkpointDeleteTotal.Inc()
	if delErr := wal.DeleteCheckpoints(h.wal.Dir(), last); delErr != nil {
		// Leftover old checkpoints do not cause problems down the line beyond
		// occupying disk space.
		// They will just be ignored since a higher checkpoint exists.
		level.Error(h.logger).Log("msg", "delete old checkpoints", "err", delErr)
		h.metrics.checkpointDeleteFail.Inc()
	}
	h.metrics.walTruncateDuration.Observe(time.Since(start).Seconds())

	level.Info(h.logger).Log("msg", "WAL checkpoint complete",
		"first", first, "last", last, "duration", time.Since(start))

	return nil
}

// initTime sets the head's min time to t if, and only if, the head has no
// min time yet (it still holds the math.MaxInt64 sentinel). This only needs
// to be called for a completely fresh head with an empty WAL.
// It reports whether this call performed the initialization.
func (h *Head) initTime(t int64) (initialized bool) {
	initialized = atomic.CompareAndSwapInt64(&h.minTime, math.MaxInt64, t)
	if initialized {
		// Bring max time up to the min time that was just set, unless a
		// concurrent appender has already moved it off its sentinel.
		atomic.CompareAndSwapInt64(&h.maxTime, math.MinInt64, t)
	}
	return initialized
}

// Stats is a snapshot of head statistics as returned by Head.Stats: the
// number of series, the head's min and max time and the postings
// cardinality statistics.
type Stats struct {
	NumSeries         uint64
	MinTime, MaxTime  int64
	IndexPostingStats *index.PostingsStats
}

// Stats returns important current HEAD statistics. Note that it is expensive to
// calculate these.
// statsByLabelName is passed on to PostingsCardinalityStats.
func (h *Head) Stats(statsByLabelName string) *Stats {
	snapshot := new(Stats)
	snapshot.NumSeries = h.NumSeries()
	snapshot.MaxTime = h.MaxTime()
	snapshot.MinTime = h.MinTime()
	snapshot.IndexPostingStats = h.PostingsCardinalityStats(statsByLabelName)
	return snapshot
}

// RangeHead is a view on a Head restricted to the time range [mint, maxt]
// given at construction. Its readers and metadata are derived from the
// underlying head.
type RangeHead struct {
	head       *Head
	mint, maxt int64
}

// NewRangeHead returns a *RangeHead over head that is bounded by mint and maxt.
func NewRangeHead(head *Head, mint, maxt int64) *RangeHead {
	rh := new(RangeHead)
	rh.head = head
	rh.mint, rh.maxt = mint, maxt
	return rh
}

// Index returns the head's index reader for the range [mint, maxt] of this
// RangeHead. The returned error is always nil.
func (h *RangeHead) Index() (IndexReader, error) {
	return h.head.indexRange(h.mint, h.maxt), nil
}

// Chunks returns the head's chunk reader for the range [mint, maxt] of this
// RangeHead, created with the isolation state obtained from the head at call
// time.
func (h *RangeHead) Chunks() (ChunkReader, error) {
	return h.head.chunksRange(h.mint, h.maxt, h.head.iso.State())
}

// Tombstones returns the tombstones of the underlying head; they are not
// restricted to the RangeHead's time range. The returned error is always nil.
func (h *RangeHead) Tombstones() (tombstones.Reader, error) {
	return h.head.tombstones, nil
}

// MinTime returns the lower time bound the RangeHead was created with.
func (h *RangeHead) MinTime() int64 {
	return h.mint
}

// MaxTime returns the upper time bound the RangeHead was created with.
func (h *RangeHead) MaxTime() int64 {
	return h.maxt
}

// NumSeries returns the number of series of the underlying head; the count
// is not restricted to the RangeHead's time range.
func (h *RangeHead) NumSeries() uint64 {
	return h.head.NumSeries()
}

// Meta returns block metadata for the RangeHead: its own time bounds, the
// ULID reported by the underlying head's Meta and the head's series count.
func (h *RangeHead) Meta() BlockMeta {
	var meta BlockMeta
	meta.MinTime = h.MinTime()
	meta.MaxTime = h.MaxTime()
	meta.ULID = h.head.Meta().ULID
	meta.Stats = BlockStats{NumSeries: h.NumSeries()}
	return meta
}

// initAppender is a helper to initialize the time bounds of the head
// upon the first sample it receives.
// app stays nil until the first call to Add, which creates the real head
// appender; until then the other methods have nothing to delegate to.
type initAppender struct {
	app  storage.Appender
	head *Head
}

// Add appends the sample through the underlying head appender. On the first
// call the head's time bounds are initialized from t and the underlying
// appender is created; every call is then delegated to it.
func (a *initAppender) Add(lset labels.Labels, t int64, v float64) (uint64, error) {
	if a.app == nil {
		a.head.initTime(t)
		a.app = a.head.appender()
	}
	return a.app.Add(lset, t, v)
}

// AddFast delegates to the underlying appender. Before any sample has been
// added through Add there is no underlying appender, hence no known
// reference, and storage.ErrNotFound is returned.
func (a *initAppender) AddFast(ref uint64, t int64, v float64) error {
	if a.app != nil {
		return a.app.AddFast(ref, t, v)
	}
	return storage.ErrNotFound
}

// Commit commits the samples added so far by delegating to the underlying
// appender. If nothing was ever added there is no underlying appender and
// nothing to commit; in that case only the active-appenders gauge, which
// Head.Appender incremented when it handed out this appender, is released.
// NOTE(review): the delegated path relies on the underlying appender's
// Commit to release the gauge; that code is outside this view - confirm.
func (a *initAppender) Commit() error {
	if a.app == nil {
		a.head.metrics.activeAppenders.Dec()
		return nil
	}
	return a.app.Commit()
}

// Rollback discards the samples added so far by delegating to the underlying
// appender. If nothing was ever added there is no underlying appender and
// nothing to roll back; in that case only the active-appenders gauge, which
// Head.Appender incremented when it handed out this appender, is released.
// NOTE(review): the delegated path relies on the underlying appender's
// Rollback to release the gauge; that code is outside this view - confirm.
func (a *initAppender) Rollback() error {
	if a.app == nil {
		a.head.metrics.activeAppenders.Dec()
		return nil
	}
	return a.app.Rollback()
}

// Appender returns a new Appender on the database and counts it in the
// active-appenders gauge.
func (h *Head) Appender() storage.Appender {
	h.metrics.activeAppenders.Inc()

	if h.MinTime() != math.MaxInt64 {
		return h.appender()
	}
	// The head has no starting point yet. The init appender picks up the
	// first appended timestamp as the base.
	return &initAppender{head: h}
}

// appender creates a headAppender with a fresh append ID, the current
// isolation low watermark and sample/series buffers taken from the head's
// pools.
func (h *Head) appender() *headAppender {
	id := h.iso.newAppendID()
	lowWatermark := h.iso.lowWatermark()

	// The minimum valid time is whichever is greater: the head min valid
	// time or the start of the compaction window. This ensures that no
	// samples will be added within the compaction window to avoid races.
	headMinValid := atomic.LoadInt64(&h.minValidTime)
	compactionWindowStart := h.MaxTime() - h.chunkRange/2

	app := &headAppender{
		head:                  h,
		minValidTime:          max(headMinValid, compactionWindowStart),
		mint:                  math.MaxInt64,
		maxt:                  math.MinInt64,
		appendID:              id,
		cleanupAppendIDsBelow: lowWatermark,
	}
	app.samples = h.getAppendBuffer()
	app.sampleSeries = h.getSeriesBuffer()
	return app
}

// max returns the larger of a and b.
func max(a, b int64) int64 {
	if b >= a {
		return b
	}
	return a
}

// getAppendBuffer returns a sample buffer from the head's append pool, or a
// new empty one with capacity 512 when the pool has none.
func (h *Head) getAppendBuffer() []record.RefSample {
	pooled := h.appendPool.Get()
	if pooled != nil {
		return pooled.([]record.RefSample)
	}
	return make([]record.RefSample, 0, 512)
}

// putAppendBuffer truncates b to length zero and returns it to the head's
// append pool for reuse.
func (h *Head) putAppendBuffer(b []record.RefSample) {
	emptied := b[:0]
	//lint:ignore SA6002 safe to ignore and actually fixing it has some performance penalty.
	h.appendPool.Put(emptied)
}

// getSeriesBuffer returns a series buffer from the head's series pool, or a
// new empty one with capacity 512 when the pool has none.
func (h *Head) getSeriesBuffer() []*memSeries {
	pooled := h.seriesPool.Get()
	if pooled != nil {
		return pooled.([]*memSeries)
	}
	return make([]*memSeries, 0, 512)
}

// putSeriesBuffer truncates b to length zero and returns it to the head's
// series pool for reuse.
func (h *Head) putSeriesBuffer(b []*memSeries) {
	emptied := b[:0]
	//lint:ignore SA6002 safe to ignore and actually fixing it has some performance penalty.
	h.seriesPool.Put(emptied)
}

// getBytesBuffer returns a byte buffer from the head's bytes pool, or a new
// empty one with capacity 1024 when the pool has none.
func (h *Head) getBytesBuffer() []byte {
	pooled := h.bytesPool.Get()
	if pooled != nil {
		return pooled.([]byte)
	}
	return make([]byte, 0, 1024)
}

// putBytesBuffer truncates b to zero length and hands it back to the head's
// bytes pool.
func (h *Head) putBytesBuffer(b []byte) {
	reset := b[:0]
	//lint:ignore SA6002 safe to ignore and actually fixing it has some performance penalty.
	h.bytesPool.Put(reset)
}

// headAppender collects samples and newly created series for one append against
// the head. The collected data is applied by Commit or discarded by Rollback.
type headAppender struct {
	head         *Head
	minValidTime int64 // No samples below this timestamp are allowed.
	// Smallest and largest timestamp among the samples accepted by AddFast so far.
	mint, maxt   int64

	// Series that Add created in the head; log writes them to the WAL.
	series       []record.RefSeries
	// Samples accepted by AddFast that wait for Commit.
	samples      []record.RefSample
	// sampleSeries[i] is the series that samples[i] belongs to.
	sampleSeries []*memSeries

	// appendID is the ID obtained from the head's isolation for this append.
	// cleanupAppendIDsBelow is the isolation low watermark read when the appender
	// was created; it is handed to each touched series on Commit and Rollback.
	appendID, cleanupAppendIDsBelow uint64
}

// Add validates the label set, looks up or creates the matching series in the
// head and then buffers the sample (t, v) for it via AddFast.
// It returns the series reference together with the result of AddFast.
// Samples older than the appender's minimum valid time are rejected with
// storage.ErrOutOfBounds; empty label sets and duplicate label names are
// rejected with a wrapped ErrInvalidSample.
func (a *headAppender) Add(lset labels.Labels, t int64, v float64) (uint64, error) {
	if t < a.minValidTime {
		a.head.metrics.outOfBoundSamples.Inc()
		return 0, storage.ErrOutOfBounds
	}

	// Ensure no empty labels have gotten through before validating the set.
	lset = lset.WithoutEmpty()
	if len(lset) == 0 {
		return 0, errors.Wrap(ErrInvalidSample, "empty labelset")
	}
	if name, duplicated := lset.HasDuplicateLabelNames(); duplicated {
		msg := fmt.Sprintf(`label name "%s" is not unique`, name)
		return 0, errors.Wrap(ErrInvalidSample, msg)
	}

	series, isNew, err := a.head.getOrCreate(lset.Hash(), lset)
	if err != nil {
		return 0, err
	}
	if isNew {
		// Remember the new series so that log writes it to the WAL.
		a.series = append(a.series, record.RefSeries{Ref: series.ref, Labels: lset})
	}

	ref := series.ref
	return ref, a.AddFast(ref, t, v)
}

// AddFast buffers the sample (t, v) for the series with the given reference.
// It returns storage.ErrOutOfBounds for samples below the appender's minimum
// valid time, a wrapped storage.ErrNotFound for unknown references, and
// whatever error the series' appendable check reports.
func (a *headAppender) AddFast(ref uint64, t int64, v float64) error {
	if t < a.minValidTime {
		a.head.metrics.outOfBoundSamples.Inc()
		return storage.ErrOutOfBounds
	}

	series := a.head.series.getByID(ref)
	if series == nil {
		return errors.Wrap(storage.ErrNotFound, "unknown series")
	}

	// Check the sample and flag the series as having pending samples under a
	// single acquisition of the series lock.
	series.Lock()
	err := series.appendable(t, v)
	if err == nil {
		series.pendingCommit = true
	}
	series.Unlock()

	if err != nil {
		if err == storage.ErrOutOfOrderSample {
			a.head.metrics.outOfOrderSamples.Inc()
		}
		return err
	}

	// Track the time range covered by the buffered samples.
	if a.mint > t {
		a.mint = t
	}
	if a.maxt < t {
		a.maxt = t
	}

	a.samples = append(a.samples, record.RefSample{Ref: ref, T: t, V: v})
	a.sampleSeries = append(a.sampleSeries, series)
	return nil
}

// log writes the appender's buffered series and samples to the head's WAL, series
// first. It is a no-op when the head has no WAL.
func (a *headAppender) log() error {
	w := a.head.wal
	if w == nil {
		return nil
	}

	buf := a.head.getBytesBuffer()
	// The closure reads buf on return, so the buffer handed back to the pool is
	// the one left behind by the last encode below.
	defer func() { a.head.putBytesBuffer(buf) }()

	var enc record.Encoder

	if len(a.series) != 0 {
		rec := enc.Series(a.series, buf)
		buf = rec[:0]
		if err := w.Log(rec); err != nil {
			return errors.Wrap(err, "log series")
		}
	}
	if len(a.samples) != 0 {
		rec := enc.Samples(a.samples, buf)
		buf = rec[:0]
		if err := w.Log(rec); err != nil {
			return errors.Wrap(err, "log samples")
		}
	}
	return nil
}

// Commit writes the buffered data to the WAL and then appends every buffered
// sample to its series. If the WAL write fails the appender is rolled back and
// the wrapped error is returned.
func (a *headAppender) Commit() error {
	if err := a.log(); err != nil {
		//nolint: errcheck
		a.Rollback() // Most likely the same error will happen again.
		return errors.Wrap(err, "write to WAL")
	}

	// Deferred calls run in reverse order: close the append in the isolation
	// first, then return the buffers to their pools, then decrement the metric.
	defer a.head.metrics.activeAppenders.Dec()
	defer a.head.putAppendBuffer(a.samples)
	defer a.head.putSeriesBuffer(a.sampleSeries)
	defer a.head.iso.closeAppend(a.appendID)

	rejected := 0
	for i := range a.samples {
		smpl, series := a.samples[i], a.sampleSeries[i]

		series.Lock()
		ok, chunkCreated := series.append(smpl.T, smpl.V, a.appendID, a.head.chunkDiskMapper)
		series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
		series.pendingCommit = false
		series.Unlock()

		if !ok {
			// The series refused the sample; it does not count as appended.
			rejected++
			a.head.metrics.outOfOrderSamples.Inc()
		}
		if chunkCreated {
			a.head.metrics.chunks.Inc()
			a.head.metrics.chunksCreated.Inc()
		}
	}

	a.head.metrics.samplesAppended.Add(float64(len(a.samples) - rejected))
	a.head.updateMinMaxTime(a.mint, a.maxt)

	return nil
}

// Rollback discards the buffered samples, clears the pending-commit flag on
// every touched series and still writes the created series to the WAL.
func (a *headAppender) Rollback() error {
	defer a.head.metrics.activeAppenders.Dec()
	defer a.head.iso.closeAppend(a.appendID)
	defer a.head.putSeriesBuffer(a.sampleSeries)

	for i := range a.samples {
		s := a.sampleSeries[i]
		s.Lock()
		s.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
		s.pendingCommit = false
		s.Unlock()
	}

	// Drop the samples so that the log call below only writes series records.
	a.head.putAppendBuffer(a.samples)
	a.samples = nil

	// Series are created in the head memory regardless of rollback. Thus we have
	// to log them to the WAL in any case.
	return a.log()
}

// Delete all samples in the range of [mint, maxt] for series that satisfy the given
// label matchers.
//
// The requested range is clamped against the head's current time range and, per
// series, against that series' own time range. The resulting tombstones are
// written to the WAL (if the head has one) before they are added to the head's
// in-memory tombstones. Series that can no longer be looked up by ID while
// iterating the postings are skipped.
//
// It returns an error if selecting the series, iterating the postings or
// writing to the WAL fails.
func (h *Head) Delete(mint, maxt int64, ms ...*labels.Matcher) error {
	// Do not delete anything beyond the currently valid range.
	mint, maxt = clampInterval(mint, maxt, h.MinTime(), h.MaxTime())

	ir := h.indexRange(mint, maxt)

	p, err := PostingsForMatchers(ir, ms...)
	if err != nil {
		return errors.Wrap(err, "select series")
	}

	var stones []tombstones.Stone
	for p.Next() {
		series := h.series.getByID(p.At())
		if series == nil {
			// getByID returns nil for IDs it does not hold (the other lookups in
			// this file guard against that as well). Skip the ID instead of
			// dereferencing a nil series.
			level.Debug(h.logger).Log("msg", "Series not found in Head.Delete")
			continue
		}

		series.RLock()
		t0, t1 := series.minTime(), series.maxTime()
		series.RUnlock()
		// math.MinInt64 is what minTime/maxTime report for a series without chunks.
		if t0 == math.MinInt64 || t1 == math.MinInt64 {
			continue
		}
		// Delete only until the current values and not beyond.
		t0, t1 = clampInterval(mint, maxt, t0, t1)
		stones = append(stones, tombstones.Stone{Ref: p.At(), Intervals: tombstones.Intervals{{Mint: t0, Maxt: t1}}})
	}
	if p.Err() != nil {
		return p.Err()
	}
	if h.wal != nil {
		var enc record.Encoder
		if err := h.wal.Log(enc.Tombstones(stones, nil)); err != nil {
			return err
		}
	}
	for _, s := range stones {
		h.tombstones.AddInterval(s.Ref, s.Intervals[0])
	}

	return nil
}

// gc removes data before the minimum timestamp from the head.
//
// It truncates the series, updates the series/chunk metrics and the series
// counter, removes deleted series from the postings, records deleted series
// refs together with the WAL segment reported as last (when a WAL is present),
// and finally rebuilds the symbol and label value sets from the remaining
// postings.
func (h *Head) gc() {
	// Only data strictly lower than this timestamp must be deleted.
	mint := h.MinTime()

	// Drop old chunks and remember series IDs and hashes if they can be
	// deleted entirely.
	deleted, chunksRemoved := h.series.gc(mint)
	seriesRemoved := len(deleted)

	h.metrics.seriesRemoved.Add(float64(seriesRemoved))
	h.metrics.chunksRemoved.Add(float64(chunksRemoved))
	h.metrics.chunks.Sub(float64(chunksRemoved))
	// Using AddUint64 to subtract series removed: adding ^uint64(n-1) is the
	// same as subtracting n. For n == 0 the operand evaluates to 0.
	// See: https://golang.org/pkg/sync/atomic/#AddUint64.
	atomic.AddUint64(&h.numSeries, ^uint64(seriesRemoved-1))

	// Remove deleted series IDs from the postings lists.
	h.postings.Delete(deleted)

	if h.wal != nil {
		_, last, _ := h.wal.Segments()
		h.deletedMtx.Lock()
		// Keep series records until we're past segment 'last'
		// because the WAL will still have samples records with
		// this ref ID. If we didn't keep these series records then
		// on start up when we replay the WAL, or any other code
		// that reads the WAL, wouldn't be able to use those
		// samples since we would have no labels for that ref ID.
		for ref := range deleted {
			h.deleted[ref] = last
		}
		h.deletedMtx.Unlock()
	}

	// Rebuild symbols and label value indices from what is left in the postings terms.
	// The new maps are sized after the current ones and built without holding symMtx.
	symbols := make(map[string]struct{}, len(h.symbols))
	values := make(map[string]stringset, len(h.values))

	if err := h.postings.Iter(func(t labels.Label, _ index.Postings) error {
		symbols[t.Name] = struct{}{}
		symbols[t.Value] = struct{}{}

		ss, ok := values[t.Name]
		if !ok {
			ss = stringset{}
			values[t.Name] = ss
		}
		ss.set(t.Value)
		return nil
	}); err != nil {
		// This should never happen, as the iteration function only returns nil.
		panic(err)
	}

	// Swap in the rebuilt sets under the symbol lock.
	h.symMtx.Lock()

	h.symbols = symbols
	h.values = values

	h.symMtx.Unlock()
}

// Tombstones returns the head's tombstones as a tombstones.Reader. The error is
// always nil.
func (h *Head) Tombstones() (tombstones.Reader, error) {
	var r tombstones.Reader = h.tombstones
	return r, nil
}

// Index returns an IndexReader against the block, requested for the widest
// possible time range. The error is always nil.
func (h *Head) Index() (IndexReader, error) {
	ir := h.indexRange(math.MinInt64, math.MaxInt64)
	return ir, nil
}

// indexRange returns an index reader for [mint, maxt], with the lower bound
// raised to the head's min time when that is greater than mint.
func (h *Head) indexRange(mint, maxt int64) *headIndexReader {
	lower := mint
	if headMin := h.MinTime(); lower < headMin {
		lower = headMin
	}
	return &headIndexReader{head: h, mint: lower, maxt: maxt}
}

// Chunks returns a ChunkReader against the block, requested for the widest
// possible time range and bound to a new isolation state.
//
// On failure the returned reader is a plain nil interface value, so callers
// may safely compare it against nil.
func (h *Head) Chunks() (ChunkReader, error) {
	cr, err := h.chunksRange(math.MinInt64, math.MaxInt64, h.iso.State())
	if err != nil {
		// Returning cr here would wrap a nil *headChunkReader in a non-nil
		// ChunkReader interface value.
		return nil, err
	}
	return cr, nil
}

// chunksRange returns a chunk reader for [mint, maxt] that uses the given
// isolation state. The lower bound is raised to the head's min time when that
// is greater than mint. It fails when the head has already been closed.
func (h *Head) chunksRange(mint, maxt int64, is *isolationState) (*headChunkReader, error) {
	h.closedMtx.Lock()
	defer h.closedMtx.Unlock()

	if h.closed {
		return nil, errors.New("can't read from a closed head")
	}

	lower := mint
	if headMin := h.MinTime(); lower < headMin {
		lower = headMin
	}
	reader := &headChunkReader{
		head:         h,
		mint:         lower,
		maxt:         maxt,
		isoState:     is,
		memChunkPool: &h.memChunkPool,
	}
	return reader, nil
}

// NumSeries returns the number of active series in the head. The counter is
// read atomically.
func (h *Head) NumSeries() uint64 {
	n := atomic.LoadUint64(&h.numSeries)
	return n
}

// Meta returns meta information about the head: its current min and max time,
// the number of series and a fixed ULID spelled "______head______".
// The head is dynamic so will return dynamic results.
func (h *Head) Meta() BlockMeta {
	var id ulid.ULID
	copy(id[:], "______head______")

	meta := BlockMeta{
		MinTime: h.MinTime(),
		MaxTime: h.MaxTime(),
		ULID:    id,
		Stats: BlockStats{
			NumSeries: h.NumSeries(),
		},
	}
	return meta
}

// MinTime returns the lowest time bound on visible data in the head. The value
// is read atomically.
func (h *Head) MinTime() int64 {
	mint := atomic.LoadInt64(&h.minTime)
	return mint
}

// MaxTime returns the highest timestamp seen in data of the head. The value is
// read atomically.
func (h *Head) MaxTime() int64 {
	maxt := atomic.LoadInt64(&h.maxTime)
	return maxt
}

// compactable returns whether the head has a compactable range.
// That is the case when the head's time range exceeds 1.5 times the chunk
// range; the extra 0.5 acts as a buffer of the appendable window.
func (h *Head) compactable() bool {
	span := h.MaxTime() - h.MinTime()
	return span > h.chunkRange/2*3
}

// Close marks the head as closed, then closes the chunk disk mapper and, if
// present, the WAL. Errors from both are collected and returned together.
func (h *Head) Close() error {
	h.closedMtx.Lock()
	defer h.closedMtx.Unlock()
	h.closed = true

	var merr tsdb_errors.MultiError
	merr.Add(h.chunkDiskMapper.Close())
	if w := h.wal; w != nil {
		merr.Add(w.Close())
	}
	return merr.Err()
}

// headChunkReader looks up chunks of the head's series. Chunks that do not
// overlap [mint, maxt] are reported as not found.
type headChunkReader struct {
	head         *Head
	mint, maxt   int64
	// isoState is passed on to the chunks handed out and is closed by Close.
	isoState     *isolationState
	// memChunkPool receives the temporary memChunk wrappers once Chunk is done with them.
	memChunkPool *sync.Pool
}

// Close closes the reader's isolation state. It always returns nil.
func (h *headChunkReader) Close() error {
	h.isoState.Close()
	return nil
}

// packChunkID packs a seriesID and a chunkID within it into a global 8 byte ID:
// the series ID occupies the upper 5 bytes, the chunk ID the lower 3 bytes.
// It panics if the seriesID exceeds 5 bytes or the chunk ID 3 bytes.
func packChunkID(seriesID, chunkID uint64) uint64 {
	const (
		maxSeriesID = 1<<40 - 1
		maxChunkID  = 1<<24 - 1
	)
	switch {
	case seriesID > maxSeriesID:
		panic("series ID exceeds 5 bytes")
	case chunkID > maxChunkID:
		panic("chunk ID exceeds 3 bytes")
	}
	return seriesID<<24 | chunkID
}

// unpackChunkID splits a packed 8 byte ID into the series ID held in its upper
// 5 bytes and the chunk ID held in its lower 3 bytes.
func unpackChunkID(id uint64) (seriesID, chunkID uint64) {
	const chunkIDMask = 1<<24 - 1
	return id >> 24, id & chunkIDMask
}

// Chunk returns the chunk for the reference number.
// The reference is split into a series ID and a chunk ID. storage.ErrNotFound
// is returned when the series is unknown or when the chunk does not overlap the
// reader's time range; errors from the chunk lookup are passed through.
func (h *headChunkReader) Chunk(ref uint64) (chunkenc.Chunk, error) {
	seriesID, chunkID := unpackChunkID(ref)

	series := h.head.series.getByID(seriesID)
	if series == nil {
		// This means that the series has been garbage collected.
		return nil, storage.ErrNotFound
	}

	series.Lock()
	mc, pooled, err := series.chunk(int(chunkID), h.head.chunkDiskMapper)
	if err != nil {
		series.Unlock()
		return nil, err
	}
	defer func() {
		if pooled {
			// Set this to nil so that Go GC can collect it after it has been used.
			mc.chunk = nil
			h.memChunkPool.Put(mc)
		}
	}()

	inRange := mc.OverlapsClosedInterval(h.mint, h.maxt)
	series.Unlock()
	if !inRange {
		// This means that the chunk is outside the specified range.
		return nil, storage.ErrNotFound
	}

	return &safeChunk{
		Chunk:           mc.chunk,
		s:               series,
		cid:             int(chunkID),
		isoState:        h.isoState,
		chunkDiskMapper: h.head.chunkDiskMapper,
	}, nil
}

// safeChunk wraps a chunk of a head series. Its Iterator method builds the
// iterator through the owning series while holding the series lock.
type safeChunk struct {
	chunkenc.Chunk
	s               *memSeries // Series the chunk belongs to.
	cid             int        // Chunk ID within the series.
	isoState        *isolationState
	chunkDiskMapper *chunks.ChunkDiskMapper
}

// Iterator returns an iterator for the chunk. It is obtained from the owning
// series under the series lock; reuseIter is passed along to the series.
func (c *safeChunk) Iterator(reuseIter chunkenc.Iterator) chunkenc.Iterator {
	series := c.s

	series.Lock()
	iter := series.iterator(c.cid, c.isoState, c.chunkDiskMapper, reuseIter)
	series.Unlock()

	return iter
}

// headIndexReader reads index data (symbols, label names and values, postings
// and series metadata) from the head for the time range [mint, maxt].
type headIndexReader struct {
	head       *Head
	mint, maxt int64
}

// Close does nothing and always returns nil.
func (h *headIndexReader) Close() error {
	return nil
}

// Symbols returns an iterator over a sorted copy of all symbols currently held
// by the head. The copy is taken under the symbol read lock.
func (h *headIndexReader) Symbols() index.StringIter {
	head := h.head

	head.symMtx.RLock()
	syms := make([]string, 0, len(head.symbols))
	for sym := range head.symbols {
		syms = append(syms, sym)
	}
	head.symMtx.RUnlock()

	sort.Strings(syms)
	return index.NewStringListIter(syms)
}

// LabelValues returns the sorted label values the head holds for the given
// label name. If the reader's time range [mint, maxt] does not overlap the
// head's current time range, an empty slice is returned.
func (h *headIndexReader) LabelValues(name string) ([]string, error) {
	h.head.symMtx.RLock()

	overlaps := h.maxt >= h.head.MinTime() && h.mint <= h.head.MaxTime()
	if !overlaps {
		h.head.symMtx.RUnlock()
		return []string{}, nil
	}

	valset := h.head.values[name]
	vals := make([]string, 0, len(valset))
	for v := range valset {
		vals = append(vals, v)
	}
	h.head.symMtx.RUnlock()

	// Sorting works on the private copy, so it happens outside the lock.
	sort.Strings(vals)
	return vals, nil
}

// LabelNames returns all the unique, non-empty label names present in the head,
// sorted. If the reader's time range [mint, maxt] does not overlap the head's
// current time range, an empty slice is returned.
func (h *headIndexReader) LabelNames() ([]string, error) {
	h.head.symMtx.RLock()
	defer h.head.symMtx.RUnlock()

	overlaps := h.maxt >= h.head.MinTime() && h.mint <= h.head.MaxTime()
	if !overlaps {
		return []string{}, nil
	}

	names := make([]string, 0, len(h.head.values))
	for name := range h.head.values {
		if name != "" {
			names = append(names, name)
		}
	}
	sort.Strings(names)
	return names, nil
}

// Postings returns the postings list iterator for the label pairs, i.e. the
// merge of the postings of every (name, value) combination given.
func (h *headIndexReader) Postings(name string, values ...string) (index.Postings, error) {
	lists := make([]index.Postings, len(values))
	for i := range values {
		lists[i] = h.head.postings.Get(name, values[i])
	}
	return index.Merge(lists...), nil
}

// SortedPostings returns the series references from p ordered by the label sets
// of their series. References whose series can no longer be found are dropped
// with a debug log line. An error from p yields error postings.
func (h *headIndexReader) SortedPostings(p index.Postings) index.Postings {
	found := make([]*memSeries, 0, 128)

	// Fetch all the series only once.
	for p.Next() {
		if s := h.head.series.getByID(p.At()); s != nil {
			found = append(found, s)
			continue
		}
		level.Debug(h.head.logger).Log("msg", "Looked up series not found")
	}
	if err := p.Err(); err != nil {
		return index.ErrPostings(errors.Wrap(err, "expand postings"))
	}

	sort.Slice(found, func(a, b int) bool {
		return labels.Compare(found[a].lset, found[b].lset) < 0
	})

	// Convert back to list.
	refs := make([]uint64, len(found))
	for i, s := range found {
		refs[i] = s.ref
	}
	return index.NewListPostings(refs)
}

// Series returns the series for the given reference by filling lbls with the
// series' labels and chks with the metadata of its chunks that overlap the
// reader's time range. It returns storage.ErrNotFound for unknown references.
func (h *headIndexReader) Series(ref uint64, lbls *labels.Labels, chks *[]chunks.Meta) error {
	series := h.head.series.getByID(ref)
	if series == nil {
		h.head.metrics.seriesNotFound.Inc()
		return storage.ErrNotFound
	}
	*lbls = append((*lbls)[:0], series.lset...)

	series.Lock()
	defer series.Unlock()

	*chks = (*chks)[:0]

	for i, mmapped := range series.mmappedChunks {
		// Only expose chunks that are inside of the specified range.
		if mmapped.OverlapsClosedInterval(h.mint, h.maxt) {
			*chks = append(*chks, chunks.Meta{
				MinTime: mmapped.minTime,
				MaxTime: mmapped.maxTime,
				Ref:     packChunkID(series.ref, uint64(series.chunkID(i))),
			})
		}
	}

	if hc := series.headChunk; hc != nil && hc.OverlapsClosedInterval(h.mint, h.maxt) {
		// The head chunk comes right after the last m-mapped chunk.
		headPos := len(series.mmappedChunks)
		*chks = append(*chks, chunks.Meta{
			MinTime: hc.minTime,
			MaxTime: math.MaxInt64, // Set the head chunks as open (being appended to).
			Ref:     packChunkID(series.ref, uint64(series.chunkID(headPos))),
		})
	}

	return nil
}

// getOrCreate returns the series for the given hash and label set, creating it
// with a newly allocated series ID when it does not exist yet. The boolean
// reports whether the series was created by this call.
func (h *Head) getOrCreate(hash uint64, lset labels.Labels) (*memSeries, bool, error) {
	// Just using `getOrSet` below would be semantically sufficient, but we'd create
	// a new series on every sample inserted via Add(), which causes allocations
	// and makes our series IDs rather random and harder to compress in postings.
	if existing := h.series.getByHash(hash, lset); existing != nil {
		return existing, false, nil
	}

	// Optimistically assume that we are the first one to create the series.
	return h.getOrCreateWithID(atomic.AddUint64(&h.lastSeriesID, 1), hash, lset)
}

// getOrCreateWithID registers a new series with the given ID, hash and label
// set, unless a series with that label set is already stored, in which case
// the stored series is returned. When the series is created, the series
// counters, the postings and the symbol/label value sets are updated.
func (h *Head) getOrCreateWithID(id, hash uint64, lset labels.Labels) (*memSeries, bool, error) {
	candidate := newMemSeries(lset, id, h.chunkRange, &h.memChunkPool)

	s, created, err := h.series.getOrSet(hash, candidate)
	switch {
	case err != nil:
		return nil, false, err
	case !created:
		return s, false, nil
	}

	h.metrics.seriesCreated.Inc()
	atomic.AddUint64(&h.numSeries, 1)

	h.postings.Add(id, lset)

	h.symMtx.Lock()
	defer h.symMtx.Unlock()

	for _, l := range lset {
		h.symbols[l.Name] = struct{}{}
		h.symbols[l.Value] = struct{}{}

		vals, ok := h.values[l.Name]
		if !ok {
			vals = stringset{}
			h.values[l.Name] = vals
		}
		vals.set(l.Value)
	}

	return s, true, nil
}

// seriesHashmap is a simple hashmap for memSeries by their label set. It is built
// on top of a regular hashmap and holds a slice of series per hash to resolve
// hash collisions. Its methods require the hash to be submitted with them to
// avoid re-computations throughout the code.
type seriesHashmap map[uint64][]*memSeries

// get returns the series stored under hash whose label set equals lset, or nil
// if there is none.
func (m seriesHashmap) get(hash uint64, lset labels.Labels) *memSeries {
	bucket := m[hash]
	for i := range bucket {
		if candidate := bucket[i]; labels.Equal(candidate.lset, lset) {
			return candidate
		}
	}
	return nil
}

// set stores s under hash. A series already stored under that hash with an
// equal label set is replaced; otherwise s is appended to the hash's slice.
func (m seriesHashmap) set(hash uint64, s *memSeries) {
	bucket := m[hash]
	for i := range bucket {
		if labels.Equal(bucket[i].lset, s.lset) {
			bucket[i] = s
			return
		}
	}
	m[hash] = append(bucket, s)
}

// del removes the series with the given label set from the slice stored under
// hash. The hash entry is dropped entirely once no series remain for it.
func (m seriesHashmap) del(hash uint64, lset labels.Labels) {
	var kept []*memSeries
	for _, s := range m[hash] {
		if labels.Equal(s.lset, lset) {
			continue
		}
		kept = append(kept, s)
	}
	if len(kept) > 0 {
		m[hash] = kept
		return
	}
	delete(m, hash)
}

// DefaultStripeSize is the default number of entries to allocate in the
// stripeSeries hash map (1 << 14 = 16384).
const DefaultStripeSize = 1 << 14

// stripeSeries locks modulo ranges of IDs and hashes to reduce lock contention.
// The locks are padded to not be on the same cache line. Filling the padded space
// with the maps was profiled to be slower – likely due to the additional pointer
// dereferences.
type stripeSeries struct {
	// size is the number of stripes. Stripe indices are computed as
	// value & (size-1), which presumably requires size to be a power of two.
	size                    int
	// series[i] maps series IDs that fall into stripe i to their series.
	series                  []map[uint64]*memSeries
	// hashes[i] holds the series whose label hash falls into stripe i.
	hashes                  []seriesHashmap
	// locks[i] guards series[i] and hashes[i].
	locks                   []stripeLock
	// seriesLifecycleCallback is invoked around series creation and after deletion.
	seriesLifecycleCallback SeriesLifecycleCallback
}

// stripeLock is the read-write lock of a single stripe, followed by padding
// bytes.
type stripeLock struct {
	sync.RWMutex
	// Padding to avoid multiple locks being on the same cache line.
	_ [40]byte
}

// newStripeSeries returns a stripeSeries with stripeSize stripes, each of which
// starts out with an empty ID map and an empty hash map.
func newStripeSeries(stripeSize int, seriesCallback SeriesLifecycleCallback) *stripeSeries {
	s := &stripeSeries{
		size:                    stripeSize,
		series:                  make([]map[uint64]*memSeries, stripeSize),
		hashes:                  make([]seriesHashmap, stripeSize),
		locks:                   make([]stripeLock, stripeSize),
		seriesLifecycleCallback: seriesCallback,
	}

	for i := 0; i < stripeSize; i++ {
		s.series[i] = make(map[uint64]*memSeries)
		s.hashes[i] = make(seriesHashmap)
	}
	return s
}

// gc garbage collects old chunks that are strictly before mint and removes
// series entirely that have no chunks left.
// It returns the set of removed series refs and the number of removed chunks.
// PostDeletion of the lifecycle callback is invoked once per stripe with the
// label sets of the series removed from that stripe.
func (s *stripeSeries) gc(mint int64) (map[uint64]struct{}, int) {
	var (
		deleted            = map[uint64]struct{}{}
		deletedForCallback = []labels.Labels{}
		rmChunks           = 0
	)
	// Run through all series and truncate old chunks. Mark those with no
	// chunks left as deleted and store their ID.
	for i := 0; i < s.size; i++ {
		s.locks[i].Lock()

		for hash, all := range s.hashes[i] {
			for _, series := range all {
				series.Lock()
				rmChunks += series.truncateChunksBefore(mint)

				// Keep series that still have chunks or have samples waiting to be committed.
				if len(series.mmappedChunks) > 0 || series.headChunk != nil || series.pendingCommit {
					series.Unlock()
					continue
				}

				// The series is gone entirely. We need to keep the series lock
				// and make sure we have acquired the stripe locks for hash and ID of the
				// series alike.
				// If we don't hold them all, there's a very small chance that a series receives
				// samples again while we are half-way into deleting it.
				j := int(series.ref) & (s.size - 1)

				// The ID stripe may coincide with the hash stripe that is already locked.
				if i != j {
					s.locks[j].Lock()
				}

				deleted[series.ref] = struct{}{}
				s.hashes[i].del(hash, series.lset)
				delete(s.series[j], series.ref)
				deletedForCallback = append(deletedForCallback, series.lset)

				if i != j {
					s.locks[j].Unlock()
				}

				series.Unlock()
			}
		}

		s.locks[i].Unlock()

		// The callback runs after the stripe lock has been released; the slice is
		// then emptied for reuse with the next stripe.
		s.seriesLifecycleCallback.PostDeletion(deletedForCallback...)
		deletedForCallback = deletedForCallback[:0]
	}

	return deleted, rmChunks
}

// getByID returns the series stored under the given ID, or nil if there is
// none. The lookup holds the read lock of the ID's stripe.
func (s *stripeSeries) getByID(id uint64) *memSeries {
	stripe := id & uint64(s.size-1)
	lock := &s.locks[stripe]

	lock.RLock()
	found := s.series[stripe][id]
	lock.RUnlock()

	return found
}

// getByHash returns the series stored for the given hash and label set, or nil
// if there is none. The lookup holds the read lock of the hash's stripe.
func (s *stripeSeries) getByHash(hash uint64, lset labels.Labels) *memSeries {
	stripe := hash & uint64(s.size-1)
	lock := &s.locks[stripe]

	lock.RLock()
	found := s.hashes[stripe].get(hash, lset)
	lock.RUnlock()

	return found
}

// getOrSet returns the series already stored for the given hash and label set,
// or stores the given series when there is none. The boolean reports whether
// the given series was stored. If the PreCreation callback returned an error
// and no series exists yet, nothing is stored and that error is returned.
func (s *stripeSeries) getOrSet(hash uint64, series *memSeries) (*memSeries, bool, error) {
	// PreCreation is called here to avoid calling it inside the lock.
	// It is not necessary to call it just before creating a series,
	// rather it gives a 'hint' whether to create a series or not.
	createSeriesErr := s.seriesLifecycleCallback.PreCreation(series.lset)

	i := hash & uint64(s.size-1)
	s.locks[i].Lock()

	// An existing series wins, regardless of what PreCreation returned.
	if prev := s.hashes[i].get(hash, series.lset); prev != nil {
		s.locks[i].Unlock()
		return prev, false, nil
	}
	if createSeriesErr == nil {
		s.hashes[i].set(hash, series)
	}
	s.locks[i].Unlock()

	if createSeriesErr != nil {
		// The callback prevented creation of series.
		return nil, false, createSeriesErr
	}
	// Setting the series in the s.hashes marks the creation of series
	// as any further calls to this methods would return that series.
	s.seriesLifecycleCallback.PostCreation(series.lset)

	// Register the series under its ID as well; the ID stripe is locked separately.
	i = series.ref & uint64(s.size-1)

	s.locks[i].Lock()
	s.series[i][series.ref] = series
	s.locks[i].Unlock()

	return series, true, nil
}

// sample is a single timestamp/value pair.
type sample struct {
	t int64
	v float64
}

// T returns the timestamp of the sample.
func (smp sample) T() int64 { return smp.t }

// V returns the value of the sample.
func (smp sample) V() float64 { return smp.v }

// memSeries is the in-memory representation of a series. None of its methods
// are goroutine safe and it is the caller's responsibility to lock it.
type memSeries struct {
	sync.RWMutex

	ref           uint64
	lset          labels.Labels
	// Chunks already written through the chunk disk mapper, oldest first.
	mmappedChunks []*mmappedChunk
	// Chunk currently being appended to; nil when there is none.
	headChunk     *memChunk
	chunkRange    int64
	// Chunk ID of position 0; the chunk at position pos has ID pos+firstChunkID.
	firstChunkID  int

	nextAt        int64 // Timestamp at which to cut the next chunk.
	// The last four appended samples, newest at index 3.
	sampleBuf     [4]sample
	pendingCommit bool // Whether there are samples waiting to be committed to this series.

	app chunkenc.Appender // Current appender for the chunk.

	// Pool that temporary memChunk wrappers for m-mapped chunks are taken from.
	memChunkPool *sync.Pool

	// Append IDs recorded by append for samples added with a non-zero append ID.
	txs *txRing
}

// newMemSeries returns a series without any chunks for the given label set and
// ID. Its nextAt starts at math.MinInt64 and its append ID ring is created
// with newTxRing(4).
func newMemSeries(lset labels.Labels, id uint64, chunkRange int64, memChunkPool *sync.Pool) *memSeries {
	return &memSeries{
		ref:          id,
		lset:         lset,
		chunkRange:   chunkRange,
		nextAt:       math.MinInt64,
		memChunkPool: memChunkPool,
		txs:          newTxRing(4),
	}
}

// minTime returns the min time of the oldest chunk of the series: the first
// m-mapped chunk if there is one, else the head chunk. It returns
// math.MinInt64 when the series has no chunks.
func (s *memSeries) minTime() int64 {
	switch {
	case len(s.mmappedChunks) > 0:
		return s.mmappedChunks[0].minTime
	case s.headChunk != nil:
		return s.headChunk.minTime
	default:
		return math.MinInt64
	}
}

// maxTime returns the max time of the head chunk, or math.MinInt64 when the
// series has no head chunk.
func (s *memSeries) maxTime() int64 {
	if c := s.head(); c != nil {
		return c.maxTime
	}
	return math.MinInt64
}

// cutNewHeadChunk m-maps the current head chunk (if any) and replaces it with a
// new, empty XOR chunk starting at mint. It also resets nextAt and the series'
// chunk appender, and returns the new head chunk.
// It panics if no appender can be obtained for the new chunk.
func (s *memSeries) cutNewHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDiskMapper) *memChunk {
	s.mmapCurrentHeadChunk(chunkDiskMapper)

	fresh := &memChunk{
		chunk:   chunkenc.NewXORChunk(),
		minTime: mint,
		maxTime: math.MinInt64,
	}
	s.headChunk = fresh

	// Set upper bound on when the next chunk must be started. An earlier timestamp
	// may be chosen dynamically at a later point.
	s.nextAt = rangeForTimestamp(mint, s.chunkRange)

	app, err := fresh.chunk.Appender()
	if err != nil {
		panic(err)
	}
	s.app = app

	return fresh
}

// mmapCurrentHeadChunk writes the current head chunk through the chunk disk
// mapper and appends its metadata to the series' m-mapped chunks. It does
// nothing when there is no head chunk. Write errors other than
// chunks.ErrChunkDiskMapperClosed cause a panic.
func (s *memSeries) mmapCurrentHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper) {
	hc := s.headChunk
	if hc == nil {
		// There is no head chunk, so nothing to m-map here.
		return
	}

	chunkRef, err := chunkDiskMapper.WriteChunk(s.ref, hc.minTime, hc.maxTime, hc.chunk)
	if err != nil && err != chunks.ErrChunkDiskMapperClosed {
		panic(err)
	}

	mmapped := &mmappedChunk{
		ref:        chunkRef,
		numSamples: uint16(hc.chunk.NumSamples()),
		minTime:    hc.minTime,
		maxTime:    hc.maxTime,
	}
	s.mmappedChunks = append(s.mmappedChunks, mmapped)
}

// appendable checks whether the given sample is valid for appending to the series.
// Without a head chunk every sample is accepted. Otherwise a timestamp below
// the head chunk's max time yields storage.ErrOutOfOrderSample, and a timestamp
// equal to it is only accepted when the value is bit-identical to the most
// recently buffered one (else storage.ErrDuplicateSampleForTimestamp).
func (s *memSeries) appendable(t int64, v float64) error {
	c := s.head()
	if c == nil {
		return nil
	}

	switch {
	case t > c.maxTime:
		return nil
	case t < c.maxTime:
		return storage.ErrOutOfOrderSample
	}

	// We are allowing exact duplicates as we can encounter them in valid cases
	// like federation and erroring out at that time would be extremely noisy.
	lastValue := s.sampleBuf[3].v
	if math.Float64bits(lastValue) != math.Float64bits(v) {
		return storage.ErrDuplicateSampleForTimestamp
	}
	return nil
}

// chunk returns the chunk for the chunk id from memory or by m-mapping it from the disk.
// If garbageCollect is true, it means that the returned *memChunk
// (and not the chunkenc.Chunk inside it) can be garbage collected after its usage.
// It returns storage.ErrNotFound for ids outside the series' current chunk
// range, and panics when the disk mapper reports a corruption error.
func (s *memSeries) chunk(id int, chunkDiskMapper *chunks.ChunkDiskMapper) (chunk *memChunk, garbageCollect bool, err error) {
	// ix is the position of the chunk in the s.mmappedChunks slice. Chunk ids are
	// incremented by 1 for every new chunk, hence (id - firstChunkID) gives the
	// slice index. The largest valid slice index is len(s.mmappedChunks)-1, so
	// ix == len(s.mmappedChunks) denotes the chunk after those: the head chunk.
	ix := id - s.firstChunkID
	switch {
	case ix < 0 || ix > len(s.mmappedChunks):
		return nil, false, storage.ErrNotFound
	case ix == len(s.mmappedChunks):
		if s.headChunk == nil {
			return nil, false, errors.New("invalid head chunk")
		}
		return s.headChunk, false, nil
	}

	chk, err := chunkDiskMapper.Chunk(s.mmappedChunks[ix].ref)
	if err != nil {
		if _, corrupted := err.(*chunks.CorruptionErr); corrupted {
			panic(err)
		}
		return nil, false, err
	}

	// Wrap the m-mapped chunk in a pooled memChunk that the caller may recycle.
	mc := s.memChunkPool.Get().(*memChunk)
	mc.chunk = chk
	mc.minTime = s.mmappedChunks[ix].minTime
	mc.maxTime = s.mmappedChunks[ix].maxTime
	return mc, true, nil
}

// chunkID converts a position within the series' chunks into a chunk ID by
// offsetting it with the ID of the first chunk.
func (s *memSeries) chunkID(pos int) int {
	return s.firstChunkID + pos
}

// truncateChunksBefore removes all chunks from the series that have no timestamp
// at or after mint and returns how many were removed. Chunk IDs remain unchanged.
func (s *memSeries) truncateChunksBefore(mint int64) (removed int) {
	if hc := s.headChunk; hc != nil && hc.maxTime < mint {
		// If head chunk is truncated, we can truncate all mmapped chunks.
		removed = len(s.mmappedChunks) + 1
		s.firstChunkID += removed
		s.headChunk = nil
		s.mmappedChunks = nil
		return removed
	}
	if len(s.mmappedChunks) == 0 {
		return 0
	}

	// Count the leading chunks that end before mint.
	for removed < len(s.mmappedChunks) && s.mmappedChunks[removed].maxTime < mint {
		removed++
	}
	s.mmappedChunks = append(s.mmappedChunks[:0], s.mmappedChunks[removed:]...)
	s.firstChunkID += removed
	return removed
}

// append adds the sample (t, v) to the series. The caller also has to provide
// the appendID for isolation. (The appendID can be zero, which results in no
// isolation for this append.)
// sampleInOrder is false when the sample was rejected because its timestamp is
// not newer than the data already in the series; chunkCreated reports whether a
// new head chunk was cut by this call.
// It is unsafe to call this concurrently with s.iterator(...) without holding the series lock.
func (s *memSeries) append(t int64, v float64, appendID uint64, chunkDiskMapper *chunks.ChunkDiskMapper) (sampleInOrder, chunkCreated bool) {
	// Based on Gorilla white papers this offers near-optimal compression ratio
	// so anything bigger that this has diminishing returns and increases
	// the time range within which we have to decompress all samples.
	const samplesPerChunk = 120

	c := s.head()

	if c == nil {
		if len(s.mmappedChunks) > 0 && s.mmappedChunks[len(s.mmappedChunks)-1].maxTime >= t {
			// Out of order sample. Sample timestamp is already in the mmaped chunks, so ignore it.
			return false, false
		}
		// There is no head chunk in this series, create one for the sample.
		c = s.cutNewHeadChunk(t, chunkDiskMapper)
		chunkCreated = true
	}
	numSamples := c.chunk.NumSamples()

	// Out of order sample.
	if c.maxTime >= t {
		return false, chunkCreated
	}
	// If we reach 25% of a chunk's desired sample count, set a definitive time
	// at which to start the next chunk.
	// At latest it must happen at the timestamp set when the chunk was cut.
	if numSamples == samplesPerChunk/4 {
		s.nextAt = computeChunkEndTime(c.minTime, c.maxTime, s.nextAt)
	}
	if t >= s.nextAt {
		c = s.cutNewHeadChunk(t, chunkDiskMapper)
		chunkCreated = true
	}
	s.app.Append(t, v)

	c.maxTime = t

	// Shift the sample buffer by one and store the new sample as the newest entry.
	s.sampleBuf[0] = s.sampleBuf[1]
	s.sampleBuf[1] = s.sampleBuf[2]
	s.sampleBuf[2] = s.sampleBuf[3]
	s.sampleBuf[3] = sample{t: t, v: v}

	if appendID > 0 {
		s.txs.add(appendID)
	}

	return true, chunkCreated
}

// cleanupAppendIDsBelow cleans up older appendIDs by delegating to the series'
// append ID ring with the given bound. Has to be called after acquiring lock.
func (s *memSeries) cleanupAppendIDsBelow(bound uint64) {
	s.txs.cleanupAppendIDsBelow(bound)
}

// computeChunkEndTime estimates the end timestamp based on the beginning of a
// chunk, its current timestamp and the upper bound up to which we insert data.
// It assumes that the time range is 1/4 full.
// The span between start and max is divided into as many equal parts as
// four times the elapsed time fits into it (integer division); the end of the
// first part is returned. When the span does not fit even once, max is returned.
func computeChunkEndTime(start, cur, max int64) int64 {
	span := max - start
	parts := span / ((cur - start + 1) * 4)
	if parts != 0 {
		return start + span/parts
	}
	return max
}

// iterator returns a chunk iterator for the chunk with the given id.
// When isoState is non-nil, the iterator stops before the first sample whose
// append ID the isolation state says not to return. The passed-in iterator it
// is handed to the chunk for reuse, and is itself reused as the wrapper when it
// has the matching wrapper type. A no-op iterator is returned when the chunk
// cannot be looked up or when no sample is visible.
// It is unsafe to call this concurrently with s.append(...) without holding the series lock.
func (s *memSeries) iterator(id int, isoState *isolationState, chunkDiskMapper *chunks.ChunkDiskMapper, it chunkenc.Iterator) chunkenc.Iterator {
	c, garbageCollect, err := s.chunk(id, chunkDiskMapper)
	// TODO(fabxc): Work around! An error will be returned when a querier has retrieved a pointer to a
	// series's chunk, which then got garbage collected before it got
	// accessed.  We must ensure to not garbage collect as long as any
	// readers still hold a reference.
	if err != nil {
		return chunkenc.NewNopIterator()
	}
	defer func() {
		if garbageCollect {
			// Set this to nil so that Go GC can collect it after it has been used.
			// This should be done always at the end.
			c.chunk = nil
			s.memChunkPool.Put(c)
		}
	}()

	// Position of the requested chunk among the series' chunks.
	ix := id - s.firstChunkID

	numSamples := c.chunk.NumSamples()
	stopAfter := numSamples

	if isoState != nil {
		totalSamples := 0    // Total samples in this series.
		previousSamples := 0 // Samples before this chunk.

		for j, d := range s.mmappedChunks {
			totalSamples += int(d.numSamples)
			if j < ix {
				previousSamples += int(d.numSamples)
			}
		}

		if s.headChunk != nil {
			totalSamples += s.headChunk.chunk.NumSamples()
		}

		// Removing the extra transactionIDs that are relevant for samples that
		// come after this chunk, from the total transactionIDs.
		appendIDsToConsider := s.txs.txIDCount - (totalSamples - (previousSamples + numSamples))

		// Iterate over the appendIDs, find the first one that the isolation state says not
		// to return.
		// Note that this it shadows the chunkenc.Iterator parameter within this block.
		it := s.txs.iterator()
		for index := 0; index < appendIDsToConsider; index++ {
			appendID := it.At()
			if appendID <= isoState.maxAppendID { // Easy check first.
				if _, ok := isoState.incompleteAppends[appendID]; !ok {
					it.Next()
					continue
				}
			}
			// The remaining (appendIDsToConsider - index) append IDs are cut off
			// from the end of this chunk.
			stopAfter = numSamples - (appendIDsToConsider - index)
			if stopAfter < 0 {
				stopAfter = 0 // Stopped in a previous chunk.
			}
			break
		}
	}

	if stopAfter == 0 {
		return chunkenc.NewNopIterator()
	}

	// The chunk is one of the m-mapped chunks.
	if id-s.firstChunkID < len(s.mmappedChunks) {
		if stopAfter == numSamples {
			return c.chunk.Iterator(it)
		}
		// Reuse the passed-in wrapper if it is a stopIterator.
		if msIter, ok := it.(*stopIterator); ok {
			msIter.Iterator = c.chunk.Iterator(msIter.Iterator)
			msIter.i = -1
			msIter.stopAfter = stopAfter
			return msIter
		}
		return &stopIterator{
			Iterator:  c.chunk.Iterator(it),
			i:         -1,
			stopAfter: stopAfter,
		}
	}
	// Serve the last 4 samples for the last chunk from the sample buffer
	// as their compressed bytes may be mutated by added samples.
	if msIter, ok := it.(*memSafeIterator); ok {
		msIter.Iterator = c.chunk.Iterator(msIter.Iterator)
		msIter.i = -1
		msIter.total = numSamples
		msIter.stopAfter = stopAfter
		msIter.buf = s.sampleBuf
		return msIter
	}
	return &memSafeIterator{
		stopIterator: stopIterator{
			Iterator:  c.chunk.Iterator(it),
			i:         -1,
			stopAfter: stopAfter,
		},
		total: numSamples,
		buf:   s.sampleBuf,
	}
}

// head returns the series' current head chunk, i.e. the value of s.headChunk.
// The result is nil when no head chunk is set.
func (s *memSeries) head() *memChunk {
	current := s.headChunk
	return current
}

// memChunk pairs a chunk with the minimum and maximum timestamps recorded for it.
type memChunk struct {
	chunk   chunkenc.Chunk
	minTime int64
	maxTime int64
}

// OverlapsClosedInterval reports whether the chunk's [minTime, maxTime] range
// shares at least one timestamp with the closed interval [mint, maxt].
func (mc *memChunk) OverlapsClosedInterval(mint, maxt int64) bool {
	if mc.minTime > maxt {
		// The chunk starts after the interval ends.
		return false
	}
	return mint <= mc.maxTime
}

// stopIterator wraps a chunkenc.Iterator and yields at most stopAfter samples
// from it. i is the zero-based index of the current sample; it has to be set
// to -1 before iteration starts.
type stopIterator struct {
	chunkenc.Iterator

	i, stopAfter int
}

// Next advances to the next sample. Once stopAfter samples have been consumed
// it returns false without touching the wrapped iterator.
func (it *stopIterator) Next() bool {
	if it.i+1 >= it.stopAfter {
		return false
	}
	it.i++
	return it.Iterator.Next()
}

// Seek advances the iterator to the first sample whose timestamp is greater
// than or equal to t and reports whether such a sample exists within the
// first stopAfter samples. If the current sample already satisfies this, Seek
// is a no-op.
//
// Without this method the Seek of the embedded iterator would be promoted. It
// advances the wrapped iterator directly, which bypasses the stopAfter limit
// and leaves i out of sync with the real position.
func (it *stopIterator) Seek(t int64) bool {
	if it.Iterator.Err() != nil {
		return false
	}
	// Position on the first sample if iteration has not started yet.
	if it.i == -1 && !it.Next() {
		return false
	}
	// Step through our own Next so that the stopAfter bound is enforced.
	for ts, _ := it.Iterator.At(); ts < t; ts, _ = it.Iterator.At() {
		if !it.Next() {
			return false
		}
	}
	return true
}

// memSafeIterator is a stopIterator that serves the last four samples of a
// chunk from buf rather than from the wrapped iterator, because the compressed
// bytes of those samples may still be mutated by concurrently appended
// samples. total is the number of samples in the chunk.
type memSafeIterator struct {
	stopIterator

	total int
	buf   [4]sample
}

// Seek advances the iterator to the first sample whose timestamp is greater
// than or equal to t and reports whether such a sample exists within the
// first stopAfter samples. If the current sample already satisfies this, Seek
// is a no-op.
//
// Without this method the Seek of the embedded chunk iterator would be
// promoted. It would decode the last four samples from the compressed bytes
// instead of buf, ignore stopAfter, and leave i out of sync with what At
// returns.
func (it *memSafeIterator) Seek(t int64) bool {
	if it.Iterator.Err() != nil {
		return false
	}
	// Position on the first sample if iteration has not started yet.
	if it.i == -1 && !it.Next() {
		return false
	}
	// Step through our own Next/At so that the buffered samples are used.
	for ts, _ := it.At(); ts < t; ts, _ = it.At() {
		if !it.Next() {
			return false
		}
	}
	return true
}

// Next advances to the next sample. It returns false once stopAfter samples
// have been consumed. The wrapped iterator is only advanced while more than
// four samples remain; the final four are served from buf by At.
func (it *memSafeIterator) Next() bool {
	if it.i+1 >= it.stopAfter {
		return false
	}
	it.i++
	if it.total-it.i > 4 {
		return it.Iterator.Next()
	}
	return true
}

// At returns the timestamp and value of the current sample, reading from buf
// for the last four samples of the chunk and from the wrapped iterator
// otherwise.
func (it *memSafeIterator) At() (int64, float64) {
	if it.total-it.i > 4 {
		return it.Iterator.At()
	}
	// total-i is in 1..4 here, so the index runs over buf[0..3], with the
	// chunk's last sample at buf[3].
	s := it.buf[4-(it.total-it.i)]
	return s.t, s.v
}

// stringset is a set of strings, represented as a map with empty-struct values.
type stringset map[string]struct{}

// set adds s to the set. Adding a string that is already present has no effect.
func (ss stringset) set(s string) {
	var member struct{}
	ss[s] = member
}

// String renders the members in sorted order, separated by commas.
func (ss stringset) String() string {
	sorted := ss.slice()
	return strings.Join(sorted, ",")
}

// slice returns the members as a newly allocated, sorted slice.
func (ss stringset) slice() []string {
	members := make([]string, len(ss))
	next := 0
	for member := range ss {
		members[next] = member
		next++
	}
	sort.Strings(members)
	return members
}

// mmappedChunk records a chunk's reference, its number of samples and the
// minimum and maximum timestamps it covers.
type mmappedChunk struct {
	ref        uint64
	numSamples uint16
	minTime    int64
	maxTime    int64
}

// OverlapsClosedInterval reports whether the chunk's [minTime, maxTime] range
// shares at least one timestamp with the closed interval [mint, maxt].
func (mc *mmappedChunk) OverlapsClosedInterval(mint, maxt int64) bool {
	startsAfter := mc.minTime > maxt
	endsBefore := mc.maxTime < mint
	return !startsAfter && !endsBefore
}

// SeriesLifecycleCallback specifies a list of callbacks that will be called during the lifecycle of a series.
// It is always a no-op in Prometheus and mainly meant for external users who import TSDB.
// All the callbacks should be safe to be called concurrently.
// It is up to the user to implement soft or hard consistency by making the callbacks
// atomic or non-atomic. Atomic callbacks can cause performance degradation.
type SeriesLifecycleCallback interface {
	// PreCreation is called before creating a series to indicate if the series can be created.
	// A non-nil error means the series should not be created.
	PreCreation(labels.Labels) error
	// PostCreation is called after creating a series to indicate the creation of a series.
	PostCreation(labels.Labels)
	// PostDeletion is called after the deletion of series.
	PostDeletion(...labels.Labels)
}

// noopSeriesLifecycleCallback provides the SeriesLifecycleCallback methods as
// no-ops: it never rejects a series and ignores creation and deletion events.
type noopSeriesLifecycleCallback struct{}

// PreCreation always returns nil.
func (noopSeriesLifecycleCallback) PreCreation(labels.Labels) error {
	return nil
}

// PostCreation does nothing.
func (noopSeriesLifecycleCallback) PostCreation(labels.Labels) {}

// PostDeletion does nothing.
func (noopSeriesLifecycleCallback) PostDeletion(...labels.Labels) {}
-												Add liecence file and headers

											
										
										
											2017-04-10 11:59:45 -07:00
+								// Copyright 2017 The Prometheus Authors
 								// Licensed under the Apache License, Version 2.0 (the "License");
 								// you may not use this file except in compliance with the License.
 								// You may obtain a copy of the License at
 								//
 								// http://www.apache.org/licenses/LICENSE-2.0
 								//
 								// Unless required by applicable law or agreed to in writing, software
 								// distributed under the License is distributed on an "AS IS" BASIS,
 								// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								// See the License for the specific language governing permissions and
 								// limitations under the License.
-												Add new interfaces and skeleton

											
										
										
											2016-12-04 04:16:11 -08:00
+								package tsdb
 								import (
-												Always create a new clean segment when starting the WAL. (#608)

* Always create a new clean segment when starting the WAL.
* Ensure we flush the last page after repairing and before recreating the
new segment in Repair.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-05-24 11:33:28 -07:00
+									"fmt"
-												Fix last timestamp initialization

This initializes the chunkDesc's last timestamp to the minimum
value so initial samples with a timestamp of 0 (e.g. in tests)
are not accidentally dropped.

											
										
										
											2017-01-04 05:06:40 -08:00
+									"math"
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									"path/filepath"
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+									"runtime"
-												Misc fixes for initial Prometheus integration

											
										
										
											2016-12-14 09:38:46 -08:00
+									"sort"
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+									"strings"
-												Add new interfaces and skeleton

											
										
										
											2016-12-04 04:16:11 -08:00
+									"sync"
-												Count writer references on head blocks

											
										
										
											2017-02-04 02:53:52 -08:00
+									"sync/atomic"
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									"time"
-												Switch append refs to string

											
										
										
											2017-05-17 07:43:01 -07:00
-												Periodically fsync WAL, make head cut async

											
										
										
											2017-01-06 06:18:06 -08:00
+									"github.com/go-kit/kit/log"
-												Add levels to all log lines.

Signed-off-by: Goutham Veeramachaneni <cs14btech11014@iith.ac.in>

											
										
										
											2017-09-28 00:19:34 -07:00
+									"github.com/go-kit/kit/log/level"
-												Open db in Read only mode (#588)

* Added db read only open mode and use it for the tsdb cli.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2019-07-23 01:04:48 -07:00
+									"github.com/oklog/ulid"
-												Move stats into meta.json file, cleanup, docs

											
										
										
											2017-01-19 02:22:47 -08:00
+									"github.com/pkg/errors"
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									"github.com/prometheus/client_golang/prometheus"
-												Port tsdb to use pkg/labels. (#6326)

* Port tsdb to use pkg/labels.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Get tests passing.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Remove useless cast.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Appease linters.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-11-18 11:53:33 -08:00
+									"github.com/prometheus/prometheus/pkg/labels"
-												Unify Iterator interfaces. All point to storage now.

This is part of https://github.com/prometheus/prometheus/pull/5882 that can be done to simplify things.
All todos I added will be fixed in follow up PRs.

* querier.Querier, querier.Appender, querier.SeriesSet, and querier.Series interfaces merged
with storage interface.go. All imports that.
* querier.SeriesIterator replaced by chunkenc.Iterator
* Added chunkenc.Iterator.Seek method and tests for xor implementation (?)
* Since we properly handle SelectParams for Select methods I adjusted min max
based on that. This should help in terms of performance for queries with functions like offset.
* added Seek to deletedIterator and test.
* storage/tsdb was removed as it was only a unnecessary glue with incompatible structs.

No logic was changed, only different source of abstractions, so no need for benchmarks.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

											
										
										
											2020-02-06 07:58:38 -08:00
+									"github.com/prometheus/prometheus/storage"
-												Cleanup after merging tsdb into prometheus

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-08-13 01:34:14 -07:00
+									"github.com/prometheus/prometheus/tsdb/chunkenc"
 									"github.com/prometheus/prometheus/tsdb/chunks"
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
-												Cleanup after merging tsdb into prometheus

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-08-13 01:34:14 -07:00
+									"github.com/prometheus/prometheus/tsdb/index"
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+									"github.com/prometheus/prometheus/tsdb/record"
 									"github.com/prometheus/prometheus/tsdb/tombstones"
-												Cleanup after merging tsdb into prometheus

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-08-13 01:34:14 -07:00
+									"github.com/prometheus/prometheus/tsdb/wal"
-												Add new interfaces and skeleton

											
										
										
											2016-12-04 04:16:11 -08:00
+								)
-												Write to WAL before appending to memory storage

											
										
										
											2017-01-17 07:33:58 -08:00
+								var (
-												tsdb: error on series with duplicate labels (#6664)

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-01-20 03:05:27 -08:00
+									// ErrInvalidSample is returned if an appended sample is not valid and can't
 									// be ingested.
 									ErrInvalidSample = errors.New("invalid sample")
-												Write to WAL before appending to memory storage

											
										
										
											2017-01-17 07:33:58 -08:00
+								)
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+								// Head handles reads and writes of time series data within a time window.
 								type Head struct {
-												Merge the 2.13 release branch to master (#6117)


											
										
										
											2019-10-09 08:41:46 -07:00
+									// Keep all 64bit atomically accessed variables at the top of this struct.
 									// See https://golang.org/pkg/sync/atomic/#pkg-note-BUG for more info.
 									chunkRange       int64
 									numSeries        uint64
 									minTime, maxTime int64 // Current min and max of the samples included in the head.
 									minValidTime     int64 // Mint allowed to be added to the head. It shouldn't be lower than the maxt of the last persisted block.
 									lastSeriesID     uint64
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									metrics      *headMetrics
 									wal          *wal.WAL
 									logger       log.Logger
 									appendPool   sync.Pool
 									seriesPool   sync.Pool
 									bytesPool    sync.Pool
 									memChunkPool sync.Pool
-												Count writer references on head blocks

											
										
										
											2017-02-04 02:53:52 -08:00
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									// All series addressable by their ID or hash.
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+									series         *stripeSeries
 									seriesCallback SeriesLifecycleCallback
-												Consolidate mem index into HeadBlock

											
										
										
											2016-12-21 16:12:28 -08:00
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									symMtx  sync.RWMutex
 									symbols map[string]struct{}
-												Fix punctuation nits

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2020-02-17 10:37:09 -08:00
+									values  map[string]stringset // Label names to possible values.
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
-												Keep series that are still in WAL in checkpoints (#577)

If all the samples are deleted for a series,
we should still keep the series in the WAL as
anything else reading the WAL will still care
about it in order to understand the samples.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-04-09 06:16:24 -07:00
+									deletedMtx sync.Mutex
 									deleted    map[uint64]int // Deleted series, and what WAL segment they must be kept until.
-												Fix punctuation nits

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2020-02-17 10:37:09 -08:00
+									postings *index.MemPostings // Postings lists for terms.
-												Head Cardinality Status Page (#6125)

* Adding TSDB Head Stats like cardinality to Status Page

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Moving mutx to Head

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Renaming variables

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Renaming variables and html

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Removing unwanted whitespaces

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Adding Tests, Benchmarks and Max Heap for Postings Stats

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Adding more tests for postingstats and web handler

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Adding more tests for postingstats and web handler

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Remove generated asset file that is no longer used

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Changing comment and variable name for more readability

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Using time.Duration in postings status function and removing refresh button from web page

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

											
										
										
											2019-11-04 18:06:13 -08:00
-												Bring back tombstones to Head block (#6542)

* Bring back tombstones to Head block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add test cases

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Cleanup

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-01-20 07:38:00 -08:00
+									tombstones *tombstones.MemTombstones
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+									iso *isolation
-												Head Cardinality Status Page (#6125)

* Adding TSDB Head Stats like cardinality to Status Page

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Moving mutx to Head

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Renaming variables

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Renaming variables and html

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Removing unwanted whitespaces

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Adding Tests, Benchmarks and Max Heap for Postings Stats

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Adding more tests for postingstats and web handler

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Adding more tests for postingstats and web handler

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Remove generated asset file that is no longer used

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Changing comment and variable name for more readability

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Using time.Duration in postings status function and removing refresh button from web page

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

											
										
										
											2019-11-04 18:06:13 -08:00
+									cardinalityMutex      sync.Mutex
-												Fix punctuation nits

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2020-02-17 10:37:09 -08:00
+									cardinalityCache      *index.PostingsStats // Posting stats cache which will expire after 30sec.
 									lastPostingsStatsCall time.Duration        // Last posting stats call (PostingsCardinalityStats()) time for caching.
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
 									// chunkDiskMapper is used to write and read Head chunks to/from disk.
 									chunkDiskMapper *chunks.ChunkDiskMapper
 									// chunkDirRoot is the parent directory of the chunks directory.
 									chunkDirRoot string
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 02:03:23 -07:00
 									closedMtx sync.Mutex
 									closed    bool
-												Add new interfaces and skeleton

											
										
										
											2016-12-04 04:16:11 -08:00
+								}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+								type headMetrics struct {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									activeAppenders          prometheus.Gauge
 									series                   prometheus.GaugeFunc
 									seriesCreated            prometheus.Counter
 									seriesRemoved            prometheus.Counter
 									seriesNotFound           prometheus.Counter
 									chunks                   prometheus.Gauge
 									chunksCreated            prometheus.Counter
 									chunksRemoved            prometheus.Counter
 									gcDuration               prometheus.Summary
 									samplesAppended          prometheus.Counter
 									outOfBoundSamples        prometheus.Counter
 									outOfOrderSamples        prometheus.Counter
 									walTruncateDuration      prometheus.Summary
 									walCorruptionsTotal      prometheus.Counter
 									headTruncateFail         prometheus.Counter
 									headTruncateTotal        prometheus.Counter
 									checkpointDeleteFail     prometheus.Counter
 									checkpointDeleteTotal    prometheus.Counter
 									checkpointCreationFail   prometheus.Counter
 									checkpointCreationTotal  prometheus.Counter
 									mmapChunkCorruptionTotal prometheus.Counter
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+								}
 								func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+									m := &headMetrics{
 										activeAppenders: prometheus.NewGauge(prometheus.GaugeOpts{
 											Name: "prometheus_tsdb_head_active_appenders",
 											Help: "Number of currently active appender transactions",
 										}),
 										series: prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 											Name: "prometheus_tsdb_head_series",
 											Help: "Total number of series in the head block.",
 										}, func() float64 {
 											return float64(h.NumSeries())
 										}),
 										seriesCreated: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_series_created_total",
 											Help: "Total number of series created in the head",
 										}),
 										seriesRemoved: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_series_removed_total",
 											Help: "Total number of series removed in the head",
 										}),
 										seriesNotFound: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_series_not_found_total",
 											Help: "Total number of requests for series that were not found.",
 										}),
 										chunks: prometheus.NewGauge(prometheus.GaugeOpts{
 											Name: "prometheus_tsdb_head_chunks",
 											Help: "Total number of chunks in the head block.",
 										}),
 										chunksCreated: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_chunks_created_total",
 											Help: "Total number of chunks created in the head",
 										}),
 										chunksRemoved: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_chunks_removed_total",
 											Help: "Total number of chunks removed in the head",
 										}),
 										gcDuration: prometheus.NewSummary(prometheus.SummaryOpts{
 											Name: "prometheus_tsdb_head_gc_duration_seconds",
 											Help: "Runtime of garbage collection in the head block.",
 										}),
 										walTruncateDuration: prometheus.NewSummary(prometheus.SummaryOpts{
 											Name: "prometheus_tsdb_wal_truncate_duration_seconds",
 											Help: "Duration of WAL truncation.",
 										}),
 										walCorruptionsTotal: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_wal_corruptions_total",
 											Help: "Total number of WAL corruptions.",
 										}),
 										samplesAppended: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_samples_appended_total",
 											Help: "Total number of appended samples.",
 										}),
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										outOfBoundSamples: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_out_of_bound_samples_total",
 											Help: "Total number of out of bound samples ingestion failed attempts.",
 										}),
 										outOfOrderSamples: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_out_of_order_samples_total",
 											Help: "Total number of out of order samples ingestion failed attempts.",
 										}),
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+										headTruncateFail: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_truncations_failed_total",
 											Help: "Total number of head truncations that failed.",
 										}),
 										headTruncateTotal: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_truncations_total",
 											Help: "Total number of head truncations attempted.",
 										}),
 										checkpointDeleteFail: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_checkpoint_deletions_failed_total",
 											Help: "Total number of checkpoint deletions that failed.",
 										}),
 										checkpointDeleteTotal: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_checkpoint_deletions_total",
 											Help: "Total number of checkpoint deletions attempted.",
 										}),
 										checkpointCreationFail: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_checkpoint_creations_failed_total",
 											Help: "Total number of checkpoint creations that failed.",
 										}),
 										checkpointCreationTotal: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_checkpoint_creations_total",
 											Help: "Total number of checkpoint creations attempted.",
 										}),
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										mmapChunkCorruptionTotal: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_mmap_chunk_corruptions_total",
 											Help: "Total number of memory-mapped chunk corruptions.",
 										}),
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+									}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
 									if r != nil {
 										r.MustRegister(
 											m.activeAppenders,
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+											m.series,
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+											m.chunks,
 											m.chunksCreated,
 											m.chunksRemoved,
 											m.seriesCreated,
 											m.seriesRemoved,
-												head: track number of series not found errors in metric

											
										
										
											2017-10-12 06:25:12 -07:00
+											m.seriesNotFound,
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+											m.gcDuration,
 											m.walTruncateDuration,
-												re-add the missing prometheus_tsdb_wal_corruptions_total (#473)

closes https://github.com/prometheus/tsdb/issues/471

after implementing the new WAL this metric was missing so adding it again.
Also added it in a test to make sure it works as expected.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-12-18 02:24:56 -08:00
+											m.walCorruptionsTotal,
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+											m.samplesAppended,
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+											m.outOfBoundSamples,
 											m.outOfOrderSamples,
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 06:48:33 -07:00
+											m.headTruncateFail,
 											m.headTruncateTotal,
-												Add new metrics.

1. 'prometheus_tsdb_wal_truncate_fail' for failed WAL truncation.
2. 'prometheus_tsdb_checkpoint_delete_fail' for failed old checkpoint delete.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 04:49:09 -07:00
+											m.checkpointDeleteFail,
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 06:48:33 -07:00
+											m.checkpointDeleteTotal,
 											m.checkpointCreationFail,
 											m.checkpointCreationTotal,
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+											m.mmapChunkCorruptionTotal,
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+											// Metrics bound to functions and not needed in tests
 											// can be created and registered on the spot.
 											prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 												Name: "prometheus_tsdb_head_max_time",
 												Help: "Maximum timestamp of the head block. The unit is decided by the library consumer.",
 											}, func() float64 {
 												return float64(h.MaxTime())
 											}),
 											prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 												Name: "prometheus_tsdb_head_min_time",
 												Help: "Minimum time bound of the head block. The unit is decided by the library consumer.",
 											}, func() float64 {
 												return float64(h.MinTime())
 											}),
 											prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 												Name: "prometheus_tsdb_isolation_low_watermark",
 												Help: "The lowest TSDB append ID that is still referenced.",
 											}, func() float64 {
 												return float64(h.iso.lowWatermark())
 											}),
 											prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 												Name: "prometheus_tsdb_isolation_high_watermark",
 												Help: "The highest TSDB append ID that has been given out.",
 											}, func() float64 {
-												Optimise lowWatermark in Isolation (#7332)

* Track open appenders in doubly-linked list to make lowWatermark O(1).
* Use RW locks.
* Added BenchmarkIsolationWithState.

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>
											
										
										
											2020-06-03 11:09:05 -07:00
+												return float64(h.iso.lastAppendID())
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+											}),
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+										)
 									}
 									return m
 								}
-												Head Cardinality Status Page (#6125)

* Adding TSDB Head Stats like cardinality to Status Page

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Moving mutx to Head

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Renaming variables

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Renaming variables and html

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Removing unwanted whitespaces

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Adding Tests, Benchmarks and Max Heap for Postings Stats

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Adding more tests for postingstats and web handler

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Adding more tests for postingstats and web handler

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Remove generated asset file that is no longer used

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Changing comment and variable name for more readability

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Using time.Duration in postings status function and removing refresh button from web page

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

											
										
										
											2019-11-04 18:06:13 -08:00
+								const cardinalityCacheExpirationTime = time.Duration(30) * time.Second
 								// PostingsCardinalityStats returns the postings statistics for statsByLabelName,
 								// as produced by h.postings.Stats. The result is cached while h.cardinalityMutex
 								// is held and is reused until more than cardinalityCacheExpirationTime has
 								// passed since it was last computed.
 								// NOTE(review): the cache is not keyed by statsByLabelName, so a call made while
 								// the cache is still fresh returns the cached result whatever label name is passed.
 								func (h *Head) PostingsCardinalityStats(statsByLabelName string) *index.PostingsStats {
 									h.cardinalityMutex.Lock()
 									defer h.cardinalityMutex.Unlock()
 									// Timestamps are whole seconds since the Unix epoch, held as a Duration.
 									now := time.Duration(time.Now().Unix()) * time.Second
 									if age := now - h.lastPostingsStatsCall; age > cardinalityCacheExpirationTime {
 										// Too old: drop the cached value so it is rebuilt below.
 										h.cardinalityCache = nil
 									}
 									if cached := h.cardinalityCache; cached != nil {
 										return cached
 									}
 									stats := h.postings.Stats(statsByLabelName)
 									h.cardinalityCache = stats
 									h.lastPostingsStatsCall = time.Duration(time.Now().Unix()) * time.Second
 									return stats
 								}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+								// NewHead opens the head block in dir.
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-29 23:12:43 -08:00
+								// stripeSize sets the number of entries in the hash map, it must be a power of 2.
 								// A larger stripeSize will allocate more memory up-front, but will increase performance when handling a large number of series.
 								// A smaller stripeSize reduces the memory allocated, but can decrease performance with large number of series.
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+								func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, chunkRange int64, chkDirRoot string, pool chunkenc.Pool, stripeSize int, seriesCallback SeriesLifecycleCallback) (*Head, error) {
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									if l == nil {
 										l = log.NewNopLogger()
 									}
 									if chunkRange < 1 {
 										return nil, errors.Errorf("invalid chunk range %d", chunkRange)
 									}
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+									if seriesCallback == nil {
 										seriesCallback = &noopSeriesLifecycleCallback{}
 									}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+									h := &Head{
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+										wal:        wal,
 										logger:     l,
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+										chunkRange: chunkRange,
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 14:19:32 -07:00
+										minTime:    math.MaxInt64,
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+										maxTime:    math.MinInt64,
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+										series:     newStripeSeries(stripeSize, seriesCallback),
-												Implement Delete on HeadBlock

Signed-off-by: Goutham Veeramachaneni <cs14btech11014@iith.ac.in>

											
										
										
											2017-05-15 10:58:14 -07:00
+										values:     map[string]stringset{},
-												Persist series without allocating the full set

Change index persistence for series to not be accumulated in memory
before being written as one large batch. `Labels` and `ChunkMeta`
objects are reused.
This cuts down memory spikes during compaction of multiple blocks
significantly.

As part of this, the Index{Reader,Writer} now have an explicit notion of
symbols and series must be inserted in order.

											
										
										
											2017-08-05 04:31:48 -07:00
+										symbols:    map[string]struct{}{},
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+										postings:   index.NewUnorderedMemPostings(),
-												Bring back tombstones to Head block (#6542)

* Bring back tombstones to Head block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add test cases

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Cleanup

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-01-20 07:38:00 -08:00
+										tombstones: tombstones.NewMemTombstones(),
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+										iso:        newIsolation(),
-												Keep series that are still in WAL in checkpoints (#577)

If all the samples are deleted for a series,
we should still keep the series in the WAL as
anything else reading the WAL will still care
about it in order to understand the samples.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-04-09 06:16:24 -07:00
+										deleted:    map[uint64]int{},
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										memChunkPool: sync.Pool{
 											New: func() interface{} {
 												return &memChunk{}
 											},
 										},
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+										chunkDirRoot:   chkDirRoot,
 										seriesCallback: seriesCallback,
-												Fix races

											
										
										
											2017-01-07 07:20:32 -08:00
+									}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									h.metrics = newHeadMetrics(h, r)
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									if pool == nil {
 										pool = chunkenc.NewPool()
 									}
 									var err error
 									h.chunkDiskMapper, err = chunks.NewChunkDiskMapper(mmappedChunksDir(chkDirRoot), pool)
 									if err != nil {
 										return nil, err
 									}
-												Filter WAL data in Head, misc fixes

											
										
										
											2017-09-06 07:20:37 -07:00
+									return h, nil
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+								}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
// mmappedChunksDir returns the path of the chunks_head directory inside dir.
+								func mmappedChunksDir(dir string) string { return filepath.Join(dir, "chunks_head") }
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+								// processWALSamples adds a partition of samples it receives to the head and passes
 								// them on to other workers.
 								// Samples with a timestamp before minValidTime are discarded.
 								// Samples whose series reference cannot be resolved through h.series.getByID
 								// are skipped and counted in the returned unknownRefs.
 								// For every remaining sample the series append is called, the chunk metrics
 								// are incremented when a new chunk was created, and the smallest and largest
 								// timestamps are tracked. Each batch read from input is sent to output after
 								// it has been processed, including batches in which samples were skipped.
 								// Once input is closed, the tracked bounds are passed to h.updateMinMaxTime.
 								// The output channel is closed when the function returns.
 								func (h *Head) processWALSamples(
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 14:19:32 -07:00
+									minValidTime int64,
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+									input <-chan []record.RefSample, output chan<- []record.RefSample,
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+								) (unknownRefs uint64) {
 									defer close(output)
-												Keep local cache of ids.

With the various goroutines running, the locking
in getByID is notable. This cuts cpu usage by ~25%
and walltime by ~20%.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 05:51:21 -07:00
+									// Mitigate lock contention in getByID.
 									refSeries := map[uint64]*memSeries{}
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 14:19:32 -07:00
 									// Start from the extreme values so the first processed sample sets both bounds.
+									mint, maxt := int64(math.MaxInt64), int64(math.MinInt64)
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+									for samples := range input {
 										for _, s := range samples {
-												Only send WAL read workers the samples they need.

Calculating the modulus in each worker was a hotspot,
and meant that you had more work to do the more cores you had.
This cuts CPU usage (on my 8 core, 4 real core machine) by
33%, and walltime by 3%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 15:52:26 -07:00
 											// Skip samples older than minValidTime.
+											if s.T < minValidTime {
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+												continue
 											}
-												Keep local cache of ids.

With the various goroutines running, the locking
in getByID is notable. This cuts cpu usage by ~25%
and walltime by ~20%.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 05:51:21 -07:00
+											ms := refSeries[s.Ref]
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+											if ms == nil {
-												Keep local cache of ids.

With the various goroutines running, the locking
in getByID is notable. This cuts cpu usage by ~25%
and walltime by ~20%.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 05:51:21 -07:00
 												// Not in the local cache yet: look the series up in the head and remember it.
+												ms = h.series.getByID(s.Ref)
 												if ms == nil {
 													unknownRefs++
 													continue
 												}
 												refSeries[s.Ref] = ms
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+											}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+											if _, chunkCreated := ms.append(s.T, s.V, 0, h.chunkDiskMapper); chunkCreated {
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+												h.metrics.chunksCreated.Inc()
 												h.metrics.chunks.Inc()
 											}
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+											if s.T > maxt {
 												maxt = s.T
 											}
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 14:19:32 -07:00
+											if s.T < mint {
 												mint = s.T
 											}
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+										}
 										output <- samples
 									}
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 14:19:32 -07:00
+									h.updateMinMaxTime(mint, maxt)
 									return unknownRefs
 								}
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 14:19:32 -07:00
// updateMinMaxTime lowers h.minTime to mint when mint is smaller than the
// current value, and raises h.maxTime to maxt when maxt is larger than the
// current value. Each bound is updated in a compare-and-swap retry loop:
// a failed swap re-reads the current value and re-checks it before retrying.
+								func (h *Head) updateMinMaxTime(mint, maxt int64) {
 									for {
 										lt := h.MinTime()
 										// Nothing to do once the stored minimum is already at or below mint.
 										if mint >= lt {
 											break
 										}
 										// The swap only succeeds if h.minTime still equals lt; otherwise loop again.
 										if atomic.CompareAndSwapInt64(&h.minTime, lt, mint) {
 											break
 										}
 									}
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+									for {
 										ht := h.MaxTime()
 										// Nothing to do once the stored maximum is already at or above maxt.
 										if maxt <= ht {
 											break
 										}
 										// The swap only succeeds if h.maxTime still equals ht; otherwise loop again.
 										if atomic.CompareAndSwapInt64(&h.maxTime, ht, maxt) {
 											break
 										}
 									}
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+								}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+								func (h *Head) loadWAL(r *wal.Reader, multiRef map[uint64]uint64, mmappedChunks map[uint64][]*mmappedChunk) (err error) {
-												Use boolean function instead of postings to drop WAL series

There is no guarantee or requirement for WAL writers to only add
series entries in increasing order of IDs. A postings list cannot look
back and thus unordered WAL entries would skip over IDs to not truncate
from the WAL.
We replace it with a simple boolean check function that does not require
order.

											
										
										
											2017-09-21 02:02:30 -07:00
+									// Track number of samples that referenced a series we don't know about
 									// for error reporting.
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+									var unknownRefs uint64
 									// Start workers that each process samples for a partition of the series ID space.
 									// They are connected through a ring of channels which ensures that all sample batches
 									// read from the WAL are processed in order.
 									var (
-												Remove unnecessary lock in loadWAL (#6107)

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2019-10-07 22:47:40 -07:00
+										wg      sync.WaitGroup
 										n       = runtime.GOMAXPROCS(0)
 										inputs  = make([]chan []record.RefSample, n)
 										outputs = make([]chan []record.RefSample, n)
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
 										dec    record.Decoder
 										shards = make([][]record.RefSample, n)
 										decoded                      = make(chan interface{}, 10)
 										decodeErr, seriesCreationErr error
 										seriesPool                   = sync.Pool{
 											New: func() interface{} {
 												return []record.RefSeries{}
 											},
 										}
 										samplesPool = sync.Pool{
 											New: func() interface{} {
 												return []record.RefSample{}
 											},
 										}
 										tstonesPool = sync.Pool{
 											New: func() interface{} {
 												return []tombstones.Stone{}
 											},
 										}
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+									)
-												Ensure workers terminated fully before reading unknownRefs

											
										
										
											2017-10-11 01:12:29 -07:00
-												move the wal repair logic in db.Open (#633)

* move the wal repair logic in db.Open

This is to allow opening a WAL in read-only mode without triggering a
repair.

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
											
										
										
											2019-06-14 08:39:22 -07:00
+									defer func() {
 										// For CorruptionErr ensure to terminate all workers before exiting.
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+										_, ok := err.(*wal.CorruptionErr)
 										if ok || seriesCreationErr != nil {
-												move the wal repair logic in db.Open (#633)

* move the wal repair logic in db.Open

This is to allow opening a WAL in read-only mode without triggering a
repair.

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
											
										
										
											2019-06-14 08:39:22 -07:00
+											for i := 0; i < n; i++ {
 												close(inputs[i])
 												for range outputs[i] {
 												}
 											}
 											wg.Wait()
 										}
 									}()
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+									wg.Add(n)
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+									for i := 0; i < n; i++ {
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+										outputs[i] = make(chan []record.RefSample, 300)
 										inputs[i] = make(chan []record.RefSample, 300)
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+										go func(input <-chan []record.RefSample, output chan<- []record.RefSample) {
-												no overlapping on compaction when an existing block is not within default boundaries. (#461)

closes https://github.com/prometheus/prometheus/issues/4643

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-12-04 02:30:49 -08:00
+											unknown := h.processWALSamples(h.minValidTime, input, output)
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+											atomic.AddUint64(&unknownRefs, unknown)
-												Ensure workers terminated fully before reading unknownRefs

											
										
										
											2017-10-11 01:12:29 -07:00
+											wg.Done()
-												Only send WAL read workers the samples they need.

Calculating the modulus in each worker was a hotspot,
and meant that you had more work to do the more cores you had.
This cuts CPU usage (on my 8 core, 4 real core machine) by
33%, and walltime by 3%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 15:52:26 -07:00
+										}(inputs[i], outputs[i])
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+									}
-												Use boolean function instead of postings to drop WAL series

There is not guarantee or requirement for WAL writers to only add
series entries in increasing order of IDs. A postings list cannot look
back and thus unordered WAL entries would skip over IDs to not truncate
from the WAL.
We replace it with a simple boolean check function that does not require
order.

											
										
										
											2017-09-21 02:02:30 -07:00
-												Decode WAL in Separate Goroutine (#6230)

* Make WAL replay benchmark more representative

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Move decoding records from the WAL into goroutine

Decoding the WAL records accounts for a significant amount of time on
startup, and can be done in parallel with creating series/samples to
speed up startup. However, records still must be handled in order, so
only a single goroutine can do the decoding.

benchmark
old ns/op     new ns/op     delta
BenchmarkLoadWAL/batches=10,seriesPerBatch=100,samplesPerSeries=7200-8
481607033     391971490     -18.61%
BenchmarkLoadWAL/batches=10,seriesPerBatch=10000,samplesPerSeries=50-8
836394378     629067006     -24.79%
BenchmarkLoadWAL/batches=10,seriesPerBatch=1000,samplesPerSeries=480-8
348238658     234218667     -32.74%

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-11-07 08:26:45 -08:00
+									go func() {
 										defer close(decoded)
 										for r.Next() {
 											rec := r.Record()
 											switch dec.Type(rec) {
 											case record.Series:
 												series := seriesPool.Get().([]record.RefSeries)[:0]
 												series, err = dec.Series(rec, series)
 												if err != nil {
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+													decodeErr = &wal.CorruptionErr{
-												Decode WAL in Separate Goroutine (#6230)

* Make WAL replay benchmark more representative

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Move decoding records from the WAL into goroutine

Decoding the WAL records accounts for a significant amount of time on
startup, and can be done in parallel with creating series/samples to
speed up startup. However, records still must be handled in order, so
only a single goroutine can do the decoding.

benchmark
old ns/op     new ns/op     delta
BenchmarkLoadWAL/batches=10,seriesPerBatch=100,samplesPerSeries=7200-8
481607033     391971490     -18.61%
BenchmarkLoadWAL/batches=10,seriesPerBatch=10000,samplesPerSeries=50-8
836394378     629067006     -24.79%
BenchmarkLoadWAL/batches=10,seriesPerBatch=1000,samplesPerSeries=480-8
348238658     234218667     -32.74%

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-11-07 08:26:45 -08:00
+														Err:     errors.Wrap(err, "decode series"),
 														Segment: r.Segment(),
 														Offset:  r.Offset(),
 													}
 													return
 												}
 												decoded <- series
 											case record.Samples:
 												samples := samplesPool.Get().([]record.RefSample)[:0]
 												samples, err = dec.Samples(rec, samples)
 												if err != nil {
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+													decodeErr = &wal.CorruptionErr{
-												Decode WAL in Separate Goroutine (#6230)

* Make WAL replay benchmark more representative

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Move decoding records from the WAL into goroutine

Decoding the WAL records accounts for a significant amount of time on
startup, and can be done in parallel with creating series/samples to
speed up startup. However, records still must be handled in order, so
only a single goroutine can do the decoding.

benchmark
old ns/op     new ns/op     delta
BenchmarkLoadWAL/batches=10,seriesPerBatch=100,samplesPerSeries=7200-8
481607033     391971490     -18.61%
BenchmarkLoadWAL/batches=10,seriesPerBatch=10000,samplesPerSeries=50-8
836394378     629067006     -24.79%
BenchmarkLoadWAL/batches=10,seriesPerBatch=1000,samplesPerSeries=480-8
348238658     234218667     -32.74%

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-11-07 08:26:45 -08:00
+														Err:     errors.Wrap(err, "decode samples"),
 														Segment: r.Segment(),
 														Offset:  r.Offset(),
 													}
 													return
 												}
 												decoded <- samples
 											case record.Tombstones:
 												tstones := tstonesPool.Get().([]tombstones.Stone)[:0]
 												tstones, err = dec.Tombstones(rec, tstones)
 												if err != nil {
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+													decodeErr = &wal.CorruptionErr{
-												Decode WAL in Separate Goroutine (#6230)

* Make WAL replay benchmark more representative

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Move decoding records from the WAL into goroutine

Decoding the WAL records accounts for a significant amount of time on
startup, and can be done in parallel with creating series/samples to
speed up startup. However, records still must be handled in order, so
only a single goroutine can do the decoding.

benchmark
old ns/op     new ns/op     delta
BenchmarkLoadWAL/batches=10,seriesPerBatch=100,samplesPerSeries=7200-8
481607033     391971490     -18.61%
BenchmarkLoadWAL/batches=10,seriesPerBatch=10000,samplesPerSeries=50-8
836394378     629067006     -24.79%
BenchmarkLoadWAL/batches=10,seriesPerBatch=1000,samplesPerSeries=480-8
348238658     234218667     -32.74%

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-11-07 08:26:45 -08:00
+														Err:     errors.Wrap(err, "decode tombstones"),
 														Segment: r.Segment(),
 														Offset:  r.Offset(),
 													}
 													return
 												}
 												decoded <- tstones
 											default:
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+												decodeErr = &wal.CorruptionErr{
-												Decode WAL in Separate Goroutine (#6230)

* Make WAL replay benchmark more representative

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Move decoding records from the WAL into goroutine

Decoding the WAL records accounts for a significant amount of time on
startup, and can be done in parallel with creating series/samples to
speed up startup. However, records still must be handled in order, so
only a single goroutine can do the decoding.

benchmark
old ns/op     new ns/op     delta
BenchmarkLoadWAL/batches=10,seriesPerBatch=100,samplesPerSeries=7200-8
481607033     391971490     -18.61%
BenchmarkLoadWAL/batches=10,seriesPerBatch=10000,samplesPerSeries=50-8
836394378     629067006     -24.79%
BenchmarkLoadWAL/batches=10,seriesPerBatch=1000,samplesPerSeries=480-8
348238658     234218667     -32.74%

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-11-07 08:26:45 -08:00
+													Err:     errors.Errorf("invalid record type %v", dec.Type(rec)),
-												repair wal when the record cannot be decoded (#453)

* repair wal when the record cannot be decoded

Currently repair is run only when the error happens in the reader.

A corruption can occur after the record is read and when it is decoded.
This change wraps the error at decoding as a CorruptionErr as this error
is expected to trigger a repair.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-11-30 03:37:04 -08:00
+													Segment: r.Segment(),
 													Offset:  r.Offset(),
 												}
-												Decode WAL in Separate Goroutine (#6230)

* Make WAL replay benchmark more representative

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Move decoding records from the WAL into goroutine

Decoding the WAL records accounts for a significant amount of time on
startup, and can be done in parallel with creating series/samples to
speed up startup. However, records still must be handled in order, so
only a single goroutine can do the decoding.

benchmark
old ns/op     new ns/op     delta
BenchmarkLoadWAL/batches=10,seriesPerBatch=100,samplesPerSeries=7200-8
481607033     391971490     -18.61%
BenchmarkLoadWAL/batches=10,seriesPerBatch=10000,samplesPerSeries=50-8
836394378     629067006     -24.79%
BenchmarkLoadWAL/batches=10,seriesPerBatch=1000,samplesPerSeries=480-8
348238658     234218667     -32.74%

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-11-07 08:26:45 -08:00
+												return
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+											}
-												Decode WAL in Separate Goroutine (#6230)

* Make WAL replay benchmark more representative

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Move decoding records from the WAL into goroutine

Decoding the WAL records accounts for a significant amount of time on
startup, and can be done in parallel with creating series/samples to
speed up startup. However, records still must be handled in order, so
only a single goroutine can do the decoding.

benchmark
old ns/op     new ns/op     delta
BenchmarkLoadWAL/batches=10,seriesPerBatch=100,samplesPerSeries=7200-8
481607033     391971490     -18.61%
BenchmarkLoadWAL/batches=10,seriesPerBatch=10000,samplesPerSeries=50-8
836394378     629067006     -24.79%
BenchmarkLoadWAL/batches=10,seriesPerBatch=1000,samplesPerSeries=480-8
348238658     234218667     -32.74%

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-11-07 08:26:45 -08:00
+										}
 									}()
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+								Outer:
-												Decode WAL in Separate Goroutine (#6230)

* Make WAL replay benchmark more representative

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Move decoding records from the WAL into goroutine

Decoding the WAL records accounts for a significant amount of time on
startup, and can be done in parallel with creating series/samples to
speed up startup. However, records still must be handled in order, so
only a single goroutine can do the decoding.

benchmark
old ns/op     new ns/op     delta
BenchmarkLoadWAL/batches=10,seriesPerBatch=100,samplesPerSeries=7200-8
481607033     391971490     -18.61%
BenchmarkLoadWAL/batches=10,seriesPerBatch=10000,samplesPerSeries=50-8
836394378     629067006     -24.79%
BenchmarkLoadWAL/batches=10,seriesPerBatch=1000,samplesPerSeries=480-8
348238658     234218667     -32.74%

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-11-07 08:26:45 -08:00
+									for d := range decoded {
 										switch v := d.(type) {
 										case []record.RefSeries:
 											for _, s := range v {
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+												series, created, err := h.getOrCreateWithID(s.Ref, s.Labels.Hash(), s.Labels)
 												if err != nil {
 													seriesCreationErr = err
 													break Outer
 												}
-												Handle multiple refs for the same series when WAL reading. (#623)

This can happen if a given series is created/truncated/recreated.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-06-06 06:28:54 -07:00
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+												if created {
 													// If this series gets a duplicate record, we don't restore its mmapped chunks,
 													// and instead restore everything from WAL records.
 													series.mmappedChunks = mmappedChunks[series.ref]
 													h.metrics.chunks.Add(float64(len(series.mmappedChunks)))
 													h.metrics.chunksCreated.Add(float64(len(series.mmappedChunks)))
 													if len(series.mmappedChunks) > 0 {
 														h.updateMinMaxTime(series.minTime(), series.maxTime())
 													}
 												} else {
 													// TODO(codesome) Discard old samples and mmapped chunks and use mmap chunks for the new series ID.
-												Handle multiple refs for the same series when WAL reading. (#623)

This can happen if a given series is created/truncated/recreated.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-06-06 06:28:54 -07:00
+													// There's already a different ref for this series.
 													multiRef[s.Ref] = series.ref
 												}
-												Create series with ID recorded in WAL when reading it back

											
										
										
											2017-09-19 01:20:19 -07:00
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+												if h.lastSeriesID < s.Ref {
 													h.lastSeriesID = s.Ref
 												}
-												Create series with ID recorded in WAL when reading it back

											
										
										
											2017-09-19 01:20:19 -07:00
+											}
-												Decode WAL in Separate Goroutine (#6230)

* Make WAL replay benchmark more representative

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Move decoding records from the WAL into goroutine

Decoding the WAL records accounts for a significant amount of time on
startup, and can be done in parallel with creating series/samples to
speed up startup. However, records still must be handled in order, so
only a single goroutine can do the decoding.

benchmark
old ns/op     new ns/op     delta
BenchmarkLoadWAL/batches=10,seriesPerBatch=100,samplesPerSeries=7200-8
481607033     391971490     -18.61%
BenchmarkLoadWAL/batches=10,seriesPerBatch=10000,samplesPerSeries=50-8
836394378     629067006     -24.79%
BenchmarkLoadWAL/batches=10,seriesPerBatch=1000,samplesPerSeries=480-8
348238658     234218667     -32.74%

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-11-07 08:26:45 -08:00
+											//lint:ignore SA6002 relax staticcheck verification.
 											seriesPool.Put(v)
 										case []record.RefSample:
 											samples := v
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+											// We split up the samples into chunks of 5000 samples or less.
 											// With O(300 * #cores) in-flight sample batches, large scrapes could otherwise
 											// cause thousands of very large in flight buffers occupying large amounts
 											// of unused memory.
 											for len(samples) > 0 {
-												Only send WAL read workers the samples they need.

Calculating the modulus in each worker was a hotspot,
and meant that you had more work to do the more cores you had.
This cuts CPU usage (on my 8 core, 4 real core machine) by
33%, and walltime by 3%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 15:52:26 -07:00
+												m := 5000
 												if len(samples) < m {
 													m = len(samples)
 												}
 												for i := 0; i < n; i++ {
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+													var buf []record.RefSample
-												Only send WAL read workers the samples they need.

Calculating the modulus in each worker was a hotspot,
and meant that you had more work to do the more cores you had.
This cuts CPU usage (on my 8 core, 4 real core machine) by
33%, and walltime by 3%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 15:52:26 -07:00
+													select {
 													case buf = <-outputs[i]:
 													default:
 													}
 													shards[i] = buf[:0]
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+												}
-												Only send WAL read workers the samples they need.

Calculating the modulus in each worker was a hotspot,
and meant that you had more work to do the more cores you had.
This cuts CPU usage (on my 8 core, 4 real core machine) by
33%, and walltime by 3%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 15:52:26 -07:00
+												for _, sam := range samples[:m] {
-												Handle multiple refs for the same series when WAL reading. (#623)

This can happen if a given series is created/truncated/recreated.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-06-06 06:28:54 -07:00
+													if r, ok := multiRef[sam.Ref]; ok {
 														sam.Ref = r
 													}
-												Only send WAL read workers the samples they need.

Calculating the modulus in each worker was a hotspot,
and meant that you had more work to do the more cores you had.
This cuts CPU usage (on my 8 core, 4 real core machine) by
33%, and walltime by 3%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 15:52:26 -07:00
+													mod := sam.Ref % uint64(n)
 													shards[mod] = append(shards[mod], sam)
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+												}
-												Only send WAL read workers the samples they need.

Calculating the modulus in each worker was a hotspot,
and meant that you had more work to do the more cores you had.
This cuts CPU usage (on my 8 core, 4 real core machine) by
33%, and walltime by 3%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 15:52:26 -07:00
+												for i := 0; i < n; i++ {
 													inputs[i] <- shards[i]
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+												}
-												Only send WAL read workers the samples they need.

Calculating the modulus in each worker was a hotspot,
and meant that you had more work to do the more cores you had.
This cuts CPU usage (on my 8 core, 4 real core machine) by
33%, and walltime by 3%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 15:52:26 -07:00
+												samples = samples[m:]
-												head: limit WAL sample processing batch size

											
										
										
											2017-10-23 07:22:24 -07:00
+											}
-												Decode WAL in Separate Goroutine (#6230)

* Make WAL replay benchmark more representative

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Move decoding records from the WAL into goroutine

Decoding the WAL records accounts for a significant amount of time on
startup, and can be done in parallel with creating series/samples to
speed up startup. However, records still must be handled in order, so
only a single goroutine can do the decoding.

benchmark
old ns/op     new ns/op     delta
BenchmarkLoadWAL/batches=10,seriesPerBatch=100,samplesPerSeries=7200-8
481607033     391971490     -18.61%
BenchmarkLoadWAL/batches=10,seriesPerBatch=10000,samplesPerSeries=50-8
836394378     629067006     -24.79%
BenchmarkLoadWAL/batches=10,seriesPerBatch=1000,samplesPerSeries=480-8
348238658     234218667     -32.74%

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-11-07 08:26:45 -08:00
+											//lint:ignore SA6002 relax staticcheck verification.
 											samplesPool.Put(v)
 										case []tombstones.Stone:
 											for _, s := range v {
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+												for _, itv := range s.Intervals {
-												no overlapping on compaction when an existing block is not within default boundaries. (#461)

closes https://github.com/prometheus/prometheus/issues/4643

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-12-04 02:30:49 -08:00
+													if itv.Maxt < h.minValidTime {
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+														continue
 													}
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+													if m := h.series.getByID(s.Ref); m == nil {
-												Don't crash on an unknown tombstone ref. (#604)

Fixes https://github.com/prometheus/prometheus/issues/5562

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-05-16 06:36:44 -07:00
+														unknownRefs++
 														continue
 													}
-												Bring back tombstones to Head block (#6542)

* Bring back tombstones to Head block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add test cases

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Cleanup

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-01-20 07:38:00 -08:00
+													h.tombstones.AddInterval(s.Ref, itv)
-												Filter WAL data in Head, misc fixes

											
										
										
											2017-09-06 07:20:37 -07:00
+												}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+											}
-												Decode WAL in Separate Goroutine (#6230)

* Make WAL replay benchmark more representative

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Move decoding records from the WAL into goroutine

Decoding the WAL records accounts for a significant amount of time on
startup, and can be done in parallel with creating series/samples to
speed up startup. However, records still must be handled in order, so
only a single goroutine can do the decoding.

benchmark
old ns/op     new ns/op     delta
BenchmarkLoadWAL/batches=10,seriesPerBatch=100,samplesPerSeries=7200-8
481607033     391971490     -18.61%
BenchmarkLoadWAL/batches=10,seriesPerBatch=10000,samplesPerSeries=50-8
836394378     629067006     -24.79%
BenchmarkLoadWAL/batches=10,seriesPerBatch=1000,samplesPerSeries=480-8
348238658     234218667     -32.74%

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-11-07 08:26:45 -08:00
+											//lint:ignore SA6002 relax staticcheck verification.
 											tstonesPool.Put(v)
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+										default:
-												Decode WAL in Separate Goroutine (#6230)

* Make WAL replay benchmark more representative

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Move decoding records from the WAL into goroutine

Decoding the WAL records accounts for a significant amount of time on
startup, and can be done in parallel with creating series/samples to
speed up startup. However, records still must be handled in order, so
only a single goroutine can do the decoding.

benchmark
old ns/op     new ns/op     delta
BenchmarkLoadWAL/batches=10,seriesPerBatch=100,samplesPerSeries=7200-8
481607033     391971490     -18.61%
BenchmarkLoadWAL/batches=10,seriesPerBatch=10000,samplesPerSeries=50-8
836394378     629067006     -24.79%
BenchmarkLoadWAL/batches=10,seriesPerBatch=1000,samplesPerSeries=480-8
348238658     234218667     -32.74%

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-11-07 08:26:45 -08:00
+											panic(fmt.Errorf("unexpected decoded type: %T", d))
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+										}
 									}
-												Use boolean function instead of postings to drop WAL series

There is no guarantee or requirement for WAL writers to only add
series entries in increasing order of IDs. A postings list cannot look
back and thus unordered WAL entries would skip over IDs to not truncate
from the WAL.
We replace it with a simple boolean check function that does not require
order.

											
										
										
											2017-09-21 02:02:30 -07:00
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+									if decodeErr != nil {
 										return decodeErr
 									}
 									if seriesCreationErr != nil {
 										// Drain the channel to unblock the goroutine.
 										for range decoded {
 										}
 										return seriesCreationErr
-												Decode WAL in Separate Goroutine (#6230)

* Make WAL replay benchmark more representative

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Move decoding records from the WAL into goroutine

Decoding the WAL records accounts for a significant amount of time on
startup, and can be done in parallel with creating series/samples to
speed up startup. However, records still must be handled in order, so
only a single goroutine can do the decoding.

benchmark
old ns/op     new ns/op     delta
BenchmarkLoadWAL/batches=10,seriesPerBatch=100,samplesPerSeries=7200-8
481607033     391971490     -18.61%
BenchmarkLoadWAL/batches=10,seriesPerBatch=10000,samplesPerSeries=50-8
836394378     629067006     -24.79%
BenchmarkLoadWAL/batches=10,seriesPerBatch=1000,samplesPerSeries=480-8
348238658     234218667     -32.74%

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-11-07 08:26:45 -08:00
+									}
-												Only send WAL read workers the samples they need.

Calculating the modulus in each worker was a hotspot,
and meant that you had more work to do the more cores you had.
This cuts CPU usage (on my 8 core, 4 real core machine) by
33%, and walltime by 3%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 15:52:26 -07:00
+									// Signal termination to each worker and wait for it to close its output channel.
 									for i := 0; i < n; i++ {
 										close(inputs[i])
 										for range outputs[i] {
 										}
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+									}
-												Ensure workers terminated fully before reading unknownRefs

											
										
										
											2017-10-11 01:12:29 -07:00
+									wg.Wait()
-												move the wal repair logic in db.Open (#633)

* move the wal repair logic in db.Open

This is to allow opening a WAL in read-only mode without triggering a
repair.

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
											
										
										
											2019-06-14 08:39:22 -07:00
+									if r.Err() != nil {
 										return errors.Wrap(r.Err(), "read records")
 									}
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+									if unknownRefs > 0 {
-												Capitalizing first letter of all log lines (#7043)

Signed-off-by: Marek Slabicki <thaniri@gmail.com>
											
										
										
											2020-04-11 01:22:18 -07:00
+										level.Warn(h.logger).Log("msg", "Unknown series references", "count", unknownRefs)
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+									}
 									return nil
 								}
 								// Init loads data from the write ahead log and prepares the head for writes.
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+								// It should be called before using an appender so that it
-												no overlapping on compaction when an existing block is not within default boundaries. (#461)

closes https://github.com/prometheus/prometheus/issues/4643

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-12-04 02:30:49 -08:00
+								// limits the ingested samples to the head min valid time.
 								func (h *Head) Init(minValidTime int64) error {
 									h.minValidTime = minValidTime
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+									defer h.postings.EnsureOrder()
-												no overlapping on compaction when an existing block is not within default boundaries. (#461)

closes https://github.com/prometheus/prometheus/issues/4643

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-12-04 02:30:49 -08:00
+									defer h.gc() // After loading the wal remove the obsolete data from the head.
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
 									if h.wal == nil {
 										return nil
 									}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									level.Info(h.logger).Log("msg", "Replaying WAL and on-disk memory mappable chunks if any, this may take a while")
-												Log WAL replay duration

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-03-03 06:11:14 -08:00
+									start := time.Now()
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
 									mmappedChunks, err := h.loadMmappedChunks()
 									if err != nil {
 										level.Error(h.logger).Log("msg", "Loading on-disk chunks failed", "err", err)
 										if _, ok := errors.Cause(err).(*chunks.CorruptionErr); ok {
 											h.metrics.mmapChunkCorruptionTotal.Inc()
 										}
 										// If this fails, data will be recovered from WAL.
 										// Hence we wont lose any data (given WAL is not corrupt).
 										h.removeCorruptedMmappedChunks(err)
 									}
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+									// Backfill the checkpoint first if it exists.
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+									dir, startFrom, err := wal.LastCheckpoint(h.wal.Dir())
 									if err != nil && err != record.ErrNotFound {
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+										return errors.Wrap(err, "find last checkpoint")
 									}
-												Handle multiple refs for the same series when WAL reading. (#623)

This can happen if a given series is created/truncated/recreated.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-06-06 06:28:54 -07:00
+									multiRef := map[uint64]uint64{}
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+									if err == nil {
-												refactor NewSegmentsRangeReader to take multi WAL ranges (#449)

* refactor NewSegmentsRangeReader to take multi WAL ranges

In case of an error when checkpointing the WAL, the error doesn't show
the exact WAL index that is corrupted. This is because it uses
MultiReader to read multiple WAL files.
This refactoring allows the NewSegmentsRangeReader to take more than a
single WAL range and it reads all of the ranges by iterating each one.

this changes the logs from
create checkpoint: read segments: corruption after 4841144384 bytes:...
to
create checkpoint: read segments: corruption in segment
data/wal/00017351 at 123142208: ...

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>

											
										
										
											2018-11-30 06:46:16 -08:00
+										sr, err := wal.NewSegmentsReader(dir)
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+										if err != nil {
 											return errors.Wrap(err, "open checkpoint")
 										}
-												move the wal repair logic in db.Open (#633)

* move the wal repair logic in db.Open

This is to allow opening a WAL in read-only mode without triggering a
repair.

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
											
										
										
											2019-06-14 08:39:22 -07:00
+										defer func() {
 											if err := sr.Close(); err != nil {
-												Capitalizing first letter of all log lines (#7043)

Signed-off-by: Marek Slabicki <thaniri@gmail.com>
											
										
										
											2020-04-11 01:22:18 -07:00
+												level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
-												move the wal repair logic in db.Open (#633)

* move the wal repair logic in db.Open

This is to allow opening a WAL in read-only mode without triggering a
repair.

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
											
										
										
											2019-06-14 08:39:22 -07:00
+											}
 										}()
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
 										// A corrupted checkpoint is a hard error for now and requires user
 										// intervention. There's likely little data that can be recovered anyway.
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										if err := h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks); err != nil {
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+											return errors.Wrap(err, "backfill checkpoint")
 										}
-												more descriptive var names and some more logging. (#405)

* more descriptive checkpoint var names and some more logging.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-10-11 08:23:52 -07:00
+										startFrom++
-												Add logging during WAL replay

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-07-13 10:10:44 -07:00
+										level.Info(h.logger).Log("msg", "WAL checkpoint loaded")
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+									}
-												Always create a new clean segment when starting the WAL. (#608)

* Always create a new clean segment when starting the WAL.
* Ensure we flush the last page after repairing and before recreating the
new segment in Repair.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-05-24 11:33:28 -07:00
+									// Find the last segment.
 									_, last, err := h.wal.Segments()
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+									if err != nil {
-												Always create a new clean segment when starting the WAL. (#608)

* Always create a new clean segment when starting the WAL.
* Ensure we flush the last page after repairing and before recreating the
new segment in Repair.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-05-24 11:33:28 -07:00
+										return errors.Wrap(err, "finding WAL segments")
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									}
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
-												Always create a new clean segment when starting the WAL. (#608)

* Always create a new clean segment when starting the WAL.
* Ensure we flush the last page after repairing and before recreating the
new segment in Repair.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-05-24 11:33:28 -07:00
+									// Backfill segments from the most recent checkpoint onwards.
 									for i := startFrom; i <= last; i++ {
 										s, err := wal.OpenReadSegment(wal.SegmentName(h.wal.Dir(), i))
 										if err != nil {
 											return errors.Wrap(err, fmt.Sprintf("open WAL segment: %d", i))
 										}
 										sr := wal.NewSegmentBufReader(s)
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										err = h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks)
-												move the wal repair logic in db.Open (#633)

* move the wal repair logic in db.Open

This is to allow opening a WAL in read-only mode without triggering a
repair.

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
											
										
										
											2019-06-14 08:39:22 -07:00
+										if err := sr.Close(); err != nil {
-												Capitalizing first letter of all log lines (#7043)

Signed-off-by: Marek Slabicki <thaniri@gmail.com>
											
										
										
											2020-04-11 01:22:18 -07:00
+											level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
-												Always create a new clean segment when starting the WAL. (#608)

* Always create a new clean segment when starting the WAL.
* Ensure we flush the last page after repairing and before recreating the
new segment in Repair.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-05-24 11:33:28 -07:00
+										}
-												move the wal repair logic in db.Open (#633)

* move the wal repair logic in db.Open

This is to allow opening a WAL in read-only mode without triggering a
repair.

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
											
										
										
											2019-06-14 08:39:22 -07:00
+										if err != nil {
 											return err
-												Always create a new clean segment when starting the WAL. (#608)

* Always create a new clean segment when starting the WAL.
* Ensure we flush the last page after repairing and before recreating the
new segment in Repair.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-05-24 11:33:28 -07:00
+										}
-												Add logging during WAL replay

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-07-13 10:10:44 -07:00
+										level.Info(h.logger).Log("msg", "WAL segment loaded", "segment", i, "maxSegment", last)
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+									}
-												Always create a new clean segment when starting the WAL. (#608)

* Always create a new clean segment when starting the WAL.
* Ensure we flush the last page after repairing and before recreating the
new segment in Repair.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-05-24 11:33:28 -07:00
-												Log WAL replay duration

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-03-03 06:11:14 -08:00
+									level.Info(h.logger).Log("msg", "WAL replay completed", "duration", time.Since(start).String())
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									return nil
-												Make WAL for HeadBlock composeable.

											
										
										
											2017-05-13 09:14:18 -07:00
+								}
-												Handle compaction trigger and reinitializing in DB

											
										
										
											2017-01-06 03:37:28 -08:00
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+								func (h *Head) loadMmappedChunks() (map[uint64][]*mmappedChunk, error) {
 									mmappedChunks := map[uint64][]*mmappedChunk{}
 									if err := h.chunkDiskMapper.IterateAllChunks(func(seriesRef, chunkRef uint64, mint, maxt int64, numSamples uint16) error {
 										if maxt < h.minValidTime {
 											return nil
 										}
 										slice := mmappedChunks[seriesRef]
 										if len(slice) > 0 {
 											if slice[len(slice)-1].maxTime >= mint {
 												return errors.Errorf("out of sequence m-mapped chunk for series ref %d", seriesRef)
 											}
 										}
 										slice = append(slice, &mmappedChunk{
 											ref:        chunkRef,
 											minTime:    mint,
 											maxTime:    maxt,
 											numSamples: numSamples,
 										})
 										mmappedChunks[seriesRef] = slice
 										return nil
 									}); err != nil {
 										return nil, errors.Wrap(err, "iterate on on-disk chunks")
 									}
 									return mmappedChunks, nil
 								}
 								// removeCorruptedMmappedChunks asks the chunk disk mapper to delete the
 								// chunk files affected by the given corruption error and then loads the
 								// m-mapped chunks again.
 								//
 								// It returns the freshly loaded chunks on success. If either the deletion or
 								// the second load fails, the failure is only logged and an empty map is
 								// returned instead.
 								func (h *Head) removeCorruptedMmappedChunks(err error) map[uint64][]*mmappedChunk {
 									level.Info(h.logger).Log("msg", "Deleting mmapped chunk files")

 									deleteErr := h.chunkDiskMapper.DeleteCorrupted(err)
 									if deleteErr != nil {
 										level.Info(h.logger).Log("msg", "Deletion of mmap chunk files failed, discarding chunk files completely", "err", deleteErr)
 										return map[uint64][]*mmappedChunk{}
 									}

 									level.Info(h.logger).Log("msg", "Deletion of mmap chunk files successful, reattempting m-mapping the on-disk chunks")

 									reloaded, loadErr := h.loadMmappedChunks()
 									if loadErr == nil {
 										return reloaded
 									}

 									// The second attempt failed as well: report it and hand back nothing.
 									level.Error(h.logger).Log("msg", "Loading on-disk chunks failed, discarding chunk files completely", "err", loadErr)
 									return map[uint64][]*mmappedChunk{}
 								}
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+								// Truncate removes old data before mint from the head.
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 06:48:33 -07:00
+								func (h *Head) Truncate(mint int64) (err error) {
 									defer func() {
 										if err != nil {
 											h.metrics.headTruncateFail.Inc()
 										}
 									}()
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 14:19:32 -07:00
+									initialize := h.MinTime() == math.MaxInt64
-												Filter WAL data in Head, misc fixes

											
										
										
											2017-09-06 07:20:37 -07:00
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 14:19:32 -07:00
+									if h.MinTime() >= mint && !initialize {
-												Add tests for GC and chunk truncation

											
										
										
											2017-09-01 05:38:49 -07:00
+										return nil
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									}
 									atomic.StoreInt64(&h.minTime, mint)
-												fix race for minValidTime (#479)

it happens when truncating the WAL and another goroutine creates a new
Appender()

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-12-14 03:42:07 -08:00
+									atomic.StoreInt64(&h.minValidTime, mint)
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
-												Fix min/max time handling and concurrent crc32 usage

											
										
										
											2017-09-07 04:04:02 -07:00
+									// Ensure that max time is at least as high as min time.
 									for h.MaxTime() < mint {
 										atomic.CompareAndSwapInt64(&h.maxTime, h.MaxTime(), mint)
 									}
-												Filter WAL data in Head, misc fixes

											
										
										
											2017-09-06 07:20:37 -07:00
+									// This was an initial call to Truncate after loading blocks on startup.
 									// We haven't read back the WAL yet, so do not attempt to truncate it.
 									if initialize {
 										return nil
 									}
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 06:48:33 -07:00
+									h.metrics.headTruncateTotal.Inc()
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									start := time.Now()
 									h.gc()
-												Capitalizing first letter of all log lines (#7043)

Signed-off-by: Marek Slabicki <thaniri@gmail.com>
											
										
										
											2020-04-11 01:22:18 -07:00
+									level.Info(h.logger).Log("msg", "Head GC completed", "duration", time.Since(start))
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									h.metrics.gcDuration.Observe(time.Since(start).Seconds())
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									// Truncate the chunk m-mapper.
 									if err := h.chunkDiskMapper.Truncate(mint); err != nil {
 										return errors.Wrap(err, "truncate chunks.HeadReadWriter")
 									}
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+									if h.wal == nil {
 										return nil
 									}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									start = time.Now()
-												more descriptive var names and some more logging. (#405)

* more descriptive checkpoint var names and some more logging.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-10-11 08:23:52 -07:00
+									first, last, err := h.wal.Segments()
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+									if err != nil {
 										return errors.Wrap(err, "get segment range")
 									}
-												Start a new WAL segement on head truncation. (#605)

This reduces disk space usage to not be a minimum of 3 128MB files
in small setups. This will possibly also help debug wal data issues,
by making things a bit more deterministic.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-06-07 03:35:02 -07:00
+									// Start a new segment, so low ingestion volume TSDB don't have more WAL than
 									// needed.
 									err = h.wal.NextSegment()
 									if err != nil {
 										return errors.Wrap(err, "next segment")
 									}
-												more descriptive var names and some more logging. (#405)

* more descriptive checkpoint var names and some more logging.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-10-11 08:23:52 -07:00
+									last-- // Never consider last segment for checkpoint.
 									if last < 0 {
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+										return nil // no segments yet.
 									}
-												Reduce how much old WAL we keep around. (#7098)

Previously we were keeping up to around 6 hours of WAL around by
removing 1/3 every hour. This was excessive, so switch to removing 2/3,
which will keep up to around 3 hours of WAL around.

This will roughly halve the size of the WAL and halve startup time for
those who are I/O bound. This may increase the checkpoint size for
those with certain churn patterns, but by much less than we're saving
from the segments.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2020-04-07 03:25:57 -07:00
+									// The lower two thirds of segments should contain mostly obsolete samples.
 									// If we have less than two segments, it's not worth checkpointing yet.
 									// With the default 2h blocks, this will keeping up to around 3h worth
 									// of WAL segments.
 									last = first + (last-first)*2/3
-												more descriptive var names and some more logging. (#405)

* more descriptive checkpoint var names and some more logging.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-10-11 08:23:52 -07:00
+									if last <= first {
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+										return nil
 									}
-												Use boolean function instead of postings to drop WAL series

There is not guarantee or requirement for WAL writers to only add
series entries in increasing order of IDs. A postings list cannot look
back and thus unordered WAL entries would skip over IDs to not truncate
from the WAL.
We replace it with a simple boolean check function that does not require
order.

											
										
										
											2017-09-21 02:02:30 -07:00
+									keep := func(id uint64) bool {
-												Keep series that are still in WAL in checkpoints (#577)

If all the samples are deleted for a series,
we should still keep the series in the WAL as
anything else reading the WAL will still care
about it in order to understand the samples.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-04-09 06:16:24 -07:00
+										if h.series.getByID(id) != nil {
 											return true
 										}
 										h.deletedMtx.Lock()
 										_, ok := h.deleted[id]
 										h.deletedMtx.Unlock()
 										return ok
-												[WIP]: WAL implementation

Signed-off-by: Goutham Veeramachaneni <cs14btech11014@iith.ac.in>

											
										
										
											2017-08-31 02:39:22 -07:00
+									}
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 06:48:33 -07:00
+									h.metrics.checkpointCreationTotal.Inc()
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+									if _, err = wal.Checkpoint(h.wal, first, last, keep, mint); err != nil {
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 06:48:33 -07:00
+										h.metrics.checkpointCreationFail.Inc()
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+										return errors.Wrap(err, "create checkpoint")
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									}
-												more descriptive var names and some more logging. (#405)

* more descriptive checkpoint var names and some more logging.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-10-11 08:23:52 -07:00
+									if err := h.wal.Truncate(last + 1); err != nil {
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 06:48:33 -07:00
+										// If truncating fails, we'll just try again at the next checkpoint.
 										// Leftover segments will just be ignored in the future if there's a checkpoint
 										// that supersedes them.
 										level.Error(h.logger).Log("msg", "truncating segments failed", "err", err)
 									}
-												Keep series that are still in WAL in checkpoints (#577)

If all the samples are deleted for a series,
we should still keep the series in the WAL as
anything else reading the WAL will still care
about it in order to understand the samples.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-04-09 06:16:24 -07:00
 									// The checkpoint is written and segments before it are truncated, so we no
 									// longer need to track deleted series that are before it.
 									h.deletedMtx.Lock()
 									for ref, segment := range h.deleted {
 										if segment < first {
 											delete(h.deleted, ref)
 										}
 									}
 									h.deletedMtx.Unlock()
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 06:48:33 -07:00
+									h.metrics.checkpointDeleteTotal.Inc()
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+									if err := wal.DeleteCheckpoints(h.wal.Dir(), last); err != nil {
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 06:48:33 -07:00
+										// Leftover old checkpoints do not cause problems down the line beyond
 										// occupying disk space.
 										// They will just be ignored since a higher checkpoint exists.
 										level.Error(h.logger).Log("msg", "delete old checkpoints", "err", err)
 										h.metrics.checkpointDeleteFail.Inc()
 									}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									h.metrics.walTruncateDuration.Observe(time.Since(start).Seconds())
-												Add tests for GC and chunk truncation

											
										
										
											2017-09-01 05:38:49 -07:00
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+									level.Info(h.logger).Log("msg", "WAL checkpoint complete",
-												more descriptive var names and some more logging. (#405)

* more descriptive checkpoint var names and some more logging.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-10-11 08:23:52 -07:00
+										"first", first, "last", last, "duration", time.Since(start))
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
-												Add tests for GC and chunk truncation

											
										
										
											2017-09-01 05:38:49 -07:00
+									return nil
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+								}
 								// initTime initializes a head with the first timestamp. This only needs to be called
-												Fixs typo: "compltely" to "completely" (#470)

Fix a small typo.
											
										
										
											2018-12-11 12:09:17 -08:00
+								// for a completely fresh head with an empty WAL.
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+								// Returns true if the initialization took an effect.
 								func (h *Head) initTime(t int64) (initialized bool) {
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 14:19:32 -07:00
+									if !atomic.CompareAndSwapInt64(&h.minTime, math.MaxInt64, t) {
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+										return false
 									}
-												Fix min/max time handling and concurrent crc32 usage

											
										
										
											2017-09-07 04:04:02 -07:00
+									// Ensure that max time is initialized to at least the min time we just set.
 									// Concurrent appenders may already have set it to a higher value.
 									atomic.CompareAndSwapInt64(&h.maxTime, math.MinInt64, t)
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									return true
 								}
-												Fixed wrongly handled not ready TSDB on web and API. (#7182)

* fix federate endpoint panic

Signed-off-by: yeya24 <yb532204897@gmail.com>

* Fixed all cases of not ready TSDB being wrongly handled.

* Fixed issue for federation.
* Ensured this will never happen again thanks to interfaces
* Fixes same issue for stats.
* Added tests for readiness.
* Fixed bug in stats. It was:
   status.MaxTime = db.Head().MaxTime()
   status.MinTime = db.Head().MaxTime()


Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Addressed Brian's comments.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Addressed Brian's comments.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
											
										
										
											2020-04-29 09:16:14 -07:00
+								type Stats struct {
 									NumSeries         uint64
 									MinTime, MaxTime  int64
 									IndexPostingStats *index.PostingsStats
 								}
 								// Stats collects a snapshot of the head's current statistics: the series
 								// count, the min/max sample timestamps and the postings cardinality stats
 								// for the given label name. Note that it is expensive to calculate these.
 								func (h *Head) Stats(statsByLabelName string) *Stats {
 									snapshot := new(Stats)
 									// Populate the fields in the same order the values were read originally.
 									snapshot.NumSeries = h.NumSeries()
 									snapshot.MaxTime = h.MaxTime()
 									snapshot.MinTime = h.MinTime()
 									snapshot.IndexPostingStats = h.PostingsCardinalityStats(statsByLabelName)
 									return snapshot
 								}
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 01:50:24 -08:00
+								type RangeHead struct {
-												Add more verbose error handling for closing, reduce locking

This commit introduces error returns in various places and is explicit
about closing persisted blocks.
{Index,Chunk,Tombstone}Readers are more consistent about their Close()
method. Whenever a reader is retrieved, the corresponding close method
must eventually be called. We use this to track pending readers against
persisted blocks.

Querier's against the DB no longer hold a read lock for their entire
lifecycle. This avoids long running queriers to starve new ones when we
have to acquire a write lock when reloading blocks.

											
										
										
											2017-10-09 06:21:46 -07:00
+									head       *Head
 									mint, maxt int64
 								}
-												Reset comment

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-25 16:17:56 -07:00
+								// NewRangeHead returns a *RangeHead.
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 01:50:24 -08:00
+								func NewRangeHead(head *Head, mint, maxt int64) *RangeHead {
 									return &RangeHead{
 										head: head,
 										mint: mint,
 										maxt: maxt,
 									}
 								}
-												Revert head posting optimization

This reverts commit 52630ad0c735f2dce4ce5bb851acb6c5d7df5eb1.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-25 12:13:47 -07:00
+								func (h *RangeHead) Index() (IndexReader, error) {
 									return h.head.indexRange(h.mint, h.maxt), nil
-												Add more verbose error handling for closing, reduce locking

This commit introduces error returns in various places and is explicit
about closing persisted blocks.
{Index,Chunk,Tombstone}Readers are more consistent about their Close()
method. Whenever a reader is retrieved, the corresponding close method
must eventually be called. We use this to track pending readers against
persisted blocks.

Querier's against the DB no longer hold a read lock for their entire
lifecycle. This avoids long running queriers to starve new ones when we
have to acquire a write lock when reloading blocks.

											
										
										
											2017-10-09 06:21:46 -07:00
+								}
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 01:50:24 -08:00
+								func (h *RangeHead) Chunks() (ChunkReader, error) {
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 02:03:23 -07:00
+									return h.head.chunksRange(h.mint, h.maxt, h.head.iso.State())
-												Add more verbose error handling for closing, reduce locking

This commit introduces error returns in various places and is explicit
about closing persisted blocks.
{Index,Chunk,Tombstone}Readers are more consistent about their Close()
method. Whenever a reader is retrieved, the corresponding close method
must eventually be called. We use this to track pending readers against
persisted blocks.

Querier's against the DB no longer hold a read lock for their entire
lifecycle. This avoids long running queriers to starve new ones when we
have to acquire a write lock when reloading blocks.

											
										
										
											2017-10-09 06:21:46 -07:00
+								}
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 01:50:24 -08:00
+								func (h *RangeHead) Tombstones() (tombstones.Reader, error) {
-												Bring back tombstones to Head block (#6542)

* Bring back tombstones to Head block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add test cases

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Cleanup

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-01-20 07:38:00 -08:00
+									return h.head.tombstones, nil
-												Add more verbose error handling for closing, reduce locking

This commit introduces error returns in various places and is explicit
about closing persisted blocks.
{Index,Chunk,Tombstone}Readers are more consistent about their Close()
method. Whenever a reader is retrieved, the corresponding close method
must eventually be called. We use this to track pending readers against
persisted blocks.

Querier's against the DB no longer hold a read lock for their entire
lifecycle. This avoids long running queriers to starve new ones when we
have to acquire a write lock when reloading blocks.

											
										
										
											2017-10-09 06:21:46 -07:00
+								}
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 01:50:24 -08:00
+								func (h *RangeHead) MinTime() int64 {
-												Vertical query merging and compaction (#370)

* Vertical series iterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Select overlapped blocks first in compactor Plan()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Added vertical compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Code cleanup and comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix tests

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add benchmark for compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Perform vertical compaction only when blocks are overlapping.

Actions for vertical compaction:
* Sorting chunk metas
* Calling chunks.MergeOverlappingChunks on the chunks

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Benchmark for vertical compaction

* BenchmarkNormalCompaction => BenchmarkCompaction
* Moved the benchmark from db_test.go to compact_test.go

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Benchmark for query iterator and seek for non overlapping blocks

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Vertical query merge only for overlapping blocks

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Simplify logging in Compact(...)

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Updated CHANGELOG.md

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Calculate overlapping inside populateBlock

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* MinTime and MaxTime for BlockReader.

Using this to find overlapping blocks in populateBlock()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Sort blocks w.r.t. MinTime in reload()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Log about overlapping in LeveledCompactor.write() instead of returning bool

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Log about overlapping inside LeveledCompactor.populateBlock()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Refactor createBlock to take optional []Series

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* review1

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>

* Updated CHANGELOG and minor nits

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* nits

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Updated CHANGELOG

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Refactor iterator and seek benchmarks for Querier.

Also has as overlapping blocks.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Additional test case

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* genSeries takes optional labels. Updated BenchmarkQueryIterator and BenchmarkQuerySeek.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Split genSeries into genSeries and populateSeries

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Check error in benchmark

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Warn about overlapping blocks in reload()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-02-14 05:29:41 -08:00
+									return h.mint
 								}
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 01:50:24 -08:00
+								func (h *RangeHead) MaxTime() int64 {
-												Vertical query merging and compaction (#370)

* Vertical series iterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Select overlapped blocks first in compactor Plan()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Added vertical compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Code cleanup and comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix tests

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add benchmark for compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Perform vertical compaction only when blocks are overlapping.

Actions for vertical compaction:
* Sorting chunk metas
* Calling chunks.MergeOverlappingChunks on the chunks

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Benchmark for vertical compaction

* BenchmarkNormalCompaction => BenchmarkCompaction
* Moved the benchmark from db_test.go to compact_test.go

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Benchmark for query iterator and seek for non overlapping blocks

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Vertical query merge only for overlapping blocks

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Simplify logging in Compact(...)

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Updated CHANGELOG.md

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Calculate overlapping inside populateBlock

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* MinTime and MaxTime for BlockReader.

Using this to find overlapping blocks in populateBlock()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Sort blocks w.r.t. MinTime in reload()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Log about overlapping in LeveledCompactor.write() instead of returning bool

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Log about overlapping inside LeveledCompactor.populateBlock()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Refactor createBlock to take optional []Series

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* review1

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>

* Updated CHANGELOG and minor nits

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* nits

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Updated CHANGELOG

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Refactor iterator and seek benchmarks for Querier.

Also has as overlapping blocks.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Additional test case

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* genSeries takes optional labels. Updated BenchmarkQueryIterator and BenchmarkQuerySeek.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Split genSeries into genSeries and populateSeries

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Check error in benchmark

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Warn about overlapping blocks in reload()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-02-14 05:29:41 -08:00
+									return h.maxt
 								}
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 01:50:24 -08:00
+								func (h *RangeHead) NumSeries() uint64 {
-												Open db in Read only mode (#588)

* Added db read only open mode and use it for the tsdb cli.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2019-07-23 01:04:48 -07:00
+									return h.head.NumSeries()
 								}
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 01:50:24 -08:00
+								func (h *RangeHead) Meta() BlockMeta {
-												Open db in Read only mode (#588)

* Added db read only open mode and use it for the tsdb cli.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2019-07-23 01:04:48 -07:00
+									return BlockMeta{
 										MinTime: h.MinTime(),
 										MaxTime: h.MaxTime(),
 										ULID:    h.head.Meta().ULID,
 										Stats: BlockStats{
 											NumSeries: h.NumSeries(),
 										},
 									}
 								}
-												Fix typos in comments (#254)

a the -> the
timestmap -> timestamp
badded -> padded
its -> it is
callers -> caller's
											
										
										
											2018-01-13 09:51:50 -08:00
+								// initAppender is a helper to initialize the time bounds of the head
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+								// upon the first sample it receives.
 								type initAppender struct {
-												Unify Iterator interfaces. All point to storage now.

This is part of https://github.com/prometheus/prometheus/pull/5882 that can be done to simplify things.
All todos I added will be fixed in follow up PRs.

* querier.Querier, querier.Appender, querier.SeriesSet, and querier.Series interfaces merged
with storage interface.go. All imports that.
* querier.SeriesIterator replaced by chunkenc.Iterator
* Added chunkenc.Iterator.Seek method and tests for xor implementation (?)
* Since we properly handle SelectParams for Select methods I adjusted min max
based on that. This should help in terms of performance for queries with functions like offset.
* added Seek to deletedIterator and test.
* storage/tsdb was removed as it was only a unnecessary glue with incompatible structs.

No logic was changed, only different source of abstractions, so no need for benchmarks.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

											
										
										
											2020-02-06 07:58:38 -08:00
+									app  storage.Appender
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									head *Head
 								}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+								func (a *initAppender) Add(lset labels.Labels, t int64, v float64) (uint64, error) {
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									if a.app != nil {
 										return a.app.Add(lset, t, v)
 									}
-												Remove defer statement in hot path

											
										
										
											2017-09-01 03:09:29 -07:00
+									a.head.initTime(t)
-												TSDB: Isolation: avoid creating appenderId's without appender (#7135)

Prior to this commit we could have situations where we are creating an
appenderId but never creating an appender to go with it, therefore
blocking the low watermak.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-04-17 11:51:03 -07:00
+									a.app = a.head.appender()
-												Remove defer statement in hot path

											
										
										
											2017-09-01 03:09:29 -07:00
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									return a.app.Add(lset, t, v)
 								}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+								func (a *initAppender) AddFast(ref uint64, t int64, v float64) error {
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									if a.app == nil {
-												Make TSDB use storage errors

This fixes #6992, which was introduced by #6777. There was an
intermediate component which translated TSDB errors into storage errors,
but that component was deleted and this bug went unnoticed, until we
were watching at the Prombench results. Without this, scrape will fail
instead of dropping samples or using "Add" when the series have been
garbage collected.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-16 14:52:02 -07:00
+										return storage.ErrNotFound
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									}
 									return a.app.AddFast(ref, t, v)
 								}
 								// Commit delegates to the underlying appender if one was created by a
 								// prior Add. When no sample was ever added there is nothing to commit,
 								// so it only releases the activeAppenders slot that Head.Appender took
 								// when it handed out this initAppender, and returns nil.
 								func (a *initAppender) Commit() error {
 									if a.app == nil {
 										// No underlying appender exists to balance the Inc() done in
 										// Head.Appender, so do it here to avoid leaking the gauge.
 										// NOTE(review): assumes the wrapped appender decrements the gauge
 										// itself when it is committed — confirm against headAppender.
 										a.head.metrics.activeAppenders.Dec()
 										return nil
 									}
 									return a.app.Commit()
 								}
 								// Rollback delegates to the underlying appender if one was created by a
 								// prior Add. When no sample was ever added there is nothing to roll back,
 								// so it only releases the activeAppenders slot that Head.Appender took
 								// when it handed out this initAppender, and returns nil.
 								func (a *initAppender) Rollback() error {
 									if a.app == nil {
 										// No underlying appender exists to balance the Inc() done in
 										// Head.Appender, so do it here to avoid leaking the gauge.
 										// NOTE(review): assumes the wrapped appender decrements the gauge
 										// itself when it is rolled back — confirm against headAppender.
 										a.head.metrics.activeAppenders.Dec()
 										return nil
 									}
 									return a.app.Rollback()
 								}
 								// Appender returns a new Appender on the database.
-												Unify Iterator interfaces. All point to storage now.

This is part of https://github.com/prometheus/prometheus/pull/5882 that can be done to simplify things.
All todos I added will be fixed in follow up PRs.

* querier.Querier, querier.Appender, querier.SeriesSet, and querier.Series interfaces merged
with storage interface.go. All imports that.
* querier.SeriesIterator replaced by chunkenc.Iterator
* Added chunkenc.Iterator.Seek method and tests for xor implementation (?)
* Since we properly handle SelectParams for Select methods I adjusted min max
based on that. This should help in terms of performance for queries with functions like offset.
* added Seek to deletedIterator and test.
* storage/tsdb was removed as it was only a unnecessary glue with incompatible structs.

No logic was changed, only different source of abstractions, so no need for benchmarks.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

											
										
										
											2020-02-06 07:58:38 -08:00
+								func (h *Head) Appender() storage.Appender {
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									h.metrics.activeAppenders.Inc()
 									// The head cache might not have a starting point yet. The init appender
 									// picks up the first appended timestamp as the base.
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 14:19:32 -07:00
+									if h.MinTime() == math.MaxInt64 {
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+										return &initAppender{
-												TSDB: Isolation: avoid creating appenderId's without appender (#7135)

Prior to this commit we could have situations where we are creating an
appenderId but never creating an appender to go with it, therefore
blocking the low watermak.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-04-17 11:51:03 -07:00
+											head: h,
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+										}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									}
-												TSDB: Isolation: avoid creating appenderId's without appender (#7135)

Prior to this commit we could have situations where we are creating an
appenderId but never creating an appender to go with it, therefore
blocking the low watermark.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-04-17 11:51:03 -07:00
+									return h.appender()
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+								}
-												TSDB: Isolation: avoid creating appenderId's without appender (#7135)

Prior to this commit we could have situations where we are creating an
appenderId but never creating an appender to go with it, therefore
blocking the low watermark.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-04-17 11:51:03 -07:00
+								func (h *Head) appender() *headAppender {
 									appendID := h.iso.newAppendID()
 									cleanupAppendIDsBelow := h.iso.lowWatermark()
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									return &headAppender{
-												no overlapping on compaction when an existing block is not within default boundaries. (#461)

closes https://github.com/prometheus/prometheus/issues/4643

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-12-04 02:30:49 -08:00
+										head: h,
-												Spelling (#6517)

* spelling: alertmanager

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: attributes

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: autocomplete

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: bootstrap

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: caught

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: chunkenc

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: compaction

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: corrupted

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: deletable

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: expected

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: fine-grained

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: initialized

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: iteration

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: javascript

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: multiple

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: number

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: overlapping

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: possible

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: postings

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: procedure

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: programmatic

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: queuing

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: querier

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: repairing

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: received

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: reproducible

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: retention

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: sample

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: segements

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: semantic

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: software [LICENSE]

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: staging

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: timestamp

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: unfortunately

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: uvarint

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: subsequently

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

* spelling: ressamples

Signed-off-by: Josh Soref <jsoref@users.noreply.github.com>

											
										
										
											2020-01-02 06:54:09 -08:00
+										// Set the minimum valid time to whichever is greater: the head min valid time or the compaction window.
-												no overlapping on compaction when an existing block is not within default boundaries. (#461)

closes https://github.com/prometheus/prometheus/issues/4643

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-12-04 02:30:49 -08:00
+										// This ensures that no samples will be added within the compaction window to avoid races.
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+										minValidTime:          max(atomic.LoadInt64(&h.minValidTime), h.MaxTime()-h.chunkRange/2),
 										mint:                  math.MaxInt64,
 										maxt:                  math.MinInt64,
 										samples:               h.getAppendBuffer(),
 										sampleSeries:          h.getSeriesBuffer(),
 										appendID:              appendID,
 										cleanupAppendIDsBelow: cleanupAppendIDsBelow,
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									}
 								}
-												no overlapping on compaction when an existing block is not within default boundaries. (#461)

closes https://github.com/prometheus/prometheus/issues/4643

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-12-04 02:30:49 -08:00
+								func max(a, b int64) int64 {
 									if a > b {
 										return a
 									}
 									return b
 								}
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+								func (h *Head) getAppendBuffer() []record.RefSample {
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									b := h.appendPool.Get()
 									if b == nil {
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+										return make([]record.RefSample, 0, 512)
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									}
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+									return b.([]record.RefSample)
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+								}
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+								func (h *Head) putAppendBuffer(b []record.RefSample) {
-												fix statick check errors (#475)

fix the tests for `check_license` and `staticcheck`

the static check also found some actual bugs.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2019-01-02 08:48:42 -08:00
+									//lint:ignore SA6002 safe to ignore and actually fixing it has some performance penalty.
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									h.appendPool.Put(b[:0])
 								}
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+								func (h *Head) getSeriesBuffer() []*memSeries {
 									b := h.seriesPool.Get()
 									if b == nil {
 										return make([]*memSeries, 0, 512)
 									}
 									return b.([]*memSeries)
 								}
 								// putSeriesBuffer hands b back to h.seriesPool. The slice is truncated to
 								// zero length first, so its backing array is kept for reuse while the old
 								// contents are no longer visible through the pooled slice.
 								func (h *Head) putSeriesBuffer(b []*memSeries) {
 									emptied := b[:0]
 									//lint:ignore SA6002 safe to ignore and actually fixing it has some performance penalty.
 									h.seriesPool.Put(emptied)
 								}
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+								func (h *Head) getBytesBuffer() []byte {
 									b := h.bytesPool.Get()
 									if b == nil {
 										return make([]byte, 0, 1024)
 									}
 									return b.([]byte)
 								}
 								func (h *Head) putBytesBuffer(b []byte) {
-												fix statick check errors (#475)

fix the tests for `check_license` and `staticcheck`

the static check also found some actual bugs.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2019-01-02 08:48:42 -08:00
+									//lint:ignore SA6002 safe to ignore and actually fixing it has some performance penalty.
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+									h.bytesPool.Put(b[:0])
 								}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+								type headAppender struct {
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 14:19:32 -07:00
+									head         *Head
 									minValidTime int64 // No samples below this timestamp are allowed.
 									mint, maxt   int64
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+									series       []record.RefSeries
 									samples      []record.RefSample
 									sampleSeries []*memSeries
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
 									appendID, cleanupAppendIDsBelow uint64
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+								}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+								func (a *headAppender) Add(lset labels.Labels, t int64, v float64) (uint64, error) {
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 14:19:32 -07:00
+									if t < a.minValidTime {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										a.head.metrics.outOfBoundSamples.Inc()
-												Make TSDB use storage errors

This fixes #6992, which was introduced by #6777. There was an
intermediate component which translated TSDB errors into storage errors,
but that component was deleted and this bug went unnoticed, until we
were looking at the Prombench results. Without this, scrape will fail
instead of dropping samples or using "Add" when the series have been
garbage collected.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-16 14:52:02 -07:00
+										return 0, storage.ErrOutOfBounds
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									}
-												Correctly handle empty labels. (#594)

Currently a time series with empty labels is not treated the same
as one with missing labels. Currently this can only come from
ALERTS&ALERT_FOR_STATE so it's unlikely anyone has actually hit it.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-05-07 03:00:16 -07:00
+									// Ensure no empty labels have gotten through.
 									lset = lset.WithoutEmpty()
-												tsdb: don't allow ingesting empty labelsets (#6891)

* tsdb: don't allow ingesting empty labelsets

When we ingest an empty labelset in the head, further blocks can not be
compacted, with the error:

```
level=error ts=2020-02-27T21:26:58.379Z caller=db.go:659 component=tsdb
msg="compaction failed" err="persist head block: write compaction:
add series: out-of-order series added with label set \"{}\" / prev:
\"{}\""
```

We should therefore reject those invalid empty labelsets upfront.

This can be reproduced with the following:

```
cat << END > prometheus.yml
scrape_configs:
  - job_name: 'prometheus'
    scrape_interval: 1s
    basic_auth:
      username: test
      password: test
    metric_relabel_configs:
    - regex: ".*"
      action: labeldrop

    static_configs:
    - targets:
      - 127.0.1.1:9090
END
./prometheus --storage.tsdb.min-block-duration=1m
```
And wait a few minutes.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-01 23:18:05 -08:00
+									if len(lset) == 0 {
 										return 0, errors.Wrap(ErrInvalidSample, "empty labelset")
 									}
-												tsdb: error on series with duplicate labels (#6664)

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-01-20 03:05:27 -08:00
+									if l, dup := lset.HasDuplicateLabelNames(); dup {
 										return 0, errors.Wrap(ErrInvalidSample, fmt.Sprintf(`label name "%s" is not unique`, l))
 									}
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+									s, created, err := a.head.getOrCreate(lset.Hash(), lset)
 									if err != nil {
 										return 0, err
 									}
-												Simplify series create logic in head

											
										
										
											2017-09-18 03:28:56 -07:00
+									if created {
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+										a.series = append(a.series, record.RefSeries{
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+											Ref:    s.ref,
 											Labels: lset,
 										})
 									}
 									return s.ref, a.AddFast(s.ref, t, v)
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+								}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+								func (a *headAppender) AddFast(ref uint64, t int64, v float64) error {
-												Fix race condition between gc and committing (#378)

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>
											
										
										
											2018-09-17 09:58:42 -07:00
+									if t < a.minValidTime {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										a.head.metrics.outOfBoundSamples.Inc()
-												Make TSDB use storage errors

This fixes #6992, which was introduced by #6777. There was an
intermediate component which translated TSDB errors into storage errors,
but that component was deleted and this bug went unnoticed, until we
were looking at the Prombench results. Without this, scrape will fail
instead of dropping samples or using "Add" when the series have been
garbage collected.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-16 14:52:02 -07:00
+										return storage.ErrOutOfBounds
-												Fix race condition between gc and committing (#378)

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>
											
										
										
											2018-09-17 09:58:42 -07:00
+									}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
-												Fix race condition between gc and committing (#378)

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>
											
										
										
											2018-09-17 09:58:42 -07:00
+									s := a.head.series.getByID(ref)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									if s == nil {
-												Make TSDB use storage errors

This fixes #6992, which was introduced by #6777. There was an
intermediate component which translated TSDB errors into storage errors,
but that component was deleted and this bug went unnoticed, until we
were looking at the Prombench results. Without this, scrape will fail
instead of dropping samples or using "Add" when the series have been
garbage collected.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-16 14:52:02 -07:00
+										return errors.Wrap(storage.ErrNotFound, "unknown series")
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									}
-												Fix various races

											
										
										
											2017-09-07 23:48:19 -07:00
+									s.Lock()
-												Fix race condition between gc and committing (#378)

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>
											
										
										
											2018-09-17 09:58:42 -07:00
+									if err := s.appendable(t, v); err != nil {
 										s.Unlock()
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										if err == storage.ErrOutOfOrderSample {
 											a.head.metrics.outOfOrderSamples.Inc()
 										}
-												Finish old WAL segment async, default to no fsync

We were still fsyncing while holding the write lock when we cut a new
segment. Given we cannot do anything but logging errors, we might just
as well complete segments asynchronously.

There's no realistic use case where one would fsync after every WAL
entry, thus make the default flush interval of 0 mean never fsync,
which is a much more likely use case.

											
										
										
											2017-09-08 01:12:28 -07:00
+										return err
 									}
-												Fix race condition between gc and committing (#378)

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>
											
										
										
											2018-09-17 09:58:42 -07:00
+									s.pendingCommit = true
 									s.Unlock()
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 14:19:32 -07:00
+									if t < a.mint {
 										a.mint = t
 									}
-												head: Rename highTimestamp to maxt

`maxt` seems more consistent with `mint` and other uses of `maxt`
elsewhere in the code, if I've understood the intent correctly.

											
										
										
											2018-02-21 08:01:12 -08:00
+									if t > a.maxt {
 										a.maxt = t
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									}
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+									a.samples = append(a.samples, record.RefSample{
 										Ref: ref,
 										T:   t,
 										V:   v,
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									})
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+									a.sampleSeries = append(a.sampleSeries, s)
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									return nil
 								}
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+								func (a *headAppender) log() error {
 									if a.head.wal == nil {
 										return nil
 									}
 									buf := a.head.getBytesBuffer()
 									defer func() { a.head.putBytesBuffer(buf) }()
 									var rec []byte
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+									var enc record.Encoder
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
 									if len(a.series) > 0 {
 										rec = enc.Series(a.series, buf)
 										buf = rec[:0]
 										if err := a.head.wal.Log(rec); err != nil {
 											return errors.Wrap(err, "log series")
 										}
 									}
 									if len(a.samples) > 0 {
 										rec = enc.Samples(a.samples, buf)
 										buf = rec[:0]
 										if err := a.head.wal.Log(rec); err != nil {
 											return errors.Wrap(err, "log samples")
 										}
 									}
 									return nil
 								}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+								func (a *headAppender) Commit() error {
-												tsdb: Do a full rollback upon commit error

I think the previous behavior is problematic as it will leave
`memSeries` around that still have `pendingCommit` set to `true`.

The only case where this can happen in this code path is a failure to
write to the WAL, in which case we are probably in trouble anyway. I
believe, however, we should still try to do the right thing and do the
full rollback. This will implicitly try to write to the WAL again, but
this time without samples, which may even succeed. (But we propagate
the previous error in any case.)

This also adds `a.head.putSeriesBuffer(a.sampleSeries)` to Rollback,
which was previously missing.

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2020-03-09 10:24:18 -07:00
+									if err := a.log(); err != nil {
 										//nolint: errcheck
 										a.Rollback() // Most likely the same error will happen again.
 										return errors.Wrap(err, "write to WAL")
 									}
-												Log series on rollback

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-06-28 06:04:07 -07:00
+									defer a.head.metrics.activeAppenders.Dec()
 									defer a.head.putAppendBuffer(a.samples)
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+									defer a.head.putSeriesBuffer(a.sampleSeries)
-												Defer call to iso.closeAppend

This is taken from #6918. Since we probably won't merge #6918 before
the relase, we have to do this bit of it as it fixes an actual bug
(iso.closeAppend is not called if the append fails because of an error
logging to the WAL).

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2020-03-04 07:16:05 -08:00
+									defer a.head.iso.closeAppend(a.appendID)
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									total := len(a.samples)
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+									var series *memSeries
 									for i, s := range a.samples {
 										series = a.sampleSeries[i]
 										series.Lock()
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmapped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmapping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmapped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										ok, chunkCreated := series.append(s.T, s.V, a.appendID, a.head.chunkDiskMapper)
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+										series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+										series.pendingCommit = false
 										series.Unlock()
-												Fix various races

											
										
										
											2017-09-07 23:48:19 -07:00
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+										if !ok {
 											total--
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmapped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmapping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmapped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+											a.head.metrics.outOfOrderSamples.Inc()
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+										}
 										if chunkCreated {
 											a.head.metrics.chunks.Inc()
 											a.head.metrics.chunksCreated.Inc()
 										}
 									}
 									a.head.metrics.samplesAppended.Add(float64(total))
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 14:19:32 -07:00
+									a.head.updateMinMaxTime(a.mint, a.maxt)
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
 									return nil
 								}
 								func (a *headAppender) Rollback() error {
-												Move crucial actions to defer (#6918)

With defer having less of a performance penalty, there is no reason
not to do those crucial operations via defer.

Context: With isolation in place, if we forget to Commit/Rollback, the
low watermark will get stuck forever.

The current code should not have any bugs, but moving to defer helps
to avoid future bugs.

This is also moving the `closeAppend` in the `Commit` implementation
itself to defer. If logging to the WAL fails, we would have missed the
`closeAppend`.

Signed-off-by: beorn7 <beorn@grafana.com>
											
										
										
											2020-03-13 12:54:47 -07:00
+									defer a.head.metrics.activeAppenders.Dec()
 									defer a.head.iso.closeAppend(a.appendID)
 									defer a.head.putSeriesBuffer(a.sampleSeries)
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+									var series *memSeries
 									for i := range a.samples {
 										series = a.sampleSeries[i]
 										series.Lock()
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+										series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+										series.pendingCommit = false
 										series.Unlock()
-												Fix race condition between gc and committing (#378)

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>
											
										
										
											2018-09-17 09:58:42 -07:00
+									}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									a.head.putAppendBuffer(a.samples)
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+									a.samples = nil
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
-												Log series on rollback

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-06-28 06:04:07 -07:00
+									// Series are created in the head memory regardless of rollback. Thus we have
 									// to log them to the WAL in any case.
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+									return a.log()
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+								}
 								// Delete all samples in the range of [mint, maxt] for series that satisfy the given
 								// label matchers.
-												Port tsdb to use pkg/labels. (#6326)

* Port tsdb to use pkg/labels.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Get tests passing.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Remove useless cast.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Appease linters.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-11-18 11:53:33 -08:00
+								func (h *Head) Delete(mint, maxt int64, ms ...*labels.Matcher) error {
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									// Do not delete anything beyond the currently valid range.
 									mint, maxt = clampInterval(mint, maxt, h.MinTime(), h.MaxTime())
 									ir := h.indexRange(mint, maxt)
-												Select series with label unset for != and !~

Fixes https://github.com/prometheus/prometheus/issues/3575

Signed-off-by: Goutham Veeramachaneni <cs14btech11014@iith.ac.in>
Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2017-12-17 10:08:21 -08:00
+									p, err := PostingsForMatchers(ir, ms...)
-												Add explicit error to Querier.Select

This has been a frequent source of debugging pain since errors are
potentially delayed to a much later point. They bubble up in an
unrelated execution path.

											
										
										
											2017-11-13 03:16:58 -08:00
+									if err != nil {
 										return errors.Wrap(err, "select series")
 									}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+									var stones []tombstones.Stone
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									for p.Next() {
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+										series := h.series.getByID(p.At())
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
-												tsdb: fix races around head chunks (#6985)

* tsdb: fix races around head chunks

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-03-16 05:59:22 -07:00
+										series.RLock()
-												Fix crash when a series has no block

											
										
										
											2018-02-07 05:43:21 -08:00
+										t0, t1 := series.minTime(), series.maxTime()
-												tsdb: fix races around head chunks (#6985)

* tsdb: fix races around head chunks

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-03-16 05:59:22 -07:00
+										series.RUnlock()
-												Fix crash when a series has no block

											
										
										
											2018-02-07 05:43:21 -08:00
+										if t0 == math.MinInt64 || t1 == math.MinInt64 {
 											continue
 										}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+										// Delete only until the current values and not beyond.
-												Fix crash when a series has no block

											
										
										
											2018-02-07 05:43:21 -08:00
+										t0, t1 = clampInterval(mint, maxt, t0, t1)
-												Bring back tombstones to Head block (#6542)

* Bring back tombstones to Head block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add test cases

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Cleanup

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-01-20 07:38:00 -08:00
+										stones = append(stones, tombstones.Stone{Ref: p.At(), Intervals: tombstones.Intervals{{Mint: t0, Maxt: t1}}})
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									}
 									if p.Err() != nil {
 										return p.Err()
 									}
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+									if h.wal != nil {
-												Bring back tombstones to Head block (#6542)

* Bring back tombstones to Head block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add test cases

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Cleanup

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-01-20 07:38:00 -08:00
+										var enc record.Encoder
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+										if err := h.wal.Log(enc.Tombstones(stones, nil)); err != nil {
 											return err
 										}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									}
-												Bring back tombstones to Head block (#6542)

* Bring back tombstones to Head block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add test cases

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Cleanup

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-01-20 07:38:00 -08:00
+									for _, s := range stones {
 										h.tombstones.AddInterval(s.Ref, s.Intervals[0])
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									}
-												Dont store stones in head, delete samples directly

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-01-08 09:08:41 -08:00
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									return nil
 								}
-												Fix typos in comments (#254)

a the -> the
timestmap -> timestamp
badded -> padded
its -> it is
callers -> caller's
											
										
										
											2018-01-13 09:51:50 -08:00
+								// gc removes data before the minimum timestamp from the head.
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+								func (h *Head) gc() {
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+									// Only data strictly lower than this timestamp must be deleted.
 									mint := h.MinTime()
-												Properly track and write meta file

											
										
										
											2017-01-19 05:01:38 -08:00
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									// Drop old chunks and remember series IDs and hashes if they can be
 									// deleted entirely.
 									deleted, chunksRemoved := h.series.gc(mint)
 									seriesRemoved := len(deleted)
-												Trigger reload correctly on interrupted compaction

											
										
										
											2017-03-20 02:41:43 -07:00
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									h.metrics.seriesRemoved.Add(float64(seriesRemoved))
 									h.metrics.chunksRemoved.Add(float64(chunksRemoved))
 									h.metrics.chunks.Sub(float64(chunksRemoved))
-												fix spelling mistakes in docs (#5952)

Signed-off-by: hwdef <hwdef97@gmail.com>
											
										
										
											2019-08-27 10:33:40 -07:00
+									// Using AddUint64 to subtract series removed.
-												Open db in Read only mode (#588)

* Added db read only open mode and use it for the tsdb cli.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2019-07-23 01:04:48 -07:00
+									// See: https://golang.org/pkg/sync/atomic/#AddUint64.
 									atomic.AddUint64(&h.numSeries, ^uint64(seriesRemoved-1))
-												Add separate head mutex

Introduce a separate mutex for the head blocks to avoid a race where
a post-compaction reload may run between switching the DB's base mutex
to create a new head block in an appender.

											
										
										
											2017-03-04 07:50:48 -08:00
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+									// Remove deleted series IDs from the postings lists.
 									h.postings.Delete(deleted)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
-												Keep series that are still in WAL in checkpoints (#577)

If all the samples are deleted for a series,
we should still keep the series in the WAL as
anything else reading the WAL will still care
about it in order to understand the samples.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-04-09 06:16:24 -07:00
+									if h.wal != nil {
 										_, last, _ := h.wal.Segments()
 										h.deletedMtx.Lock()
 										// Keep series records until we're past segment 'last'
 										// because the WAL will still have samples records with
 										// this ref ID. If we didn't keep these series records then
 										// on start up when we replay the WAL, or any other code
 										// that reads the WAL, wouldn't be able to use those
 										// samples since we would have no labels for that ref ID.
 										for ref := range deleted {
 											h.deleted[ref] = last
 										}
 										h.deletedMtx.Unlock()
 									}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									// Rebuild symbols and label value indices from what is left in the postings terms.
-												Make Head.symbols map with size hint (#552)

To reduce the number of times the map is resized

Signed-off-by: zhulongcheng <zhulongcheng.me@gmail.com>
											
										
										
											2019-03-20 01:43:07 -07:00
+									symbols := make(map[string]struct{}, len(h.symbols))
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+									values := make(map[string]stringset, len(h.values))
-												Initial implementation of HeadBlock Snapshots

Signed-off-by: Goutham Veeramachaneni <cs14btech11014@iith.ac.in>

											
										
										
											2017-06-05 01:18:31 -07:00
-												Handle a bunch of unchecked errors (#365)

As discovered by "gosec".

Signed-off-by: Julius Volz <julius.volz@gmail.com>
											
										
										
											2018-09-20 01:33:52 -07:00
+									if err := h.postings.Iter(func(t labels.Label, _ index.Postings) error {
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+										symbols[t.Name] = struct{}{}
 										symbols[t.Value] = struct{}{}
-												Persist series without allocating the full set

Change index persistence for series to not be accumulated in memory
before being written as one large batch. `Labels` and `ChunkMeta`
objects are reused.
This cuts down memory spikes during compaction of multiple blocks
significantly.

As part of this, the Index{Reader,Writer} now have an explicit notion of
symbols and series must be inserted in order.

											
										
										
											2017-08-05 04:31:48 -07:00
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+										ss, ok := values[t.Name]
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+										if !ok {
 											ss = stringset{}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+											values[t.Name] = ss
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+										}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+										ss.set(t.Value)
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+										return nil
-												Handle a bunch of unchecked errors (#365)

As discovered by "gosec".

Signed-off-by: Julius Volz <julius.volz@gmail.com>
											
										
										
											2018-09-20 01:33:52 -07:00
+									}); err != nil {
 										// This should never happen, as the iteration function only returns nil.
 										panic(err)
 									}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
 									h.symMtx.Lock()
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+									h.symbols = symbols
 									h.values = values
-												Add various metrics

											
										
										
											2017-08-30 08:38:25 -07:00
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									h.symMtx.Unlock()
-												Add Queryable interface to Block

This adds the Queryable interface to the Block interface. Head and
persisted blocks now implement their own Querier() method and thus
isolate customization (e.g. remapPostings) more cleanly.

											
										
										
											2017-03-20 02:21:21 -07:00
+								}
-												Add more verbose error handling for closing, reduce locking

This commit introduces error returns in various places and is explicit
about closing persisted blocks.
{Index,Chunk,Tombstone}Readers are more consistent about their Close()
method. Whenever a reader is retrieved, the corresponding close method
must eventually be called. We use this to track pending readers against
persisted blocks.

Querier's against the DB no longer hold a read lock for their entire
lifecycle. This avoids long running queriers to starve new ones when we
have to acquire a write lock when reloading blocks.

											
										
										
											2017-10-09 06:21:46 -07:00
+								// Tombstones returns a new reader over the head's tombstones
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 02:15:41 -07:00
+								func (h *Head) Tombstones() (tombstones.Reader, error) {
-												Bring back tombstones to Head block (#6542)

* Bring back tombstones to Head block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add test cases

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Cleanup

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-01-20 07:38:00 -08:00
+									return h.tombstones, nil
-												Compact head block early

Let older head blocks be compacted once the newest one has samples at
50% of its total range. This allows the memory of the compacted blocks
to be released and garbage collected before a new head block gets
created. Thereby the number of head blocks is 1 or 2 instead of 2 or 3
and memory spikes are reduced.

											
										
										
											2017-06-25 10:02:02 -07:00
+								}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+								// Index returns an IndexReader against the block.
-												Revert head posting optimization

This reverts commit 52630ad0c735f2dce4ce5bb851acb6c5d7df5eb1.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-25 12:13:47 -07:00
+								func (h *Head) Index() (IndexReader, error) {
 									return h.indexRange(math.MinInt64, math.MaxInt64), nil
-												Add composed Block interfaces, remove head generation

This adds more lower-level interfaces which are used to compose
to different Block interfaces.
The DB only uses interfaces instead of explicit persistedBlock and
headBlock. The headBlock generation property is dropped as the use-case
can be implemented using block sequence numbers.

											
										
										
											2017-03-20 00:41:56 -07:00
+								}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+								func (h *Head) indexRange(mint, maxt int64) *headIndexReader {
 									if hmin := h.MinTime(); hmin > mint {
 										mint = hmin
-												Use buffer pool for head appenders

											
										
										
											2017-01-12 11:00:36 -08:00
+									}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+									return &headIndexReader{head: h, mint: mint, maxt: maxt}
-												Expose series references to clients

This exposes a reference number of a series represented by a label set
to clients.
Subsequent samples can be directly added via the reference rather than
repeatedly passing in the full labels. This drastically speeds up the
append process.

The appender chain uses different sections of the reference number for
assignment to child appenders and invalidating reference numbers as
necessary.

Clients can either pass out reference numbers themselves or have their
own optimized lookup, i.e. by directly associating unparsed metric
descriptors strings with reference numbers.

											
										
										
											2017-01-12 10:18:51 -08:00
+								}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+								// Chunks returns a ChunkReader against the block.
-												Add more verbose error handling for closing, reduce locking

This commit introduces error returns in various places and is explicit
about closing persisted blocks.
{Index,Chunk,Tombstone}Readers are more consistent about their Close()
method. Whenever a reader is retrieved, the corresponding close method
must eventually be called. We use this to track pending readers against
persisted blocks.

Querier's against the DB no longer hold a read lock for their entire
lifecycle. This avoids long running queriers to starve new ones when we
have to acquire a write lock when reloading blocks.

											
										
										
											2017-10-09 06:21:46 -07:00
+								func (h *Head) Chunks() (ChunkReader, error) {
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 02:03:23 -07:00
+									return h.chunksRange(math.MinInt64, math.MaxInt64, h.iso.State())
-												Expose series references to clients

This exposes a reference number of a series represented by a label set
to clients.
Subsequent samples can be directly added via the reference rather than
repeatedly passing in the full labels. This drastically speeds up the
append process.

The appender chain uses different sections of the reference number for
assignment to child appenders and invalidating reference numbers as
necessary.

Clients can either pass out reference numbers themselves or have their
own optimized lookup, i.e. by directly associating unparsed metric
descriptors strings with reference numbers.

											
										
										
											2017-01-12 10:18:51 -08:00
+								}
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 02:03:23 -07:00
+								func (h *Head) chunksRange(mint, maxt int64, is *isolationState) (*headChunkReader, error) {
 									h.closedMtx.Lock()
 									defer h.closedMtx.Unlock()
 									if h.closed {
 										return nil, errors.New("can't read from a closed head")
 									}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+									if hmin := h.MinTime(); hmin > mint {
 										mint = hmin
-												Expose series references to clients

This exposes a reference number of a series represented by a label set
to clients.
Subsequent samples can be directly added via the reference rather than
repeatedly passing in the full labels. This drastically speeds up the
append process.

The appender chain uses different sections of the reference number for
assignment to child appenders and invalidating reference numbers as
necessary.

Clients can either pass out reference numbers themselves or have their
own optimized lookup, i.e. by directly associating unparsed metric
descriptors strings with reference numbers.

											
										
										
											2017-01-12 10:18:51 -08:00
+									}
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+									return &headChunkReader{
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										head:         h,
 										mint:         mint,
 										maxt:         maxt,
 										isoState:     is,
 										memChunkPool: &h.memChunkPool,
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 02:03:23 -07:00
+									}, nil
-												Expose series references to clients

This exposes a reference number of a series represented by a label set
to clients.
Subsequent samples can be directly added via the reference rather than
repeatedly passing in the full labels. This drastically speeds up the
append process.

The appender chain uses different sections of the reference number for
assignment to child appenders and invalidating reference numbers as
necessary.

Clients can either pass out reference numbers themselves or have their
own optimized lookup, i.e. by directly associating unparsed metric
descriptors strings with reference numbers.

											
										
										
											2017-01-12 10:18:51 -08:00
+								}
-												Open db in Read only mode (#588)

* Added db read only open mode and use it for the tsdb cli.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2019-07-23 01:04:48 -07:00
+								// NumSeries returns the number of active series in the head.
 								func (h *Head) NumSeries() uint64 {
 									// numSeries is one of the 64-bit Head fields that is accessed atomically.
 									count := atomic.LoadUint64(&h.numSeries)
 									return count
 								}
 								// Meta returns meta information about the head.
 								// The head is dynamic so will return dynamic results.
 								func (h *Head) Meta() BlockMeta {
 									// The head is identified by a fixed 16-byte placeholder rather than a generated ULID.
 									var headID [16]byte
 									copy(headID[:], "______head______")
 									// Time bounds are read first, then the series count, matching the field order.
 									meta := BlockMeta{
 										MinTime: h.MinTime(),
 										MaxTime: h.MaxTime(),
 										ULID:    ulid.ULID(headID),
 									}
 									meta.Stats = BlockStats{NumSeries: h.NumSeries()}
 									return meta
 								}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+								// MinTime returns the lowest time bound on visible data in the head.
 								func (h *Head) MinTime() int64 {
 									return atomic.LoadInt64(&h.minTime)
-												Expose series references to clients

This exposes a reference number of a series represented by a label set
to clients.
Subsequent samples can be directly added via the reference rather than
repeatedly passing in the full labels. This drastically speeds up the
append process.

The appender chain uses different sections of the reference number for
assignment to child appenders and invalidating reference numbers as
necessary.

Clients can either pass out reference numbers themselves or have their
own optimized lookup, i.e. by directly associating unparsed metric
descriptors strings with reference numbers.

											
										
										
											2017-01-12 10:18:51 -08:00
+								}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+								// MaxTime returns the highest timestamp seen in data of the head.
 								func (h *Head) MaxTime() int64 {
 									return atomic.LoadInt64(&h.maxTime)
-												Expose series references to clients

This exposes a reference number of a series represented by a label set
to clients.
Subsequent samples can be directly added via the reference rather than
repeatedly passing in the full labels. This drastically speeds up the
append process.

The appender chain uses different sections of the reference number for
assignment to child appenders and invalidating reference numbers as
necessary.

Clients can either pass out reference numbers themselves or have their
own optimized lookup, i.e. by directly associating unparsed metric
descriptors strings with reference numbers.

											
										
										
											2017-01-12 10:18:51 -08:00
+								}
-												Add Head.compactable method (#542)

* Add Head.compactable method

Signed-off-by: zhulongcheng <zhulongcheng.me@gmail.com>
											
										
										
											2019-04-01 01:19:06 -07:00
+								// compactable returns whether the head has a compactable range.
 								// The head has a compactable range when the head time range is 1.5 times the chunk range.
 								// The 0.5 acts as a buffer of the appendable window.
 								func (h *Head) compactable() bool {
 									// Time span currently covered by the head.
 									span := h.MaxTime() - h.MinTime()
 									// 1.5 times the chunk range in integer arithmetic: halve first, then triple.
 									threshold := h.chunkRange / 2 * 3
 									return span > threshold
 								}
-												Close WAL when closing the DB

Also, the `wal` field of the `DB` was not used anywhere, so this removes
it.

											
										
										
											2017-11-10 12:19:39 -08:00
+								// Close flushes the WAL and closes the head.
 								func (h *Head) Close() error {
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 02:03:23 -07:00
+									h.closedMtx.Lock()
 									defer h.closedMtx.Unlock()
 									h.closed = true
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									var merr tsdb_errors.MultiError
 									merr.Add(h.chunkDiskMapper.Close())
 									if h.wal != nil {
 										merr.Add(h.wal.Close())
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 06:04:32 -07:00
+									}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									return merr.Err()
-												Close WAL when closing the DB

Also, the `wal` field of the `DB` was not used anywhere, so this removes
it.

											
										
										
											2017-11-10 12:19:39 -08:00
+								}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+								type headChunkReader struct {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									head         *Head
 									mint, maxt   int64
 									isoState     *isolationState
 									memChunkPool *sync.Pool
-												Expose series references to clients

This exposes a reference number of a series represented by a label set
to clients.
Subsequent samples can be directly added via the reference rather than
repeatedly passing in the full labels. This drastically speeds up the
append process.

The appender chain uses different sections of the reference number for
assignment to child appenders and invalidating reference numbers as
necessary.

Clients can either pass out reference numbers themselves or have their
own optimized lookup, i.e. by directly associating unparsed metric
descriptors strings with reference numbers.

											
										
										
											2017-01-12 10:18:51 -08:00
+								}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+								func (h *headChunkReader) Close() error {
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+									h.isoState.Close()
-												Expose series references to clients

This exposes a reference number of a series represented by a label set
to clients.
Subsequent samples can be directly added via the reference rather than
repeatedly passing in the full labels. This drastically speeds up the
append process.

The appender chain uses different sections of the reference number for
assignment to child appenders and invalidating reference numbers as
necessary.

Clients can either pass out reference numbers themselves or have their
own optimized lookup, i.e. by directly associating unparsed metric
descriptors strings with reference numbers.

											
										
										
											2017-01-12 10:18:51 -08:00
+									return nil
 								}
-												Change series ID from uint32 to uint64

											
										
										
											2017-09-04 07:08:38 -07:00
+								// packChunkID packs a seriesID and a chunkID within it into a global 8 byte ID.
 								// It panics if the seriesID exceeds 5 bytes or the chunk ID 3 bytes.
 								func packChunkID(seriesID, chunkID uint64) uint64 {
 									const (
 										maxSeriesID = 1<<40 - 1 // largest value representable in 5 bytes
 										maxChunkID  = 1<<24 - 1 // largest value representable in 3 bytes
 									)
 									// The series check comes first, so it wins when both inputs are out of range.
 									switch {
 									case seriesID > maxSeriesID:
 										panic("series ID exceeds 5 bytes")
 									case chunkID > maxChunkID:
 										panic("chunk ID exceeds 3 bytes")
 									}
 									// Series ID occupies the upper 40 bits, chunk ID the lower 24.
 									return seriesID<<24 | chunkID
 								}
 								// unpackChunkID splits a packed 8 byte ID into its series ID (upper 5 bytes)
 								// and chunk ID (lower 3 bytes).
 								func unpackChunkID(id uint64) (seriesID, chunkID uint64) {
 									const chunkIDMask = 1<<24 - 1
 									seriesID = id >> 24
 									chunkID = id & chunkIDMask
 									return seriesID, chunkID
 								}
-												Add stats serialization, load querier of all blocks

											
										
										
											2016-12-15 07:14:33 -08:00
+								// Chunk returns the chunk for the reference number.
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+								func (h *headChunkReader) Chunk(ref uint64) (chunkenc.Chunk, error) {
-												Change series ID from uint32 to uint64

											
										
										
											2017-09-04 07:08:38 -07:00
+									sid, cid := unpackChunkID(ref)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
 									s := h.head.series.getByID(sid)
-												Make sure gc'ed chunks are handled properly

Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2017-12-13 12:58:21 -08:00
+									// This means that the series has been garbage collected.
 									if s == nil {
-												Make TSDB use storage errors

This fixes #6992, which was introduced by #6777. There was an
intermediate component which translated TSDB errors into storage errors,
but that component was deleted and this bug went unnoticed, until we
were looking at the Prombench results. Without this, scrape will fail
instead of dropping samples or using "Add" when the series have been
garbage collected.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-16 14:52:02 -07:00
+										return nil, storage.ErrNotFound
-												Make sure gc'ed chunks are handled properly

Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2017-12-13 12:58:21 -08:00
+									}
-												Use separate lock for series creation

This uses the head block's own lock to only lock if new series were
encountered.
In the general append case we just need to hold a

											
										
										
											2017-01-06 08:23:12 -08:00
-												Fix various races

											
										
										
											2017-09-07 23:48:19 -07:00
+									s.Lock()
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 02:03:23 -07:00
+									c, garbageCollect, err := s.chunk(int(cid), h.head.chunkDiskMapper)
 									if err != nil {
 										s.Unlock()
 										return nil, err
 									}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									defer func() {
 										if garbageCollect {
 											// Set this to nil so that Go GC can collect it after it has been used.
 											c.chunk = nil
 											h.memChunkPool.Put(c)
 										}
 									}()
-												Make sure gc'ed chunks are handled properly

Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2017-12-13 12:58:21 -08:00
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 02:03:23 -07:00
+									// This means that the chunk is outside the specified range.
 									if !c.OverlapsClosedInterval(h.mint, h.maxt) {
-												Make sure gc'ed chunks are handled properly

Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2017-12-13 12:58:21 -08:00
+										s.Unlock()
-												Make TSDB use storage errors

This fixes #6992, which was introduced by #6777. There was an
intermediate component which translated TSDB errors into storage errors,
but that component was deleted and this bug went unnoticed, until we
were looking at the Prombench results. Without this, scrape will fail
instead of dropping samples or using "Add" when the series have been
garbage collected.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-16 14:52:02 -07:00
+										return nil, storage.ErrNotFound
-												Make sure gc'ed chunks are handled properly

Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2017-12-13 12:58:21 -08:00
+									}
-												Fix various races

											
										
										
											2017-09-07 23:48:19 -07:00
+									s.Unlock()
-												Support multiple chunk files in read path

											
										
										
											2017-02-18 08:33:20 -08:00
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+									return &safeChunk{
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										Chunk:           c.chunk,
 										s:               s,
 										cid:             int(cid),
 										isoState:        h.isoState,
 										chunkDiskMapper: h.head.chunkDiskMapper,
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+									}, nil
-												Misc fixes for initial Prometheus integration

											
										
										
											2016-12-14 09:38:46 -08:00
+								}
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceding ones while samples
are added

											
										
										
											2017-01-09 07:51:39 -08:00
+								type safeChunk struct {
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+									chunkenc.Chunk
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									s               *memSeries
 									cid             int
 									isoState        *isolationState
 									chunkDiskMapper *chunks.ChunkDiskMapper
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceding ones while samples
are added

											
										
										
											2017-01-09 07:51:39 -08:00
+								}
-												Reuse Chunk Iterator (#642)

* Reset method for chunkenc.Iterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Reset method only for XORIterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Use Reset(...) in querier.go

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Reuse deletedIterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Another way of reusing chunk iterators

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Unexport xorIterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix memSeries.iterator(...)

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add some comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-07-09 02:49:34 -07:00
+								func (c *safeChunk) Iterator(reuseIter chunkenc.Iterator) chunkenc.Iterator {
-												Fix various races

											
										
										
											2017-09-07 23:48:19 -07:00
+									c.s.Lock()
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									it := c.s.iterator(c.cid, c.isoState, c.chunkDiskMapper, reuseIter)
-												Fix various races

											
										
										
											2017-09-07 23:48:19 -07:00
+									c.s.Unlock()
 									return it
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceding ones while samples
are added

											
										
										
											2017-01-09 07:51:39 -08:00
+								}
-												Unexport HeadBlock, export Block interface

											
										
										
											2017-01-10 06:28:22 -08:00
+								type headIndexReader struct {
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+									head       *Head
 									mint, maxt int64
 								}
 								func (h *headIndexReader) Close() error {
 									return nil
-												Persist series without allocating the full set

Change index persistence for series to not be accumulated in memory
before being written as one large batch. `Labels` and `ChunkMeta`
objects are reused.
This cuts down memory spikes during compaction of multiple blocks
significantly.

As part of this, the Index{Reader,Writer} now have an explicit notion of
symbols and series must be inserted in order.

											
										
										
											2017-08-05 04:31:48 -07:00
+								}
-												Stream symbols during compaction. (#6468)

Rather than buffer up symbols in RAM, do it one by one
during compaction. Then use the reader's symbol handling
for symbol lookups during the rest of the index write.

There is some slowdown in compaction, due to having to look through a file
rather than a hash lookup. This is noise to the overall cost of compacting
series with thousands of samples though.

benchmark                                                                                   old ns/op       new ns/op       delta
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4        539917175       675341565       +25.08%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4       2441815993      2477453524      +1.46%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4       3978543559      3922909687      -1.40%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4       8430219716      8586610007      +1.86%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4      1786424591      1909552782      +6.89%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4     5328998202      6020839950      +12.98%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4     10085059958     11085278690     +9.92%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4     25497010155     27018079806     +5.97%
BenchmarkCompactionFromHead/labelnames=1,labelvalues=100000-4                               2427391406      2817217987      +16.06%
BenchmarkCompactionFromHead/labelnames=10,labelvalues=10000-4                               2592965497      2538805050      -2.09%
BenchmarkCompactionFromHead/labelnames=100,labelvalues=1000-4                               2437388343      2668012858      +9.46%
BenchmarkCompactionFromHead/labelnames=1000,labelvalues=100-4                               2317095324      2787423966      +20.30%
BenchmarkCompactionFromHead/labelnames=10000,labelvalues=10-4                               2600239857      2096973860      -19.35%

benchmark                                                                                   old allocs     new allocs     delta
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4        500851         470794         -6.00%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4       821527         791451         -3.66%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4       1141562        1111508        -2.63%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4       2141576        2111504        -1.40%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4      871466         841424         -3.45%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4     1941428        1911415        -1.55%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4     3071573        3041510        -0.98%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4     6771648        6741509        -0.45%
BenchmarkCompactionFromHead/labelnames=1,labelvalues=100000-4                               731493         824888         +12.77%
BenchmarkCompactionFromHead/labelnames=10,labelvalues=10000-4                               793918         887311         +11.76%
BenchmarkCompactionFromHead/labelnames=100,labelvalues=1000-4                               811842         905204         +11.50%
BenchmarkCompactionFromHead/labelnames=1000,labelvalues=100-4                               832244         925081         +11.16%
BenchmarkCompactionFromHead/labelnames=10000,labelvalues=10-4                               921553         1019162        +10.59%

benchmark                                                                                   old bytes      new bytes      delta
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4        40532648       35698276       -11.93%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4       60340216       53409568       -11.49%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4       81087336       72065552       -11.13%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4       142485576      120878544      -15.16%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4      208661368      203831136      -2.31%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4     347345904      340484696      -1.98%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4     585185856      576244648      -1.53%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4     1357641792     1358966528     +0.10%
BenchmarkCompactionFromHead/labelnames=1,labelvalues=100000-4                               126486664      119666744      -5.39%
BenchmarkCompactionFromHead/labelnames=10,labelvalues=10000-4                               122323192      115117224      -5.89%
BenchmarkCompactionFromHead/labelnames=100,labelvalues=1000-4                               126404504      119469864      -5.49%
BenchmarkCompactionFromHead/labelnames=1000,labelvalues=100-4                               119047832      112230408      -5.73%
BenchmarkCompactionFromHead/labelnames=10000,labelvalues=10-4                               136576016      116634800      -14.60%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-12-17 11:49:54 -08:00
+								func (h *headIndexReader) Symbols() index.StringIter {
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									h.head.symMtx.RLock()
-												Stream symbols during compaction. (#6468)

Rather than buffer up symbols in RAM, do it one by one
during compaction. Then use the reader's symbol handling
for symbol lookups during the rest of the index write.

There is some slowdown in compaction, due to having to look through a file
rather than a hash lookup. This is noise to the overall cost of compacting
series with thousands of samples though.

benchmark                                                                                   old ns/op       new ns/op       delta
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4        539917175       675341565       +25.08%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4       2441815993      2477453524      +1.46%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4       3978543559      3922909687      -1.40%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4       8430219716      8586610007      +1.86%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4      1786424591      1909552782      +6.89%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4     5328998202      6020839950      +12.98%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4     10085059958     11085278690     +9.92%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4     25497010155     27018079806     +5.97%
BenchmarkCompactionFromHead/labelnames=1,labelvalues=100000-4                               2427391406      2817217987      +16.06%
BenchmarkCompactionFromHead/labelnames=10,labelvalues=10000-4                               2592965497      2538805050      -2.09%
BenchmarkCompactionFromHead/labelnames=100,labelvalues=1000-4                               2437388343      2668012858      +9.46%
BenchmarkCompactionFromHead/labelnames=1000,labelvalues=100-4                               2317095324      2787423966      +20.30%
BenchmarkCompactionFromHead/labelnames=10000,labelvalues=10-4                               2600239857      2096973860      -19.35%

benchmark                                                                                   old allocs     new allocs     delta
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4        500851         470794         -6.00%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4       821527         791451         -3.66%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4       1141562        1111508        -2.63%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4       2141576        2111504        -1.40%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4      871466         841424         -3.45%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4     1941428        1911415        -1.55%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4     3071573        3041510        -0.98%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4     6771648        6741509        -0.45%
BenchmarkCompactionFromHead/labelnames=1,labelvalues=100000-4                               731493         824888         +12.77%
BenchmarkCompactionFromHead/labelnames=10,labelvalues=10000-4                               793918         887311         +11.76%
BenchmarkCompactionFromHead/labelnames=100,labelvalues=1000-4                               811842         905204         +11.50%
BenchmarkCompactionFromHead/labelnames=1000,labelvalues=100-4                               832244         925081         +11.16%
BenchmarkCompactionFromHead/labelnames=10000,labelvalues=10-4                               921553         1019162        +10.59%

benchmark                                                                                   old bytes      new bytes      delta
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4        40532648       35698276       -11.93%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4       60340216       53409568       -11.49%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4       81087336       72065552       -11.13%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4       142485576      120878544      -15.16%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4      208661368      203831136      -2.31%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4     347345904      340484696      -1.98%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4     585185856      576244648      -1.53%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4     1357641792     1358966528     +0.10%
BenchmarkCompactionFromHead/labelnames=1,labelvalues=100000-4                               126486664      119666744      -5.39%
BenchmarkCompactionFromHead/labelnames=10,labelvalues=10000-4                               122323192      115117224      -5.89%
BenchmarkCompactionFromHead/labelnames=100,labelvalues=1000-4                               126404504      119469864      -5.49%
BenchmarkCompactionFromHead/labelnames=1000,labelvalues=100-4                               119047832      112230408      -5.73%
BenchmarkCompactionFromHead/labelnames=10000,labelvalues=10-4                               136576016      116634800      -14.60%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-12-17 11:49:54 -08:00
+									res := make([]string, 0, len(h.head.symbols))
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
 									for s := range h.head.symbols {
-												Stream symbols during compaction. (#6468)

Rather than buffer up symbols in RAM, do it one by one
during compaction. Then use the reader's symbol handling
for symbol lookups during the rest of the index write.

There is some slowdown in compaction, due to having to look through a file
rather than a hash lookup. This is noise to the overall cost of compacting
series with thousands of samples though.

benchmark                                                                                   old ns/op       new ns/op       delta
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4        539917175       675341565       +25.08%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4       2441815993      2477453524      +1.46%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4       3978543559      3922909687      -1.40%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4       8430219716      8586610007      +1.86%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4      1786424591      1909552782      +6.89%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4     5328998202      6020839950      +12.98%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4     10085059958     11085278690     +9.92%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4     25497010155     27018079806     +5.97%
BenchmarkCompactionFromHead/labelnames=1,labelvalues=100000-4                               2427391406      2817217987      +16.06%
BenchmarkCompactionFromHead/labelnames=10,labelvalues=10000-4                               2592965497      2538805050      -2.09%
BenchmarkCompactionFromHead/labelnames=100,labelvalues=1000-4                               2437388343      2668012858      +9.46%
BenchmarkCompactionFromHead/labelnames=1000,labelvalues=100-4                               2317095324      2787423966      +20.30%
BenchmarkCompactionFromHead/labelnames=10000,labelvalues=10-4                               2600239857      2096973860      -19.35%

benchmark                                                                                   old allocs     new allocs     delta
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4        500851         470794         -6.00%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4       821527         791451         -3.66%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4       1141562        1111508        -2.63%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4       2141576        2111504        -1.40%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4      871466         841424         -3.45%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4     1941428        1911415        -1.55%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4     3071573        3041510        -0.98%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4     6771648        6741509        -0.45%
BenchmarkCompactionFromHead/labelnames=1,labelvalues=100000-4                               731493         824888         +12.77%
BenchmarkCompactionFromHead/labelnames=10,labelvalues=10000-4                               793918         887311         +11.76%
BenchmarkCompactionFromHead/labelnames=100,labelvalues=1000-4                               811842         905204         +11.50%
BenchmarkCompactionFromHead/labelnames=1000,labelvalues=100-4                               832244         925081         +11.16%
BenchmarkCompactionFromHead/labelnames=10000,labelvalues=10-4                               921553         1019162        +10.59%

benchmark                                                                                   old bytes      new bytes      delta
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4        40532648       35698276       -11.93%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4       60340216       53409568       -11.49%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4       81087336       72065552       -11.13%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4       142485576      120878544      -15.16%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4      208661368      203831136      -2.31%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4     347345904      340484696      -1.98%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4     585185856      576244648      -1.53%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4     1357641792     1358966528     +0.10%
BenchmarkCompactionFromHead/labelnames=1,labelvalues=100000-4                               126486664      119666744      -5.39%
BenchmarkCompactionFromHead/labelnames=10,labelvalues=10000-4                               122323192      115117224      -5.89%
BenchmarkCompactionFromHead/labelnames=100,labelvalues=1000-4                               126404504      119469864      -5.49%
BenchmarkCompactionFromHead/labelnames=1000,labelvalues=100-4                               119047832      112230408      -5.73%
BenchmarkCompactionFromHead/labelnames=10000,labelvalues=10-4                               136576016      116634800      -14.60%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-12-17 11:49:54 -08:00
+										res = append(res, s)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									}
-												Stream symbols during compaction. (#6468)

Rather than buffer up symbols in RAM, do it one by one
during compaction. Then use the reader's symbol handling
for symbol lookups during the rest of the index write.

There is some slowdown in compaction, due to having to look through a file
rather than a hash lookup. This is noise to the overall cost of compacting
series with thousands of samples though.

benchmark                                                                                   old ns/op       new ns/op       delta
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4        539917175       675341565       +25.08%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4       2441815993      2477453524      +1.46%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4       3978543559      3922909687      -1.40%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4       8430219716      8586610007      +1.86%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4      1786424591      1909552782      +6.89%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4     5328998202      6020839950      +12.98%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4     10085059958     11085278690     +9.92%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4     25497010155     27018079806     +5.97%
BenchmarkCompactionFromHead/labelnames=1,labelvalues=100000-4                               2427391406      2817217987      +16.06%
BenchmarkCompactionFromHead/labelnames=10,labelvalues=10000-4                               2592965497      2538805050      -2.09%
BenchmarkCompactionFromHead/labelnames=100,labelvalues=1000-4                               2437388343      2668012858      +9.46%
BenchmarkCompactionFromHead/labelnames=1000,labelvalues=100-4                               2317095324      2787423966      +20.30%
BenchmarkCompactionFromHead/labelnames=10000,labelvalues=10-4                               2600239857      2096973860      -19.35%

benchmark                                                                                   old allocs     new allocs     delta
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4        500851         470794         -6.00%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4       821527         791451         -3.66%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4       1141562        1111508        -2.63%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4       2141576        2111504        -1.40%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4      871466         841424         -3.45%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4     1941428        1911415        -1.55%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4     3071573        3041510        -0.98%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4     6771648        6741509        -0.45%
BenchmarkCompactionFromHead/labelnames=1,labelvalues=100000-4                               731493         824888         +12.77%
BenchmarkCompactionFromHead/labelnames=10,labelvalues=10000-4                               793918         887311         +11.76%
BenchmarkCompactionFromHead/labelnames=100,labelvalues=1000-4                               811842         905204         +11.50%
BenchmarkCompactionFromHead/labelnames=1000,labelvalues=100-4                               832244         925081         +11.16%
BenchmarkCompactionFromHead/labelnames=10000,labelvalues=10-4                               921553         1019162        +10.59%

benchmark                                                                                   old bytes      new bytes      delta
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4        40532648       35698276       -11.93%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4       60340216       53409568       -11.49%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4       81087336       72065552       -11.13%
BenchmarkCompaction/type=normal,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4       142485576      120878544      -15.16%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=101-4      208661368      203831136      -2.31%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=1001-4     347345904      340484696      -1.98%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=2001-4     585185856      576244648      -1.53%
BenchmarkCompaction/type=vertical,blocks=4,series=10000,samplesPerSeriesPerBlock=5001-4     1357641792     1358966528     +0.10%
BenchmarkCompactionFromHead/labelnames=1,labelvalues=100000-4                               126486664      119666744      -5.39%
BenchmarkCompactionFromHead/labelnames=10,labelvalues=10000-4                               122323192      115117224      -5.89%
BenchmarkCompactionFromHead/labelnames=100,labelvalues=1000-4                               126404504      119469864      -5.49%
BenchmarkCompactionFromHead/labelnames=1000,labelvalues=100-4                               119047832      112230408      -5.73%
BenchmarkCompactionFromHead/labelnames=10000,labelvalues=10-4                               136576016      116634800      -14.60%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-12-17 11:49:54 -08:00
+									h.head.symMtx.RUnlock()
 									sort.Strings(res)
 									return index.NewStringListIter(res)
-												Misc fixes for initial Prometheus integration

											
										
										
											2016-12-14 09:38:46 -08:00
+								}
-												Added time range parameters to labelNames API (#7288)

* add time range params to labelNames api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* evaluate min/max time range when reading labels from the head

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add time range params to labelValues api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test, add docs

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add a test for head min max range

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test to match comment

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* address CR comments

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* combine vars only used once

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add time range params to labelNames api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* evaluate min/max time range when reading labels from the head

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add time range params to labelValues api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test, add docs

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add a test for head min max range

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test to match comment

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* address CR comments

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* combine vars only used once

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* restart ci

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* use range expectedLabelNames instead of range actualLabelNames in test

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>
											
										
										
											2020-05-30 05:50:09 -07:00
+								// LabelValues returns label values present in the head for the
 								// specific label name that are within the time range mint to maxt.
-												Replace StringTuples with []string

Benchmarks show slight cpu/allocs improvements.

benchmark                                                               old ns/op      new ns/op      delta
BenchmarkPostingsForMatchers/Head/n="1"-4                               269978625      235305110      -12.84%
BenchmarkPostingsForMatchers/Head/n="1",j="foo"-4                       129739974      121646193      -6.24%
BenchmarkPostingsForMatchers/Head/j="foo",n="1"-4                       123826274      122056253      -1.43%
BenchmarkPostingsForMatchers/Head/n="1",j!="foo"-4                      126962188      130038235      +2.42%
BenchmarkPostingsForMatchers/Head/i=~".*"-4                             6423653989     5991126455     -6.73%
BenchmarkPostingsForMatchers/Head/i=~".+"-4                             6934647521     7033370634     +1.42%
BenchmarkPostingsForMatchers/Head/i=~""-4                               1177781285     1121497736     -4.78%
BenchmarkPostingsForMatchers/Head/i!=""-4                               7033680256     7246094991     +3.02%
BenchmarkPostingsForMatchers/Head/n="1",i=~".*",j="foo"-4               293702332      287440212      -2.13%
BenchmarkPostingsForMatchers/Head/n="1",i=~".*",i!="2",j="foo"-4        307628268      307039964      -0.19%
BenchmarkPostingsForMatchers/Head/n="1",i!=""-4                         512247746      480003862      -6.29%
BenchmarkPostingsForMatchers/Head/n="1",i!="",j="foo"-4                 361199794      367066917      +1.62%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",j="foo"-4               478863761      476037784      -0.59%
BenchmarkPostingsForMatchers/Head/n="1",i=~"1.+",j="foo"-4              103394659      102902098      -0.48%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",i!="2",j="foo"-4        482552781      475453903      -1.47%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",i!~"2.*",j="foo"-4      559257389      589297047      +5.37%
BenchmarkPostingsForMatchers/Block/n="1"-4                              36492          37012          +1.42%
BenchmarkPostingsForMatchers/Block/n="1",j="foo"-4                      557788         611903         +9.70%
BenchmarkPostingsForMatchers/Block/j="foo",n="1"-4                      554443         573814         +3.49%
BenchmarkPostingsForMatchers/Block/n="1",j!="foo"-4                     553227         553826         +0.11%
BenchmarkPostingsForMatchers/Block/i=~".*"-4                            113855090      111707221      -1.89%
BenchmarkPostingsForMatchers/Block/i=~".+"-4                            133994674      136520728      +1.89%
BenchmarkPostingsForMatchers/Block/i=~""-4                              38138091       36299898       -4.82%
BenchmarkPostingsForMatchers/Block/i!=""-4                              28861213       27396723       -5.07%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",j="foo"-4              112699941      110853868      -1.64%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",i!="2",j="foo"-4       113198026      111389742      -1.60%
BenchmarkPostingsForMatchers/Block/n="1",i!=""-4                        28994069       27363804       -5.62%
BenchmarkPostingsForMatchers/Block/n="1",i!="",j="foo"-4                29709406       28589223       -3.77%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",j="foo"-4              134695119      135736971      +0.77%
BenchmarkPostingsForMatchers/Block/n="1",i=~"1.+",j="foo"-4             26783286       25826928       -3.57%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!="2",j="foo"-4       134733254      134116739      -0.46%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!~"2.*",j="foo"-4     160713937      158802768      -1.19%

benchmark                                                               old allocs     new allocs     delta
BenchmarkPostingsForMatchers/Head/n="1"-4                               36             36             +0.00%
BenchmarkPostingsForMatchers/Head/n="1",j="foo"-4                       38             38             +0.00%
BenchmarkPostingsForMatchers/Head/j="foo",n="1"-4                       38             38             +0.00%
BenchmarkPostingsForMatchers/Head/n="1",j!="foo"-4                      42             40             -4.76%
BenchmarkPostingsForMatchers/Head/i=~".*"-4                             61             59             -3.28%
BenchmarkPostingsForMatchers/Head/i=~".+"-4                             100088         100087         -0.00%
BenchmarkPostingsForMatchers/Head/i=~""-4                               100053         100051         -0.00%
BenchmarkPostingsForMatchers/Head/i!=""-4                               100087         100085         -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".*",j="foo"-4               44             42             -4.55%
BenchmarkPostingsForMatchers/Head/n="1",i=~".*",i!="2",j="foo"-4        50             48             -4.00%
BenchmarkPostingsForMatchers/Head/n="1",i!=""-4                         100076         100074         -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i!="",j="foo"-4                 100077         100075         -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",j="foo"-4               100077         100074         -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~"1.+",j="foo"-4              11167          11165          -0.02%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",i!="2",j="foo"-4        100082         100080         -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",i!~"2.*",j="foo"-4      111265         111261         -0.00%
BenchmarkPostingsForMatchers/Block/n="1"-4                              6              6              +0.00%
BenchmarkPostingsForMatchers/Block/n="1",j="foo"-4                      11             11             +0.00%
BenchmarkPostingsForMatchers/Block/j="foo",n="1"-4                      11             11             +0.00%
BenchmarkPostingsForMatchers/Block/n="1",j!="foo"-4                     15             13             -13.33%
BenchmarkPostingsForMatchers/Block/i=~".*"-4                            12             10             -16.67%
BenchmarkPostingsForMatchers/Block/i=~".+"-4                            100040         100038         -0.00%
BenchmarkPostingsForMatchers/Block/i=~""-4                              100045         100043         -0.00%
BenchmarkPostingsForMatchers/Block/i!=""-4                              100041         100039         -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",j="foo"-4              17             15             -11.76%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",i!="2",j="foo"-4       23             21             -8.70%
BenchmarkPostingsForMatchers/Block/n="1",i!=""-4                        100046         100044         -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i!="",j="foo"-4                100050         100048         -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",j="foo"-4              100049         100047         -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~"1.+",j="foo"-4             11150          11148          -0.02%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!="2",j="foo"-4       100055         100053         -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!~"2.*",j="foo"-4     111238         111234         -0.00%

benchmark                                                               old bytes     new bytes     delta
BenchmarkPostingsForMatchers/Head/n="1"-4                               10887816      10887817      +0.00%
BenchmarkPostingsForMatchers/Head/n="1",j="foo"-4                       5456648       5456648       +0.00%
BenchmarkPostingsForMatchers/Head/j="foo",n="1"-4                       5456648       5456648       +0.00%
BenchmarkPostingsForMatchers/Head/n="1",j!="foo"-4                      5456792       5456712       -0.00%
BenchmarkPostingsForMatchers/Head/i=~".*"-4                             258254408     258254328     -0.00%
BenchmarkPostingsForMatchers/Head/i=~".+"-4                             273912888     273912904     +0.00%
BenchmarkPostingsForMatchers/Head/i=~""-4                               17266680      17266600      -0.00%
BenchmarkPostingsForMatchers/Head/i!=""-4                               273912416     273912336     -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".*",j="foo"-4               7062578       7062498       -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".*",i!="2",j="foo"-4        7062770       7062690       -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i!=""-4                         28152346      28152266      -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i!="",j="foo"-4                 22721178      22721098      -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",j="foo"-4               22721336      22721224      -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~"1.+",j="foo"-4              3623804       3623733       -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",i!="2",j="foo"-4        22721480      22721400      -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",i!~"2.*",j="foo"-4      24816652      24816444      -0.00%
BenchmarkPostingsForMatchers/Block/n="1"-4                              296           296           +0.00%
BenchmarkPostingsForMatchers/Block/n="1",j="foo"-4                      424           424           +0.00%
BenchmarkPostingsForMatchers/Block/j="foo",n="1"-4                      424           424           +0.00%
BenchmarkPostingsForMatchers/Block/n="1",j!="foo"-4                     1544          1464          -5.18%
BenchmarkPostingsForMatchers/Block/i=~".*"-4                            1606114       1606045       -0.00%
BenchmarkPostingsForMatchers/Block/i=~".+"-4                            17264709      17264629      -0.00%
BenchmarkPostingsForMatchers/Block/i=~""-4                              17264780      17264696      -0.00%
BenchmarkPostingsForMatchers/Block/i!=""-4                              17264680      17264600      -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",j="foo"-4              1606253       1606165       -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",i!="2",j="foo"-4       1606445       1606348       -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i!=""-4                        17264808      17264728      -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i!="",j="foo"-4                17264936      17264856      -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",j="foo"-4              17264965      17264885      -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~"1.+",j="foo"-4             3148262       3148182       -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!="2",j="foo"-4       17265141      17265061      -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!~"2.*",j="foo"-4     20416944      20416784      -0.00%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2020-01-01 03:38:01 -08:00
+								func (h *headIndexReader) LabelValues(name string) ([]string, error) {
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									h.head.symMtx.RLock()
-												Added time range parameters to labelNames API (#7288)

* add time range params to labelNames api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* evaluate min/max time range when reading labels from the head

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add time range params to labelValues api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test, add docs

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add a test for head min max range

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test to match comment

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* address CR comments

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* combine vars only used once

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add time range params to labelNames api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* evaluate min/max time range when reading labels from the head

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add time range params to labelValues api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test, add docs

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add a test for head min max range

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test to match comment

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* address CR comments

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* combine vars only used once

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* restart ci

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* use range expectedLabelNames instead of range actualLabelNames in test

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>
											
										
										
											2020-05-30 05:50:09 -07:00
 									if h.maxt < h.head.MinTime() || h.mint > h.head.MaxTime() {
 										h.head.symMtx.RUnlock()
 										return []string{}, nil
 									}
-												Remove last vestiges of never used composite index code.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2020-01-01 03:21:42 -08:00
+									sl := make([]string, 0, len(h.head.values[name]))
 									for s := range h.head.values[name] {
-												Misc fixes for initial Prometheus integration

											
										
										
											2016-12-14 09:38:46 -08:00
+										sl = append(sl, s)
 									}
-												Reduce allocations for queries on `HEAD` (#417)

Add some benchmarks for HEAD and allocate the correct slice size in LabelValues, since we already know what it will be.

This is a ~15% time improvement and a ~25% allocation improvement:


```
benchmark                             old ns/op     new ns/op     delta
BenchmarkHeadPostingForMatchers-4     74452         63514         -14.69%

benchmark                             old allocs     new allocs     delta
BenchmarkHeadPostingForMatchers-4     20             13             -35.00%

benchmark                             old bytes     new bytes     delta
BenchmarkHeadPostingForMatchers-4     5425          3137          -42.18%
```

Signed-off-by: Thomas Jackson <jacksontj.89@gmail.com>

											
										
										
											2018-10-22 03:52:02 -07:00
+									h.head.symMtx.RUnlock()
-												Misc fixes for initial Prometheus integration

											
										
										
											2016-12-14 09:38:46 -08:00
+									sort.Strings(sl)
-												Replace StringTuples with []string

Benchmarks show slight cpu/allocs improvements.

benchmark                                                               old ns/op      new ns/op      delta
BenchmarkPostingsForMatchers/Head/n="1"-4                               269978625      235305110      -12.84%
BenchmarkPostingsForMatchers/Head/n="1",j="foo"-4                       129739974      121646193      -6.24%
BenchmarkPostingsForMatchers/Head/j="foo",n="1"-4                       123826274      122056253      -1.43%
BenchmarkPostingsForMatchers/Head/n="1",j!="foo"-4                      126962188      130038235      +2.42%
BenchmarkPostingsForMatchers/Head/i=~".*"-4                             6423653989     5991126455     -6.73%
BenchmarkPostingsForMatchers/Head/i=~".+"-4                             6934647521     7033370634     +1.42%
BenchmarkPostingsForMatchers/Head/i=~""-4                               1177781285     1121497736     -4.78%
BenchmarkPostingsForMatchers/Head/i!=""-4                               7033680256     7246094991     +3.02%
BenchmarkPostingsForMatchers/Head/n="1",i=~".*",j="foo"-4               293702332      287440212      -2.13%
BenchmarkPostingsForMatchers/Head/n="1",i=~".*",i!="2",j="foo"-4        307628268      307039964      -0.19%
BenchmarkPostingsForMatchers/Head/n="1",i!=""-4                         512247746      480003862      -6.29%
BenchmarkPostingsForMatchers/Head/n="1",i!="",j="foo"-4                 361199794      367066917      +1.62%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",j="foo"-4               478863761      476037784      -0.59%
BenchmarkPostingsForMatchers/Head/n="1",i=~"1.+",j="foo"-4              103394659      102902098      -0.48%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",i!="2",j="foo"-4        482552781      475453903      -1.47%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",i!~"2.*",j="foo"-4      559257389      589297047      +5.37%
BenchmarkPostingsForMatchers/Block/n="1"-4                              36492          37012          +1.42%
BenchmarkPostingsForMatchers/Block/n="1",j="foo"-4                      557788         611903         +9.70%
BenchmarkPostingsForMatchers/Block/j="foo",n="1"-4                      554443         573814         +3.49%
BenchmarkPostingsForMatchers/Block/n="1",j!="foo"-4                     553227         553826         +0.11%
BenchmarkPostingsForMatchers/Block/i=~".*"-4                            113855090      111707221      -1.89%
BenchmarkPostingsForMatchers/Block/i=~".+"-4                            133994674      136520728      +1.89%
BenchmarkPostingsForMatchers/Block/i=~""-4                              38138091       36299898       -4.82%
BenchmarkPostingsForMatchers/Block/i!=""-4                              28861213       27396723       -5.07%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",j="foo"-4              112699941      110853868      -1.64%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",i!="2",j="foo"-4       113198026      111389742      -1.60%
BenchmarkPostingsForMatchers/Block/n="1",i!=""-4                        28994069       27363804       -5.62%
BenchmarkPostingsForMatchers/Block/n="1",i!="",j="foo"-4                29709406       28589223       -3.77%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",j="foo"-4              134695119      135736971      +0.77%
BenchmarkPostingsForMatchers/Block/n="1",i=~"1.+",j="foo"-4             26783286       25826928       -3.57%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!="2",j="foo"-4       134733254      134116739      -0.46%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!~"2.*",j="foo"-4     160713937      158802768      -1.19%

benchmark                                                               old allocs     new allocs     delta
BenchmarkPostingsForMatchers/Head/n="1"-4                               36             36             +0.00%
BenchmarkPostingsForMatchers/Head/n="1",j="foo"-4                       38             38             +0.00%
BenchmarkPostingsForMatchers/Head/j="foo",n="1"-4                       38             38             +0.00%
BenchmarkPostingsForMatchers/Head/n="1",j!="foo"-4                      42             40             -4.76%
BenchmarkPostingsForMatchers/Head/i=~".*"-4                             61             59             -3.28%
BenchmarkPostingsForMatchers/Head/i=~".+"-4                             100088         100087         -0.00%
BenchmarkPostingsForMatchers/Head/i=~""-4                               100053         100051         -0.00%
BenchmarkPostingsForMatchers/Head/i!=""-4                               100087         100085         -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".*",j="foo"-4               44             42             -4.55%
BenchmarkPostingsForMatchers/Head/n="1",i=~".*",i!="2",j="foo"-4        50             48             -4.00%
BenchmarkPostingsForMatchers/Head/n="1",i!=""-4                         100076         100074         -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i!="",j="foo"-4                 100077         100075         -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",j="foo"-4               100077         100074         -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~"1.+",j="foo"-4              11167          11165          -0.02%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",i!="2",j="foo"-4        100082         100080         -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",i!~"2.*",j="foo"-4      111265         111261         -0.00%
BenchmarkPostingsForMatchers/Block/n="1"-4                              6              6              +0.00%
BenchmarkPostingsForMatchers/Block/n="1",j="foo"-4                      11             11             +0.00%
BenchmarkPostingsForMatchers/Block/j="foo",n="1"-4                      11             11             +0.00%
BenchmarkPostingsForMatchers/Block/n="1",j!="foo"-4                     15             13             -13.33%
BenchmarkPostingsForMatchers/Block/i=~".*"-4                            12             10             -16.67%
BenchmarkPostingsForMatchers/Block/i=~".+"-4                            100040         100038         -0.00%
BenchmarkPostingsForMatchers/Block/i=~""-4                              100045         100043         -0.00%
BenchmarkPostingsForMatchers/Block/i!=""-4                              100041         100039         -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",j="foo"-4              17             15             -11.76%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",i!="2",j="foo"-4       23             21             -8.70%
BenchmarkPostingsForMatchers/Block/n="1",i!=""-4                        100046         100044         -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i!="",j="foo"-4                100050         100048         -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",j="foo"-4              100049         100047         -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~"1.+",j="foo"-4             11150          11148          -0.02%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!="2",j="foo"-4       100055         100053         -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!~"2.*",j="foo"-4     111238         111234         -0.00%

benchmark                                                               old bytes     new bytes     delta
BenchmarkPostingsForMatchers/Head/n="1"-4                               10887816      10887817      +0.00%
BenchmarkPostingsForMatchers/Head/n="1",j="foo"-4                       5456648       5456648       +0.00%
BenchmarkPostingsForMatchers/Head/j="foo",n="1"-4                       5456648       5456648       +0.00%
BenchmarkPostingsForMatchers/Head/n="1",j!="foo"-4                      5456792       5456712       -0.00%
BenchmarkPostingsForMatchers/Head/i=~".*"-4                             258254408     258254328     -0.00%
BenchmarkPostingsForMatchers/Head/i=~".+"-4                             273912888     273912904     +0.00%
BenchmarkPostingsForMatchers/Head/i=~""-4                               17266680      17266600      -0.00%
BenchmarkPostingsForMatchers/Head/i!=""-4                               273912416     273912336     -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".*",j="foo"-4               7062578       7062498       -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".*",i!="2",j="foo"-4        7062770       7062690       -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i!=""-4                         28152346      28152266      -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i!="",j="foo"-4                 22721178      22721098      -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",j="foo"-4               22721336      22721224      -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~"1.+",j="foo"-4              3623804       3623733       -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",i!="2",j="foo"-4        22721480      22721400      -0.00%
BenchmarkPostingsForMatchers/Head/n="1",i=~".+",i!~"2.*",j="foo"-4      24816652      24816444      -0.00%
BenchmarkPostingsForMatchers/Block/n="1"-4                              296           296           +0.00%
BenchmarkPostingsForMatchers/Block/n="1",j="foo"-4                      424           424           +0.00%
BenchmarkPostingsForMatchers/Block/j="foo",n="1"-4                      424           424           +0.00%
BenchmarkPostingsForMatchers/Block/n="1",j!="foo"-4                     1544          1464          -5.18%
BenchmarkPostingsForMatchers/Block/i=~".*"-4                            1606114       1606045       -0.00%
BenchmarkPostingsForMatchers/Block/i=~".+"-4                            17264709      17264629      -0.00%
BenchmarkPostingsForMatchers/Block/i=~""-4                              17264780      17264696      -0.00%
BenchmarkPostingsForMatchers/Block/i!=""-4                              17264680      17264600      -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",j="foo"-4              1606253       1606165       -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",i!="2",j="foo"-4       1606445       1606348       -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i!=""-4                        17264808      17264728      -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i!="",j="foo"-4                17264936      17264856      -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",j="foo"-4              17264965      17264885      -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~"1.+",j="foo"-4             3148262       3148182       -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!="2",j="foo"-4       17265141      17265061      -0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!~"2.*",j="foo"-4     20416944      20416784      -0.00%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2020-01-01 03:38:01 -08:00
+									return sl, nil
-												Misc fixes for initial Prometheus integration

											
										
										
											2016-12-14 09:38:46 -08:00
+								}
-												Added time range parameters to labelNames API (#7288)

* add time range params to labelNames api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* evaluate min/max time range when reading labels from the head

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add time range params to labelValues api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test, add docs

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add a test for head min max range

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test to match comment

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* address CR comments

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* combine vars only used once

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add time range params to labelNames api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* evaluate min/max time range when reading labels from the head

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add time range params to labelValues api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test, add docs

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add a test for head min max range

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test to match comment

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* address CR comments

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* combine vars only used once

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* restart ci

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* use range expectedLabelNames instead of range actualLabelNames in test

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>
											
										
										
											2020-05-30 05:50:09 -07:00
+								// LabelNames returns all the unique label names present in the head
 								// that are within the time range mint to maxt.
-												LabelNames() method to get all unique label names (#369)

* LabelNames() method to get all unique label names

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2018-11-07 07:52:41 -08:00
+								func (h *headIndexReader) LabelNames() ([]string, error) {
 									h.head.symMtx.RLock()
 									defer h.head.symMtx.RUnlock()
-												Added time range parameters to labelNames API (#7288)

* add time range params to labelNames api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* evaluate min/max time range when reading labels from the head

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add time range params to labelValues api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test, add docs

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add a test for head min max range

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test to match comment

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* address CR comments

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* combine vars only used once

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add time range params to labelNames api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* evaluate min/max time range when reading labels from the head

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add time range params to labelValues api

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test, add docs

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* add a test for head min max range

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test to match comment

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* address CR comments

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* combine vars only used once

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* fix test

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* restart ci

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>

* use range expectedLabelNames instead of range actualLabelNames in test

Signed-off-by: jessicagreben <Jessica.greben1+github@gmail.com>
											
										
										
											2020-05-30 05:50:09 -07:00
 									if h.maxt < h.head.MinTime() || h.mint > h.head.MaxTime() {
 										return []string{}, nil
 									}
-												LabelNames() method to get all unique label names (#369)

* LabelNames() method to get all unique label names

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2018-11-07 07:52:41 -08:00
+									labelNames := make([]string, 0, len(h.head.values))
 									for name := range h.head.values {
 										if name == "" {
 											continue
 										}
 										labelNames = append(labelNames, name)
 									}
 									sort.Strings(labelNames)
 									return labelNames, nil
 								}
-												Reduce memory used by postings offset table.

Rather than keeping the offset of each postings list, instead
keep the nth offset of the offset of the posting list. As postings
list offsets have always been sorted, we can then get to the closest
entry before the one we want and iterate forwards.

I haven't done much tuning on the 32 number, it was chosen to try
not to read through more than a 4k page of data.

Switch to a bulk interface for fetching postings. Use it to avoid having
to re-read parts of the posting offset table when querying lots of it.

For an index with what BenchmarkHeadPostingForMatchers uses, RAM
for r.postings drops from 3.79MB to 80.19kB or about 48x.
Bytes allocated go down by 30%, and surprisingly CPU usage drops by
4-6% for typical queries too.

benchmark                                                               old ns/op      new ns/op      delta
BenchmarkPostingsForMatchers/Block/n="1"-4                              35231          36673          +4.09%
BenchmarkPostingsForMatchers/Block/n="1",j="foo"-4                      563380         540627         -4.04%
BenchmarkPostingsForMatchers/Block/j="foo",n="1"-4                      536782         534186         -0.48%
BenchmarkPostingsForMatchers/Block/n="1",j!="foo"-4                     533990         541550         +1.42%
BenchmarkPostingsForMatchers/Block/i=~".*"-4                            113374598      117969608      +4.05%
BenchmarkPostingsForMatchers/Block/i=~".+"-4                            146329884      139651442      -4.56%
BenchmarkPostingsForMatchers/Block/i=~""-4                              50346510       44961127       -10.70%
BenchmarkPostingsForMatchers/Block/i!=""-4                              41261550       35356165       -14.31%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",j="foo"-4              112544418      116904010      +3.87%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",i!="2",j="foo"-4       112487086      116864918      +3.89%
BenchmarkPostingsForMatchers/Block/n="1",i!=""-4                        41094758       35457904       -13.72%
BenchmarkPostingsForMatchers/Block/n="1",i!="",j="foo"-4                41906372       36151473       -13.73%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",j="foo"-4              147262414      140424800      -4.64%
BenchmarkPostingsForMatchers/Block/n="1",i=~"1.+",j="foo"-4             28615629       27872072       -2.60%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!="2",j="foo"-4       147117177      140462403      -4.52%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!~"2.*",j="foo"-4     175096826      167902298      -4.11%

benchmark                                                               old allocs     new allocs     delta
BenchmarkPostingsForMatchers/Block/n="1"-4                              4              6              +50.00%
BenchmarkPostingsForMatchers/Block/n="1",j="foo"-4                      7              11             +57.14%
BenchmarkPostingsForMatchers/Block/j="foo",n="1"-4                      7              11             +57.14%
BenchmarkPostingsForMatchers/Block/n="1",j!="foo"-4                     15             17             +13.33%
BenchmarkPostingsForMatchers/Block/i=~".*"-4                            100010         100012         +0.00%
BenchmarkPostingsForMatchers/Block/i=~".+"-4                            200069         200040         -0.01%
BenchmarkPostingsForMatchers/Block/i=~""-4                              200072         200045         -0.01%
BenchmarkPostingsForMatchers/Block/i!=""-4                              200070         200041         -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",j="foo"-4              100013         100017         +0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",i!="2",j="foo"-4       100017         100023         +0.01%
BenchmarkPostingsForMatchers/Block/n="1",i!=""-4                        200073         200046         -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i!="",j="foo"-4                200075         200050         -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",j="foo"-4              200074         200049         -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i=~"1.+",j="foo"-4             111165         111150         -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!="2",j="foo"-4       200078         200055         -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!~"2.*",j="foo"-4     311282         311238         -0.01%

benchmark                                                               old bytes     new bytes     delta
BenchmarkPostingsForMatchers/Block/n="1"-4                              264           296           +12.12%
BenchmarkPostingsForMatchers/Block/n="1",j="foo"-4                      360           424           +17.78%
BenchmarkPostingsForMatchers/Block/j="foo",n="1"-4                      360           424           +17.78%
BenchmarkPostingsForMatchers/Block/n="1",j!="foo"-4                     520           552           +6.15%
BenchmarkPostingsForMatchers/Block/i=~".*"-4                            1600461       1600482       +0.00%
BenchmarkPostingsForMatchers/Block/i=~".+"-4                            24900801      17259077      -30.69%
BenchmarkPostingsForMatchers/Block/i=~""-4                              24900836      17259151      -30.69%
BenchmarkPostingsForMatchers/Block/i!=""-4                              24900760      17259048      -30.69%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",j="foo"-4              1600557       1600621       +0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",i!="2",j="foo"-4       1600717       1600813       +0.01%
BenchmarkPostingsForMatchers/Block/n="1",i!=""-4                        24900856      17259176      -30.69%
BenchmarkPostingsForMatchers/Block/n="1",i!="",j="foo"-4                24900952      17259304      -30.69%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",j="foo"-4              24900993      17259333      -30.69%
BenchmarkPostingsForMatchers/Block/n="1",i=~"1.+",j="foo"-4             3788311       3142630       -17.04%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!="2",j="foo"-4       24901137      17259509      -30.69%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!~"2.*",j="foo"-4     28693086      20405680      -28.88%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2019-12-05 10:27:40 -08:00
+								// Postings returns the postings list iterator for the label pairs.
 								func (h *headIndexReader) Postings(name string, values ...string) (index.Postings, error) {
 									res := make([]index.Postings, 0, len(values))
 									for _, value := range values {
-												Revert head posting optimization

This reverts commit 52630ad0c735f2dce4ce5bb851acb6c5d7df5eb1.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-25 12:13:47 -07:00
+										res = append(res, h.head.postings.Get(name, value))
-												Reduce memory used by postings offset table.

Rather than keeping the offset of each postings list, instead
keep the nth offset of the offset of the posting list. As postings
list offsets have always been sorted, we can then get to the closest
entry before the one we want and iterate forwards.

I haven't done much tuning on the 32 number, it was chosen to try
not to read through more than a 4k page of data.

Switch to a bulk interface for fetching postings. Use it to avoid having
to re-read parts of the posting offset table when querying lots of it.

For an index with what BenchmarkHeadPostingForMatchers uses, RAM
for r.postings drops from 3.79MB to 80.19kB or about 48x.
Bytes allocated go down by 30%, and surprisingly CPU usage drops by
4-6% for typical queries too.

benchmark                                                               old ns/op      new ns/op      delta
BenchmarkPostingsForMatchers/Block/n="1"-4                              35231          36673          +4.09%
BenchmarkPostingsForMatchers/Block/n="1",j="foo"-4                      563380         540627         -4.04%
BenchmarkPostingsForMatchers/Block/j="foo",n="1"-4                      536782         534186         -0.48%
BenchmarkPostingsForMatchers/Block/n="1",j!="foo"-4                     533990         541550         +1.42%
BenchmarkPostingsForMatchers/Block/i=~".*"-4                            113374598      117969608      +4.05%
BenchmarkPostingsForMatchers/Block/i=~".+"-4                            146329884      139651442      -4.56%
BenchmarkPostingsForMatchers/Block/i=~""-4                              50346510       44961127       -10.70%
BenchmarkPostingsForMatchers/Block/i!=""-4                              41261550       35356165       -14.31%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",j="foo"-4              112544418      116904010      +3.87%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",i!="2",j="foo"-4       112487086      116864918      +3.89%
BenchmarkPostingsForMatchers/Block/n="1",i!=""-4                        41094758       35457904       -13.72%
BenchmarkPostingsForMatchers/Block/n="1",i!="",j="foo"-4                41906372       36151473       -13.73%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",j="foo"-4              147262414      140424800      -4.64%
BenchmarkPostingsForMatchers/Block/n="1",i=~"1.+",j="foo"-4             28615629       27872072       -2.60%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!="2",j="foo"-4       147117177      140462403      -4.52%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!~"2.*",j="foo"-4     175096826      167902298      -4.11%

benchmark                                                               old allocs     new allocs     delta
BenchmarkPostingsForMatchers/Block/n="1"-4                              4              6              +50.00%
BenchmarkPostingsForMatchers/Block/n="1",j="foo"-4                      7              11             +57.14%
BenchmarkPostingsForMatchers/Block/j="foo",n="1"-4                      7              11             +57.14%
BenchmarkPostingsForMatchers/Block/n="1",j!="foo"-4                     15             17             +13.33%
BenchmarkPostingsForMatchers/Block/i=~".*"-4                            100010         100012         +0.00%
BenchmarkPostingsForMatchers/Block/i=~".+"-4                            200069         200040         -0.01%
BenchmarkPostingsForMatchers/Block/i=~""-4                              200072         200045         -0.01%
BenchmarkPostingsForMatchers/Block/i!=""-4                              200070         200041         -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",j="foo"-4              100013         100017         +0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",i!="2",j="foo"-4       100017         100023         +0.01%
BenchmarkPostingsForMatchers/Block/n="1",i!=""-4                        200073         200046         -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i!="",j="foo"-4                200075         200050         -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",j="foo"-4              200074         200049         -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i=~"1.+",j="foo"-4             111165         111150         -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!="2",j="foo"-4       200078         200055         -0.01%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!~"2.*",j="foo"-4     311282         311238         -0.01%

benchmark                                                               old bytes     new bytes     delta
BenchmarkPostingsForMatchers/Block/n="1"-4                              264           296           +12.12%
BenchmarkPostingsForMatchers/Block/n="1",j="foo"-4                      360           424           +17.78%
BenchmarkPostingsForMatchers/Block/j="foo",n="1"-4                      360           424           +17.78%
BenchmarkPostingsForMatchers/Block/n="1",j!="foo"-4                     520           552           +6.15%
BenchmarkPostingsForMatchers/Block/i=~".*"-4                            1600461       1600482       +0.00%
BenchmarkPostingsForMatchers/Block/i=~".+"-4                            24900801      17259077      -30.69%
BenchmarkPostingsForMatchers/Block/i=~""-4                              24900836      17259151      -30.69%
BenchmarkPostingsForMatchers/Block/i!=""-4                              24900760      17259048      -30.69%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",j="foo"-4              1600557       1600621       +0.00%
BenchmarkPostingsForMatchers/Block/n="1",i=~".*",i!="2",j="foo"-4       1600717       1600813       +0.01%
BenchmarkPostingsForMatchers/Block/n="1",i!=""-4                        24900856      17259176      -30.69%
BenchmarkPostingsForMatchers/Block/n="1",i!="",j="foo"-4                24900952      17259304      -30.69%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",j="foo"-4              24900993      17259333      -30.69%
BenchmarkPostingsForMatchers/Block/n="1",i=~"1.+",j="foo"-4             3788311       3142630       -17.04%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!="2",j="foo"-4       24901137      17259509      -30.69%
BenchmarkPostingsForMatchers/Block/n="1",i=~".+",i!~"2.*",j="foo"-4     28693086      20405680      -28.88%

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2019-12-05 10:27:40 -08:00
+									}
 									return index.Merge(res...), nil
-												Misc fixes for initial Prometheus integration

											
										
										
											2016-12-14 09:38:46 -08:00
+								}
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+								func (h *headIndexReader) SortedPostings(p index.Postings) index.Postings {
-												Move series fetches out of inner loop of SortedPostings. (#485)

With 1M series:

Before:
BenchmarkHeadPostingForMatchers-8              1        3501996117 ns/op 61311520 B/op         78 allocs/op

After:
BenchmarkHeadPostingForMatchers-8              1        1403072952 ns/op 69261568 B/op         72 allocs/op

This works out as 3X faster, as the above time includes other things.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-01-03 02:35:10 -08:00
+									series := make([]*memSeries, 0, 128)
-												Persist series without allocating the full set

Change index persistence for series to not be accumulated in memory
before being written as one large batch. `Labels` and `ChunkMeta`
objects are reused.
This cuts down memory spikes during compaction of multiple blocks
significantly.

As part of this, the Index{Reader,Writer} now have an explicit notion of
symbols and series must be inserted in order.

											
										
										
											2017-08-05 04:31:48 -07:00
-												Move series fetches out of inner loop of SortedPostings. (#485)

With 1M series:

Before:
BenchmarkHeadPostingForMatchers-8              1        3501996117 ns/op 61311520 B/op         78 allocs/op

After:
BenchmarkHeadPostingForMatchers-8              1        1403072952 ns/op 69261568 B/op         72 allocs/op

This works out as 3X faster, as the above time includes other things.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-01-03 02:35:10 -08:00
+									// Fetch all the series only once.
-												Persist series without allocating the full set

Change index persistence for series to not be accumulated in memory
before being written as one large batch. `Labels` and `ChunkMeta`
objects are reused.
This cuts down memory spikes during compaction of multiple blocks
significantly.

As part of this, the Index{Reader,Writer} now have an explicit notion of
symbols and series must be inserted in order.

											
										
										
											2017-08-05 04:31:48 -07:00
+									for p.Next() {
-												Move series fetches out of inner loop of SortedPostings. (#485)

With 1M series:

Before:
BenchmarkHeadPostingForMatchers-8              1        3501996117 ns/op 61311520 B/op         78 allocs/op

After:
BenchmarkHeadPostingForMatchers-8              1        1403072952 ns/op 69261568 B/op         72 allocs/op

This works out as 3X faster, as the above time includes other things.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-01-03 02:35:10 -08:00
+										s := h.head.series.getByID(p.At())
 										if s == nil {
-												Capitalizing first letter of all log lines (#7043)

Signed-off-by: Marek Slabicki <thaniri@gmail.com>
											
										
										
											2020-04-11 01:22:18 -07:00
+											level.Debug(h.head.logger).Log("msg", "Looked up series not found")
-												Move series fetches out of inner loop of SortedPostings. (#485)

With 1M series:

Before:
BenchmarkHeadPostingForMatchers-8              1        3501996117 ns/op 61311520 B/op         78 allocs/op

After:
BenchmarkHeadPostingForMatchers-8              1        1403072952 ns/op 69261568 B/op         72 allocs/op

This works out as 3X faster, as the above time includes other things.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-01-03 02:35:10 -08:00
+										} else {
 											series = append(series, s)
 										}
-												Persist series without allocating the full set

Change index persistence for series to not be accumulated in memory
before being written as one large batch. `Labels` and `ChunkMeta`
objects are reused.
This cuts down memory spikes during compaction of multiple blocks
significantly.

As part of this, the Index{Reader,Writer} now have an explicit notion of
symbols and series must be inserted in order.

											
										
										
											2017-08-05 04:31:48 -07:00
+									}
 									if err := p.Err(); err != nil {
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+										return index.ErrPostings(errors.Wrap(err, "expand postings"))
-												Persist series without allocating the full set

Change index persistence for series to not be accumulated in memory
before being written as one large batch. `Labels` and `ChunkMeta`
objects are reused.
This cuts down memory spikes during compaction of multiple blocks
significantly.

As part of this, the Index{Reader,Writer} now have an explicit notion of
symbols and series must be inserted in order.

											
										
										
											2017-08-05 04:31:48 -07:00
+									}
-												Move series fetches out of inner loop of SortedPostings. (#485)

With 1M series:

Before:
BenchmarkHeadPostingForMatchers-8              1        3501996117 ns/op 61311520 B/op         78 allocs/op

After:
BenchmarkHeadPostingForMatchers-8              1        1403072952 ns/op 69261568 B/op         72 allocs/op

This works out as 3X faster, as the above time includes other things.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-01-03 02:35:10 -08:00
+									sort.Slice(series, func(i, j int) bool {
 										return labels.Compare(series[i].lset, series[j].lset) < 0
-												Persist series without allocating the full set

Change index persistence for series to not be accumulated in memory
before being written as one large batch. `Labels` and `ChunkMeta`
objects are reused.
This cuts down memory spikes during compaction of multiple blocks
significantly.

As part of this, the Index{Reader,Writer} now have an explicit notion of
symbols and series must be inserted in order.

											
										
										
											2017-08-05 04:31:48 -07:00
+									})
-												Move series fetches out of inner loop of SortedPostings. (#485)

With 1M series:

Before:
BenchmarkHeadPostingForMatchers-8              1        3501996117 ns/op 61311520 B/op         78 allocs/op

After:
BenchmarkHeadPostingForMatchers-8              1        1403072952 ns/op 69261568 B/op         72 allocs/op

This works out as 3X faster, as the above time includes other things.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-01-03 02:35:10 -08:00
 									// Convert back to list.
 									ep := make([]uint64, 0, len(series))
 									for _, p := range series {
 										ep = append(ep, p.ref)
 									}
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+									return index.NewListPostings(ep)
-												Persist series without allocating the full set

Change index persistence for series to not be accumulated in memory
before being written as one large batch. `Labels` and `ChunkMeta`
objects are reused.
This cuts down memory spikes during compaction of multiple blocks
significantly.

As part of this, the Index{Reader,Writer} now have an explicit notion of
symbols and series must be inserted in order.

											
										
										
											2017-08-05 04:31:48 -07:00
+								}
-												Misc fixes for initial Prometheus integration

											
										
										
											2016-12-14 09:38:46 -08:00
+								// Series returns the series for the given reference.
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+								func (h *headIndexReader) Series(ref uint64, lbls *labels.Labels, chks *[]chunks.Meta) error {
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									s := h.head.series.getByID(ref)
-												Use separate lock for series creation

This uses the head block's own lock to only lock if new series were
encountered.
In the general append case we just need to hold a

											
										
										
											2017-01-06 08:23:12 -08:00
-												Switch append refs to string

											
										
										
											2017-05-17 07:43:01 -07:00
+									if s == nil {
-												head: track number of series not found errors in metric

											
										
										
											2017-10-12 06:25:12 -07:00
+										h.head.metrics.seriesNotFound.Inc()
-												Make TSDB use storage errors

This fixes #6992, which was introduced by #6777. There was an
intermediate component which translated TSDB errors into storage errors,
but that component was deleted and this bug went unnoticed, until we
were looking at the Prombench results. Without this, scrape will fail
instead of dropping samples or using "Add" when the series have been
garbage collected.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-16 14:52:02 -07:00
+										return storage.ErrNotFound
-												Switch append refs to string

											
										
										
											2017-05-17 07:43:01 -07:00
+									}
-												Persist series without allocating the full set

Change index persistence for series to not be accumulated in memory
before being written as one large batch. `Labels` and `ChunkMeta`
objects are reused.
This cuts down memory spikes during compaction of multiple blocks
significantly.

As part of this, the Index{Reader,Writer} now have an explicit notion of
symbols and series must be inserted in order.

											
										
										
											2017-08-05 04:31:48 -07:00
+									*lbls = append((*lbls)[:0], s.lset...)
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
-												Fix various races

											
										
										
											2017-09-07 23:48:19 -07:00
+									s.Lock()
 									defer s.Unlock()
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
-												Persist series without allocating the full set

Change index persistence for series to not be accumulated in memory
before being written as one large batch. `Labels` and `ChunkMeta`
objects are reused.
This cuts down memory spikes during compaction of multiple blocks
significantly.

As part of this, the Index{Reader,Writer} now have an explicit notion of
symbols and series must be inserted in order.

											
										
										
											2017-08-05 04:31:48 -07:00
+									*chks = (*chks)[:0]
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									for i, c := range s.mmappedChunks {
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+										// Do not expose chunks that are outside of the specified range.
-												Make interval overlap comparisons more explicit

Blocks are half-open intervals [a, b), while all other intervals
(chunks, head, ...) are closed intervals [a, b].

Make that distinction explicit by defining `OverlapsClosedInterval()`
methods for blocks and chunks, and using them in place of the more
generic `intervalOverlap()` function.

This change also fixes `db.Querier()` and `db.Delete()`, which could
previously return one extraneous block at the end of the specified
interval.

Signed-off-by: Benoît Knecht <benoit.knecht@fsfe.org>

											
										
										
											2018-07-02 01:23:36 -07:00
+										if !c.OverlapsClosedInterval(h.mint, h.maxt) {
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+											continue
 										}
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+										*chks = append(*chks, chunks.Meta{
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
+											MinTime: c.minTime,
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+											MaxTime: c.maxTime,
-												Change series ID from uint32 to uint64

											
										
										
											2017-09-04 07:08:38 -07:00
+											Ref:     packChunkID(s.ref, uint64(s.chunkID(i))),
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
+										})
-												Consolidate persistence and compaction

											
										
										
											2017-01-03 06:43:26 -08:00
+									}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									if s.headChunk != nil && s.headChunk.OverlapsClosedInterval(h.mint, h.maxt) {
 										*chks = append(*chks, chunks.Meta{
 											MinTime: s.headChunk.minTime,
 											MaxTime: math.MaxInt64, // Set the head chunks as open (being appended to).
 											Ref:     packChunkID(s.ref, uint64(s.chunkID(len(s.mmappedChunks)))),
 										})
 									}
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
-												Persist series without allocating the full set

Change index persistence for series to not be accumulated in memory
before being written as one large batch. `Labels` and `ChunkMeta`
objects are reused.
This cuts down memory spikes during compaction of multiple blocks
significantly.

As part of this, the Index{Reader,Writer} now have an explicit notion of
symbols and series must be inserted in order.

											
										
										
											2017-08-05 04:31:48 -07:00
+									return nil
-												Modify IndexReader API to accommodate compaction

This changes the IndexReader API to expose plain labels
and chunk meta information instead of a Series interface.
Dropping of irrelevant chunks is moved into the querier.

A LabelIndices method is added to query for existing label
value indices.

											
										
										
											2016-12-31 06:35:08 -08:00
+								}
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+								func (h *Head) getOrCreate(hash uint64, lset labels.Labels) (*memSeries, bool, error) {
-												Simplify series create logic in head

											
										
										
											2017-09-18 03:28:56 -07:00
+									// Just using `getOrSet` below would be semantically sufficient, but we'd create
 									// a new series on every sample inserted via Add(), which causes allocations
 									// and makes our series IDs rather random and harder to compress in postings.
 									s := h.series.getByHash(hash, lset)
 									if s != nil {
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+										return s, false, nil
-												Simplify series create logic in head

											
										
										
											2017-09-18 03:28:56 -07:00
+									}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									// Optimistically assume that we are the first one to create the series.
-												Change series ID from uint32 to uint64

											
										
										
											2017-09-04 07:08:38 -07:00
+									id := atomic.AddUint64(&h.lastSeriesID, 1)
-												Create series with ID recorded in WAL when reading it back

											
										
										
											2017-09-19 01:20:19 -07:00
 									return h.getOrCreateWithID(id, hash, lset)
 								}
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+								func (h *Head) getOrCreateWithID(id, hash uint64, lset labels.Labels) (*memSeries, bool, error) {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									s := newMemSeries(lset, id, h.chunkRange, &h.memChunkPool)
-												Consolidate mem index into HeadBlock

											
										
										
											2016-12-21 16:12:28 -08:00
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+									s, created, err := h.series.getOrSet(hash, s)
 									if err != nil {
 										return nil, false, err
 									}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									if !created {
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+										return s, false, nil
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									}
-												Simplify series create logic in head

											
										
										
											2017-09-18 03:28:56 -07:00
+									h.metrics.seriesCreated.Inc()
-												Open db in Read only mode (#588)

* Added db read only open mode and use it for the tsdb cli.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2019-07-23 01:04:48 -07:00
+									atomic.AddUint64(&h.numSeries, 1)
-												Simplify series create logic in head

											
										
										
											2017-09-18 03:28:56 -07:00
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+									h.postings.Add(id, lset)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
 									h.symMtx.Lock()
 									defer h.symMtx.Unlock()
-												Consolidate mem index into HeadBlock

											
										
										
											2016-12-21 16:12:28 -08:00
 									for _, l := range lset {
 										valset, ok := h.values[l.Name]
 										if !ok {
 											valset = stringset{}
 											h.values[l.Name] = valset
 										}
 										valset.set(l.Value)
-												Consolidate persistence and compaction

											
										
										
											2017-01-03 06:43:26 -08:00
-												Persist series without allocating the full set

Change index persistence for series to not be accumulated in memory
before being written as one large batch. `Labels` and `ChunkMeta`
objects are reused.
This cuts down memory spikes during compaction of multiple blocks
significantly.

As part of this, the Index{Reader,Writer} now have an explicit notion of
symbols and series must be inserted in order.

											
										
										
											2017-08-05 04:31:48 -07:00
+										h.symbols[l.Name] = struct{}{}
 										h.symbols[l.Value] = struct{}{}
-												Consolidate mem index into HeadBlock

											
										
										
											2016-12-21 16:12:28 -08:00
+									}
-												Consolidate persistence and compaction

											
										
										
											2017-01-03 06:43:26 -08:00
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+									return s, true, nil
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+								}
 								// seriesHashmap is a simple hashmap for memSeries by their label set. It is built
 								// on top of a regular hashmap and holds a slice of series to resolve hash collisions:
 								// every series whose label set hashes to the same key shares one bucket, and the
 								// methods tell the entries apart by comparing label sets with labels.Equal.
 								// Its methods require the hash to be submitted with it to avoid re-computations throughout
 								// the code.
 								type seriesHashmap map[uint64][]*memSeries
-												Add new interfaces and skeleton

											
										
										
											2016-12-04 04:16:11 -08:00
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+								func (m seriesHashmap) get(hash uint64, lset labels.Labels) *memSeries {
 									for _, s := range m[hash] {
-												Port tsdb to use pkg/labels. (#6326)

* Port tsdb to use pkg/labels.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Get tests passing.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Remove useless cast.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Appease linters.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-11-18 11:53:33 -08:00
+										if labels.Equal(s.lset, lset) {
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+											return s
 										}
 									}
 									return nil
 								}
 								func (m seriesHashmap) set(hash uint64, s *memSeries) {
 									l := m[hash]
 									for i, prev := range l {
-												Port tsdb to use pkg/labels. (#6326)

* Port tsdb to use pkg/labels.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Get tests passing.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Remove useless cast.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Appease linters.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-11-18 11:53:33 -08:00
+										if labels.Equal(prev.lset, s.lset) {
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+											l[i] = s
 											return
 										}
 									}
 									m[hash] = append(l, s)
 								}
 								func (m seriesHashmap) del(hash uint64, lset labels.Labels) {
 									var rem []*memSeries
 									for _, s := range m[hash] {
-												Port tsdb to use pkg/labels. (#6326)

* Port tsdb to use pkg/labels.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Get tests passing.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Remove useless cast.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Appease linters.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-11-18 11:53:33 -08:00
+										if !labels.Equal(s.lset, lset) {
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+											rem = append(rem, s)
 										}
 									}
 									if len(rem) == 0 {
 										delete(m, hash)
 									} else {
 										m[hash] = rem
 									}
 								}
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-29 23:12:43 -08:00
const (
	// DefaultStripeSize is the default number of entries to allocate in the stripeSeries hash map.
	DefaultStripeSize = 1 << 14
)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+								// stripeSeries locks modulo ranges of IDs and hashes to reduce lock contention.
-												Fix typos in comments (#254)

a the -> the
timestmap -> timestamp
badded -> padded
its -> it is
callers -> caller's
											
										
										
											2018-01-13 09:51:50 -08:00
+								// The locks are padded to not be on the same cache line. Filling the padded space
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+								// with the maps was profiled to be slower – likely due to the additional pointer
 								// dereferences.
 								type stripeSeries struct {
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+									size                    int
 									series                  []map[uint64]*memSeries
 									hashes                  []seriesHashmap
 									locks                   []stripeLock
 									seriesLifecycleCallback SeriesLifecycleCallback
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+								}
 								// stripeLock is the lock guarding a single stripe of stripeSeries.
 								// It embeds sync.RWMutex, so the lock methods are available on it directly.
 								type stripeLock struct {
 									sync.RWMutex
 									// Padding to avoid multiple locks being on the same cache line.
 									// NOTE(review): 40 bytes presumably rounds the struct up to a 64-byte
 									// cache line on 64-bit platforms — confirm against sizeof(sync.RWMutex).
 									_ [40]byte
 								}
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+								func newStripeSeries(stripeSize int, seriesCallback SeriesLifecycleCallback) *stripeSeries {
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-29 23:12:43 -08:00
+									s := &stripeSeries{
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+										size:                    stripeSize,
 										series:                  make([]map[uint64]*memSeries, stripeSize),
 										hashes:                  make([]seriesHashmap, stripeSize),
 										locks:                   make([]stripeLock, stripeSize),
 										seriesLifecycleCallback: seriesCallback,
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-29 23:12:43 -08:00
+									}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
 									for i := range s.series {
 										s.series[i] = map[uint64]*memSeries{}
 									}
 									for i := range s.hashes {
 										s.hashes[i] = seriesHashmap{}
 									}
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
+									return s
-												Add new interfaces and skeleton

											
										
										
											2016-12-04 04:16:11 -08:00
+								}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+								// gc garbage collects old chunks that are strictly before mint and removes
 								// series entirely that have no chunks left.
 								func (s *stripeSeries) gc(mint int64) (map[uint64]struct{}, int) {
 									var (
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+										deleted            = map[uint64]struct{}{}
 										deletedForCallback = []labels.Labels{}
 										rmChunks           = 0
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									)
 									// Run through all series and truncate old chunks. Mark those with no
-												Filter WAL data in Head, misc fixes

											
										
										
											2017-09-06 07:20:37 -07:00
+									// chunks left as deleted and store their ID.
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-29 23:12:43 -08:00
+									for i := 0; i < s.size; i++ {
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+										s.locks[i].Lock()
 										for hash, all := range s.hashes[i] {
 											for _, series := range all {
-												Fix various races

											
										
										
											2017-09-07 23:48:19 -07:00
+												series.Lock()
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+												rmChunks += series.truncateChunksBefore(mint)
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+												if len(series.mmappedChunks) > 0 || series.headChunk != nil || series.pendingCommit {
-												Fix various races

											
										
										
											2017-09-07 23:48:19 -07:00
+													series.Unlock()
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+													continue
 												}
 												// The series is gone entirely. We need to keep the series lock
 												// and make sure we have acquired the stripe locks for hash and ID of the
 												// series alike.
 												// If we don't hold them all, there's a very small chance that a series receives
 												// samples again while we are half-way into deleting it.
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-29 23:12:43 -08:00
+												j := int(series.ref) & (s.size - 1)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
 												if i != j {
 													s.locks[j].Lock()
 												}
 												deleted[series.ref] = struct{}{}
 												s.hashes[i].del(hash, series.lset)
 												delete(s.series[j], series.ref)
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+												deletedForCallback = append(deletedForCallback, series.lset)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
 												if i != j {
 													s.locks[j].Unlock()
 												}
-												Fix various races

											
										
										
											2017-09-07 23:48:19 -07:00
+												series.Unlock()
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+											}
 										}
 										s.locks[i].Unlock()
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
 										s.seriesLifecycleCallback.PostDeletion(deletedForCallback...)
 										deletedForCallback = deletedForCallback[:0]
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									}
 									return deleted, rmChunks
 								}
 								func (s *stripeSeries) getByID(id uint64) *memSeries {
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-29 23:12:43 -08:00
+									i := id & uint64(s.size-1)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
 									s.locks[i].RLock()
 									series := s.series[i][id]
 									s.locks[i].RUnlock()
 									return series
 								}
 								func (s *stripeSeries) getByHash(hash uint64, lset labels.Labels) *memSeries {
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-29 23:12:43 -08:00
+									i := hash & uint64(s.size-1)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
 									s.locks[i].RLock()
 									series := s.hashes[i].get(hash, lset)
 									s.locks[i].RUnlock()
 									return series
 								}
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+								func (s *stripeSeries) getOrSet(hash uint64, series *memSeries) (*memSeries, bool, error) {
 									// PreCreation is called here to avoid calling it inside the lock.
 									// It is not necessary to call it just before creating a series,
 									// rather it gives a 'hint' whether to create a series or not.
 									createSeriesErr := s.seriesLifecycleCallback.PreCreation(series.lset)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+									i := hash & uint64(s.size-1)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									s.locks[i].Lock()
 									if prev := s.hashes[i].get(hash, series.lset); prev != nil {
-												Add missing unlock on early return

											
										
										
											2017-09-18 02:23:22 -07:00
+										s.locks[i].Unlock()
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+										return prev, false, nil
 									}
 									if createSeriesErr == nil {
 										s.hashes[i].set(hash, series)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+									}
 									s.locks[i].Unlock()
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+									if createSeriesErr != nil {
 										// The callback prevented creation of series.
 										return nil, false, createSeriesErr
 									}
 									// Setting the series in the s.hashes marks the creation of series
 									// as any further calls to this methods would return that series.
 									s.seriesLifecycleCallback.PostCreation(series.lset)
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-29 23:12:43 -08:00
+									i = series.ref & uint64(s.size-1)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
 									s.locks[i].Lock()
 									s.series[i][series.ref] = series
 									s.locks[i].Unlock()
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
+									return series, true, nil
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 02:45:18 -07:00
+								}
-												Move BufferedSeriesIterator in own package

This functionality is useful for a lot of clients but not relevant to
the TSDB's core features.

											
										
										
											2017-03-24 02:20:39 -07:00
// sample is a single timestamp/value pair.
type sample struct {
	t int64
	v float64
}

// T returns the timestamp of the sample.
func (s sample) T() int64 {
	return s.t
}

// V returns the value of the sample.
func (s sample) V() float64 {
	return s.v
}
-												Fix various races

											
										
										
											2017-09-07 23:48:19 -07:00
+								// memSeries is the in-memory representation of a series. None of its methods
-												Fix typos in comments (#254)

a the -> the
timestmap -> timestamp
badded -> padded
its -> it is
callers -> caller's
											
										
										
											2018-01-13 09:51:50 -08:00
+								// are goroutine safe and it is the caller's responsibility to lock it.
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
+								type memSeries struct {
-												tsdb: fix races around head chunks (#6985)

* tsdb: fix races around head chunks

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-03-16 05:59:22 -07:00
+									sync.RWMutex
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceeding ones while samples
are added

											
										
										
											2017-01-09 07:51:39 -08:00
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									ref           uint64
 									lset          labels.Labels
 									mmappedChunks []*mmappedChunk
 									headChunk     *memChunk
 									chunkRange    int64
 									firstChunkID  int
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceeding ones while samples
are added

											
										
										
											2017-01-09 07:51:39 -08:00
-												Fix race condition between gc and committing (#378)

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>
											
										
										
											2018-09-17 09:58:42 -07:00
+									nextAt        int64 // Timestamp at which to cut the next chunk.
 									sampleBuf     [4]sample
 									pendingCommit bool // Whether there are samples waiting to be committed to this series.
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceeding ones while samples
are added

											
										
										
											2017-01-09 07:51:39 -08:00
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+									app chunkenc.Appender // Current appender for the chunk.
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									memChunkPool *sync.Pool
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+									txs *txRing
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceding ones while samples
are added

											
										
										
											2017-01-09 07:51:39 -08:00
+								}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+								func newMemSeries(lset labels.Labels, id uint64, chunkRange int64, memChunkPool *sync.Pool) *memSeries {
-												Dont store stones in head, delete samples directly

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-01-08 09:08:41 -08:00
+									s := &memSeries{
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										lset:         lset,
 										ref:          id,
 										chunkRange:   chunkRange,
 										nextAt:       math.MinInt64,
 										txs:          newTxRing(4),
 										memChunkPool: memChunkPool,
-												Dont store stones in head, delete samples directly

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-01-08 09:08:41 -08:00
+									}
 									return s
 								}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+								func (s *memSeries) minTime() int64 {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									if len(s.mmappedChunks) > 0 {
 										return s.mmappedChunks[0].minTime
-												Fix crash when a series has no block

											
										
										
											2018-02-07 05:43:21 -08:00
+									}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									if s.headChunk != nil {
 										return s.headChunk.minTime
 									}
 									return math.MinInt64
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+								}
 								func (s *memSeries) maxTime() int64 {
-												Fix crash when a series has no block

											
										
										
											2018-02-07 05:43:21 -08:00
+									c := s.head()
 									if c == nil {
 										return math.MinInt64
 									}
 									return c.maxTime
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+								}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+								func (s *memSeries) cutNewHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDiskMapper) *memChunk {
 									s.mmapCurrentHeadChunk(chunkDiskMapper)
 									s.headChunk = &memChunk{
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+										chunk:   chunkenc.NewXORChunk(),
-												Improve heuristic to spread chunks across block

											
										
										
											2017-06-07 04:42:53 -07:00
+										minTime: mint,
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
+										maxTime: math.MinInt64,
 									}
-												Compact TSDB head chunks after being cut, to reduce inuse memory

Signed-off-by: Marco Pracucci <marco@pracucci.com>

											
										
										
											2020-01-23 23:44:52 -08:00
-												Ensure near-empty chunks end at correct boundary

We were determining a chunk's end time once it was one quarter full to
compute it so all chunks have uniform number of samples.
This accidentally skipped the case where series started near the end of
a chunk range/block and never reached that threshold. As a result they
got persisted but were continued across the range.

This resulted in corrupted persisted data.

											
										
										
											2017-10-25 00:32:06 -07:00
+									// Set upper bound on when the next chunk must be started. An earlier timestamp
 									// may be chosen dynamically at a later point.
-												no overlapping on compaction when an existing block is not within default boundaries. (#461)

closes https://github.com/prometheus/prometheus/issues/4643

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-12-04 02:30:49 -08:00
+									s.nextAt = rangeForTimestamp(mint, s.chunkRange)
-												Ensure near-empty chunks end at correct boundary

We were determining a chunk's end time once it was one quarter full to
compute it so all chunks have uniform number of samples.
This accidentally skipped the case where series started near the end of
a chunk range/block and never reached that threshold. As a result they
got persisted but were continued across the range.

This resulted in corrupted persisted data.

											
										
										
											2017-10-25 00:32:06 -07:00
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									app, err := s.headChunk.chunk.Appender()
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
+									if err != nil {
 										panic(err)
 									}
 									s.app = app
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									return s.headChunk
 								}
 								func (s *memSeries) mmapCurrentHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper) {
 									if s.headChunk == nil {
 										// There is no head chunk, so nothing to m-map here.
 										return
 									}
 									chunkRef, err := chunkDiskMapper.WriteChunk(s.ref, s.headChunk.minTime, s.headChunk.maxTime, s.headChunk.chunk)
 									if err != nil {
 										if err != chunks.ErrChunkDiskMapperClosed {
 											panic(err)
 										}
 									}
 									s.mmappedChunks = append(s.mmappedChunks, &mmappedChunk{
 										ref:        chunkRef,
 										numSamples: uint16(s.headChunk.chunk.NumSamples()),
 										minTime:    s.headChunk.minTime,
 										maxTime:    s.headChunk.maxTime,
 									})
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
+								}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+								// appendable checks whether the given sample is valid for appending to the series.
 								func (s *memSeries) appendable(t int64, v float64) error {
-												Add tests for GC and chunk truncation

											
										
										
											2017-09-01 05:38:49 -07:00
+									c := s.head()
 									if c == nil {
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+										return nil
 									}
 									if t > c.maxTime {
 										return nil
 									}
 									if t < c.maxTime {
-												Make TSDB use storage errors

This fixes #6992, which was introduced by #6777. There was an
intermediate component which translated TSDB errors into storage errors,
but that component was deleted and this bug went unnoticed, until we
were looking at the Prombench results. Without this, scrape will fail
instead of dropping samples or using "Add" when the series have been
garbage collected.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-16 14:52:02 -07:00
+										return storage.ErrOutOfOrderSample
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+									}
 									// We are allowing exact duplicates as we can encounter them in valid cases
 									// like federation and erroring out at that time would be extremely noisy.
-												Use sampleBuf instead of maintaining lastValue. (#444)

This cuts the size of memSize by 8B.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2018-11-14 06:02:32 -08:00
+									if math.Float64bits(s.sampleBuf[3].v) != math.Float64bits(v) {
-												Make TSDB use storage errors

This fixes #6992, which was introduced by #6777. There was an
intermediate component which translated TSDB errors into storage errors,
but that component was deleted and this bug went unnoticed, until we
were looking at the Prombench results. Without this, scrape will fail
instead of dropping samples or using "Add" when the series have been
garbage collected.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-16 14:52:02 -07:00
+										return storage.ErrDuplicateSampleForTimestamp
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+									}
 									return nil
 								}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+								// chunk returns the chunk for the chunk id from memory or by m-mapping it from the disk.
 								// If garbageCollect is true, it means that the returned *memChunk
 								// (and not the chunkenc.Chunk inside it) can be garbage collected after it's usage.
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 02:03:23 -07:00
+								func (s *memSeries) chunk(id int, chunkDiskMapper *chunks.ChunkDiskMapper) (chunk *memChunk, garbageCollect bool, err error) {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									// ix represents the index of chunk in the s.mmappedChunks slice. The chunk id's are
 									// incremented by 1 when new chunk is created, hence (id - firstChunkID) gives the slice index.
 									// The max index for the s.mmappedChunks slice can be len(s.mmappedChunks)-1, hence if the ix
 									// is len(s.mmappedChunks), it represents the next chunk, which is the head chunk.
-												Add tests for GC and chunk truncation

											
										
										
											2017-09-01 05:38:49 -07:00
+									ix := id - s.firstChunkID
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									if ix < 0 || ix > len(s.mmappedChunks) {
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 02:03:23 -07:00
+										return nil, false, storage.ErrNotFound
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages:
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[M-mapped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md) - the main difference is that the chunk for m-mapping now also includes the series reference, because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index, which includes the offsets for the chunks in the chunks file - for example, chunks of a series ID have offsets 200, 500 etc. in the chunk files.
In case of m-mapped chunks, the offsets are stored in memory and accessed from there. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above, by matching the series ID present in the chunk header and the offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									}
 									if ix == len(s.mmappedChunks) {
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 02:03:23 -07:00
+										if s.headChunk == nil {
 											return nil, false, errors.New("invalid head chunk")
 										}
 										return s.headChunk, false, nil
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages:
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[M-mapped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md) - the main difference is that the chunk for m-mapping now also includes the series reference, because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index, which includes the offsets for the chunks in the chunks file - for example, chunks of a series ID have offsets 200, 500 etc. in the chunk files.
In case of m-mapped chunks, the offsets are stored in memory and accessed from there. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above, by matching the series ID present in the chunk header and the offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									}
 									chk, err := chunkDiskMapper.Chunk(s.mmappedChunks[ix].ref)
 									if err != nil {
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 02:03:23 -07:00
+										if _, ok := err.(*chunks.CorruptionErr); ok {
 											panic(err)
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages:
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[M-mapped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md) - the main difference is that the chunk for m-mapping now also includes the series reference, because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index, which includes the offsets for the chunks in the chunks file - for example, chunks of a series ID have offsets 200, 500 etc. in the chunk files.
In case of m-mapped chunks, the offsets are stored in memory and accessed from there. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above, by matching the series ID present in the chunk header and the offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										}
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 02:03:23 -07:00
+										return nil, false, err
-												Add tests for GC and chunk truncation

											
										
										
											2017-09-01 05:38:49 -07:00
+									}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages:
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[M-mapped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md) - the main difference is that the chunk for m-mapping now also includes the series reference, because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index, which includes the offsets for the chunks in the chunks file - for example, chunks of a series ID have offsets 200, 500 etc. in the chunk files.
In case of m-mapped chunks, the offsets are stored in memory and accessed from there. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above, by matching the series ID present in the chunk header and the offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									mc := s.memChunkPool.Get().(*memChunk)
 									mc.chunk = chk
 									mc.minTime = s.mmappedChunks[ix].minTime
 									mc.maxTime = s.mmappedChunks[ix].maxTime
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 02:03:23 -07:00
+									return mc, true, nil
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+								}
 								// chunkID converts pos, an index into the series' chunks, into a chunk ID
 								// by shifting it by s.firstChunkID.
 								func (s *memSeries) chunkID(pos int) int {
 									return s.firstChunkID + pos
 								}
 								// truncateChunksBefore removes all chunks from the series that have not timestamp
 								// at or after mint. Chunk IDs remain unchanged.
-												Add various metrics

											
										
										
											2017-08-30 08:38:25 -07:00
+								func (s *memSeries) truncateChunksBefore(mint int64) (removed int) {
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
amangement and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+									var k int
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									if s.headChunk != nil && s.headChunk.maxTime < mint {
 										// If head chunk is truncated, we can truncate all mmapped chunks.
 										k = 1 + len(s.mmappedChunks)
 										s.firstChunkID += k
-												Precalculate memSeries.head

This is read far more than it changes.
This cuts ~14% off walltme and ~27% off CPU for WAL reading.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 06:28:56 -07:00
+										s.headChunk = nil
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										s.mmappedChunks = nil
 										return k
 									}
 									if len(s.mmappedChunks) > 0 {
 										for i, c := range s.mmappedChunks {
 											if c.maxTime >= mint {
 												break
 											}
 											k = i + 1
 										}
 										s.mmappedChunks = append(s.mmappedChunks[:0], s.mmappedChunks[k:]...)
 										s.firstChunkID += k
-												Precalculate memSeries.head

This is read far more than it changes.
This cuts ~14% off walltme and ~27% off CPU for WAL reading.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 06:28:56 -07:00
+									}
-												Add various metrics

											
										
										
											2017-08-30 08:38:25 -07:00
+									return k
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
amangement and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 15:39:17 -07:00
+								}
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+								// append adds the sample (t, v) to the series. The caller also has to provide
-												Do not attempt isolation for appendID == 0

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2020-02-28 17:39:26 -08:00
+								// the appendID for isolation. (The appendID can be zero, which results in no
 								// isolation for this append.)
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages:
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[M-mapped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md) - the main difference is that the chunk for m-mapping now also includes the series reference, because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index, which includes the offsets for the chunks in the chunks file - for example, chunks of a series ID have offsets 200, 500 etc. in the chunk files.
In case of m-mapped chunks, the offsets are stored in memory and accessed from there. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above, by matching the series ID present in the chunk header and the offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+								// It is unsafe to call this concurrently with s.iterator(...) without holding the series lock.
 								func (s *memSeries) append(t int64, v float64, appendID uint64, chunkDiskMapper *chunks.ChunkDiskMapper) (sampleInOrder, chunkCreated bool) {
-												comments about the 120samples const and link to Gorilla papers. (#423)

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-10-23 03:43:06 -07:00
+									// Based on Gorilla white papers this offers near-optimal compression ratio
 									// so anything bigger that this has diminishing returns and increases
 									// the time range within which we have to decompress all samples.
-												Improve heuristic to spread chunks across block

											
										
										
											2017-06-07 04:42:53 -07:00
+									const samplesPerChunk = 120
-												Add tests for GC and chunk truncation

											
										
										
											2017-09-01 05:38:49 -07:00
+									c := s.head()
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
-												Add tests for GC and chunk truncation

											
										
										
											2017-09-01 05:38:49 -07:00
+									if c == nil {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										if len(s.mmappedChunks) > 0 && s.mmappedChunks[len(s.mmappedChunks)-1].maxTime >= t {
 											// Out of order sample. Sample timestamp is already in the mmaped chunks, so ignore it.
 											return false, false
 										}
 										// There is no chunk in this series yet, create the first chunk for the sample.
 										c = s.cutNewHeadChunk(t, chunkDiskMapper)
-												Add various metrics

											
										
										
											2017-08-30 08:38:25 -07:00
+										chunkCreated = true
-												Improve heuristic to spread chunks across block

											
										
										
											2017-06-07 04:42:53 -07:00
+									}
-												wal: parallelize sample processing

											
										
										
											2017-10-07 06:55:11 -07:00
+									numSamples := c.chunk.NumSamples()
-												Ensure near-empty chunks end at correct boundary

We were determining a chunk's end time once it was one quarter full to
compute it so all chunks have uniform number of samples.
This accidentally skipped the case where series started near the end of
a chunk range/block and never reached that threshold. As a result they
got persisted but were continued across the range.

This resulted in corrupted persisted data.

											
										
										
											2017-10-25 00:32:06 -07:00
+									// Out of order sample.
-												Improve heuristic to spread chunks across block

											
										
										
											2017-06-07 04:42:53 -07:00
+									if c.maxTime >= t {
-												Add various metrics

											
										
										
											2017-08-30 08:38:25 -07:00
+										return false, chunkCreated
-												Improve heuristic to spread chunks across block

											
										
										
											2017-06-07 04:42:53 -07:00
+									}
-												Ensure near-empty chunks end at correct boundary

We were determining a chunk's end time once it was one quarter full to
compute it so all chunks have uniform number of samples.
This accidentally skipped the case where series started near the end of
a chunk range/block and never reached that threshold. As a result they
got persisted but were continued across the range.

This resulted in corrupted persisted data.

											
										
										
											2017-10-25 00:32:06 -07:00
+									// If we reach 25% of a chunk's desired sample count, set a definitive time
 									// at which to start the next chunk.
 									// At latest it must happen at the timestamp set when the chunk was cut.
 									if numSamples == samplesPerChunk/4 {
 										s.nextAt = computeChunkEndTime(c.minTime, c.maxTime, s.nextAt)
 									}
 									if t >= s.nextAt {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										c = s.cutNewHeadChunk(t, chunkDiskMapper)
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+										chunkCreated = true
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceeding ones while samples
are added

											
										
										
											2017-01-09 07:51:39 -08:00
+									}
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
+									s.app.Append(t, v)
 									c.maxTime = t
 									s.sampleBuf[0] = s.sampleBuf[1]
 									s.sampleBuf[1] = s.sampleBuf[2]
 									s.sampleBuf[2] = s.sampleBuf[3]
 									s.sampleBuf[3] = sample{t: t, v: v}
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceeding ones while samples
are added

											
										
										
											2017-01-09 07:51:39 -08:00
-												Do not attempt isolation for appendID == 0

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2020-02-28 17:39:26 -08:00
+									if appendID > 0 {
 										s.txs.add(appendID)
 									}
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
-												Add various metrics

											
										
										
											2017-08-30 08:38:25 -07:00
+									return true, chunkCreated
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceding ones while samples
are added

											
										
										
											2017-01-09 07:51:39 -08:00
+								}
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+								// cleanupAppendIDsBelow cleans up older appendIDs. Has to be called after
 								// acquiring lock.
 								func (s *memSeries) cleanupAppendIDsBelow(bound uint64) {
 									// Pure delegation: the series keeps no bookkeeping of its own here and
 									// forwards bound unchanged to its txs field.
 									s.txs.cleanupAppendIDsBelow(bound)
 								}
 								// computeChunkEndTime estimates the end timestamp based on the beginning of a
 								// chunk, its current timestamp and the upper bound up to which we insert data.
-												Improve heuristic to spread chunks across block

											
										
										
											2017-06-07 04:42:53 -07:00
+								// It assumes that the time range is 1/4 full.
 								func computeChunkEndTime(start, cur, max int64) int64 {
 									// span is four times the time covered so far (inclusive of cur), i.e. the
 									// projected length of a full chunk given that it is currently 1/4 full.
 									span := (cur - start + 1) * 4
 									if span <= 0 {
 										// cur lies before start, so no estimate can be made. Dividing here
 										// would panic for span == 0 and yield a timestamp below start for
 										// span < 0; fall back to the upper bound instead.
 										return max
 									}
 									// n is the number of projected chunks that fit between start and max.
 									n := (max - start) / span
 									if n <= 1 {
 										// Zero or one chunk fits (or max precedes start): the chunk may
 										// run up to max.
 										return max
 									}
 									// Split [start, max] into n equally sized chunks and return the end of
 									// the first one.
 									return start + (max-start)/n
 								}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+								// iterator returns a chunk iterator.
 								// It is unsafe to call this concurrently with s.append(...) without holding the series lock.
 								func (s *memSeries) iterator(id int, isoState *isolationState, chunkDiskMapper *chunks.ChunkDiskMapper, it chunkenc.Iterator) chunkenc.Iterator {
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 02:03:23 -07:00
+									c, garbageCollect, err := s.chunk(id, chunkDiskMapper)
 									// TODO(fabxc): Work around! An error will be returns when a querier have retrieved a pointer to a
 									// series's chunk, which got then garbage collected before it got
 									// accessed.  We must ensure to not garbage collect as long as any
 									// readers still hold a reference.
 									if err != nil {
 										return chunkenc.NewNopIterator()
 									}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									defer func() {
 										if garbageCollect {
 											// Set this to nil so that Go GC can collect it after it has been used.
 											// This should be done always at the end.
 											c.chunk = nil
 											s.memChunkPool.Put(c)
 										}
 									}()
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+									ix := id - s.firstChunkID
 									numSamples := c.chunk.NumSamples()
 									stopAfter := numSamples
 									if isoState != nil {
 										totalSamples := 0    // Total samples in this series.
 										previousSamples := 0 // Samples before this chunk.
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+										for j, d := range s.mmappedChunks {
 											totalSamples += int(d.numSamples)
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+											if j < ix {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+												previousSamples += int(d.numSamples)
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+											}
 										}
-												Avoid panic when the headChunk is nil during isolation.

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>

											
										
										
											2020-07-20 08:23:18 -07:00
 										if s.headChunk != nil {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+											totalSamples += s.headChunk.chunk.NumSamples()
 										}
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
 										// Removing the extra transactionIDs that are relevant for samples that
 										// come after this chunk, from the total transactionIDs.
 										appendIDsToConsider := s.txs.txIDCount - (totalSamples - (previousSamples + numSamples))
 										// Iterate over the appendIDs, find the first one that the isolation state says not
 										// to return.
 										it := s.txs.iterator()
 										for index := 0; index < appendIDsToConsider; index++ {
 											appendID := it.At()
 											if appendID <= isoState.maxAppendID { // Easy check first.
 												if _, ok := isoState.incompleteAppends[appendID]; !ok {
 													it.Next()
 													continue
 												}
 											}
 											stopAfter = numSamples - (appendIDsToConsider - index)
 											if stopAfter < 0 {
 												stopAfter = 0 // Stopped in a previous chunk.
 											}
 											break
 										}
 									}
 									if stopAfter == 0 {
 										return chunkenc.NewNopIterator()
 									}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+									if id-s.firstChunkID < len(s.mmappedChunks) {
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+										if stopAfter == numSamples {
 											return c.chunk.Iterator(it)
 										}
 										if msIter, ok := it.(*stopIterator); ok {
 											msIter.Iterator = c.chunk.Iterator(msIter.Iterator)
 											msIter.i = -1
 											msIter.stopAfter = stopAfter
 											return msIter
 										}
 										return &stopIterator{
 											Iterator:  c.chunk.Iterator(it),
 											i:         -1,
 											stopAfter: stopAfter,
 										}
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
+									}
-												Misc fixes (#285)

* Fix typo in head.go

pralellize -> paralellize

* Remove commented out code

It's dead code, remove it.

* Correct reference to sample buffer

											
										
										
											2018-02-21 07:38:59 -08:00
+									// Serve the last 4 samples for the last chunk from the sample buffer
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 09:34:54 -07:00
+									// as their compressed bytes may be mutated by added samples.
-												Reuse Chunk Iterator (#642)

* Reset method for chunkenc.Iterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Reset method only for XORIterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Use Reset(...) in querier.go

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Reuse deletedIterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Another way of reusing chunk iterators

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Unexport xorIterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix memSeries.iterator(...)

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add some comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-07-09 02:49:34 -07:00
+									if msIter, ok := it.(*memSafeIterator); ok {
 										msIter.Iterator = c.chunk.Iterator(msIter.Iterator)
 										msIter.i = -1
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+										msIter.total = numSamples
 										msIter.stopAfter = stopAfter
-												Reuse Chunk Iterator (#642)

* Reset method for chunkenc.Iterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Reset method only for XORIterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Use Reset(...) in querier.go

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Reuse deletedIterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Another way of reusing chunk iterators

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Unexport xorIterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix memSeries.iterator(...)

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add some comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-07-09 02:49:34 -07:00
+										msIter.buf = s.sampleBuf
 										return msIter
 									}
 									return &memSafeIterator{
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+										stopIterator: stopIterator{
 											Iterator:  c.chunk.Iterator(it),
 											i:         -1,
 											stopAfter: stopAfter,
 										},
 										total: numSamples,
 										buf:   s.sampleBuf,
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceeding ones while samples
are added

											
										
										
											2017-01-09 07:51:39 -08:00
+									}
 								}
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
+								func (s *memSeries) head() *memChunk {
-												Precalculate memSeries.head

This is read far more than it changes.
This cuts ~14% off walltme and ~27% off CPU for WAL reading.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 06:28:56 -07:00
+									return s.headChunk
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
+								}
 								type memChunk struct {
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+									chunk            chunkenc.Chunk
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 04:02:38 -08:00
+									minTime, maxTime int64
 								}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
+								// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
-												Make interval overlap comparisons more explicit

Blocks are half-open intervals [a, b), while all other intervals
(chunks, head, ...) are closed intervals [a, b].

Make that distinction explicit by defining `OverlapsClosedInterval()`
methods for blocks and chunks, and using them in place of the more
generic `intervalOverlap()` function.

This change also fixes `db.Querier()` and `db.Delete()`, which could
previously return one extraneous block at the end of the specified
interval.

Signed-off-by: Benoît Knecht <benoit.knecht@fsfe.org>

											
										
										
											2018-07-02 01:23:36 -07:00
+								func (mc *memChunk) OverlapsClosedInterval(mint, maxt int64) bool {
 									return mc.minTime <= maxt && mint <= mc.maxTime
 								}
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+								type stopIterator struct {
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
+									chunkenc.Iterator
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceding ones while samples
are added.

											
										
										
											2017-01-09 07:51:39 -08:00
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+									i, stopAfter int
 								}
 								// Next reports whether another sample is available. It returns false without
 								// touching the embedded iterator once i+1 has reached stopAfter; otherwise it
 								// records the new position in i and returns the embedded iterator's Next result.
 								func (it *stopIterator) Next() bool {
 									next := it.i + 1
 									if next >= it.stopAfter {
 										return false
 									}
 									it.i = next
 									return it.Iterator.Next()
 								}
 								type memSafeIterator struct {
 									stopIterator
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceding ones while samples
are added.

											
										
										
											2017-01-09 07:51:39 -08:00
+									total int
 									buf   [4]sample
 								}
 								func (it *memSafeIterator) Next() bool {
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 11:22:27 -08:00
+									if it.i+1 >= it.stopAfter {
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceding ones while samples
are added.

											
										
										
											2017-01-09 07:51:39 -08:00
+										return false
 									}
 									it.i++
 									if it.total-it.i > 4 {
 										return it.Iterator.Next()
 									}
 									return true
 								}
 								// At returns the timestamp and value at the current position. While total-i is
 								// greater than 4 the sample comes from the embedded iterator; otherwise it is
 								// read from buf at index 4-(total-i).
 								func (it *memSafeIterator) At() (int64, float64) {
 									remaining := it.total - it.i
 									if remaining > 4 {
 										return it.Iterator.At()
 									}
 									buffered := it.buf[4-remaining]
 									return buffered.t, buffered.v
 								}
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 06:34:49 -08:00
 								// stringset is a set of strings backed by a map with empty-struct values.
 								type stringset map[string]struct{}
 								// set adds s to the set; adding a member that is already present changes nothing.
 								func (ss stringset) set(s string) {
 									var present struct{}
 									ss[s] = present
 								}
 								// String returns the members in sorted order, joined by commas.
 								func (ss stringset) String() string {
 									members := ss.slice()
 									return strings.Join(members, ",")
 								}
 								// slice returns the members as a newly allocated slice in ascending order.
 								// The result is non-nil even when the set is empty.
 								func (ss stringset) slice() []string {
 									members := make([]string, len(ss))
 									idx := 0
 									for member := range ss {
 										members[idx] = member
 										idx++
 									}
 									sort.Strings(members)
 									return members
 								}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In the case of m-mapped chunks, the offsets are stored in memory and accessed from there. During WAL replay, these offsets are restored by iterating over all m-mapped chunks, as stated above, and matching the series ID present in each chunk header with the offset of that chunk in its file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 08:30:00 -07:00
 								// mmappedChunk holds the ref, the sample count and the closed time range
 								// [minTime, maxTime] recorded for a chunk.
 								type mmappedChunk struct {
 									ref        uint64
 									numSamples uint16
 									minTime    int64
 									maxTime    int64
 								}
 								// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
 								// Both ends of the interval are inclusive.
 								func (mc *mmappedChunk) OverlapsClosedInterval(mint, maxt int64) bool {
 									if mc.minTime > maxt {
 										// The chunk starts after the interval has ended.
 										return false
 									}
 									return mint <= mc.maxTime
 								}
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 06:22:08 -07:00
 								// SeriesLifecycleCallback specifies a list of callbacks that will be called during the lifecycle of a series.
 								// It is always a no-op in Prometheus and is mainly meant for external users who import TSDB.
 								// All the callbacks should be safe to be called concurrently.
 								// It is up to the user to implement soft or hard consistency by making the callbacks
 								// atomic or non-atomic. Atomic callbacks can cause performance degradation.
 								type SeriesLifecycleCallback interface {
 									// PreCreation is called before creating a series to indicate if the series can be created.
 									// A non-nil error means the series should not be created.
 									PreCreation(labels.Labels) error
 									// PostCreation is called after creating a series to indicate the creation of the series.
 									PostCreation(labels.Labels)
 									// PostDeletion is called after the deletion of series; it accepts the labels of any number of series.
 									PostDeletion(...labels.Labels)
 								}
 								// noopSeriesLifecycleCallback provides PreCreation, PostCreation and PostDeletion
 								// methods that all do nothing.
 								type noopSeriesLifecycleCallback struct{}
 								// PreCreation always returns nil.
 								func (noopSeriesLifecycleCallback) PreCreation(labels.Labels) error {
 									return nil
 								}
 								// PostCreation does nothing.
 								func (noopSeriesLifecycleCallback) PostCreation(labels.Labels) {
 								}
 								// PostDeletion does nothing.
 								func (noopSeriesLifecycleCallback) PostDeletion(...labels.Labels) {
 								}