// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
	"context"
	"crypto/rand"
	"fmt"
	"io"
	"math"
	"os"
	"path/filepath"
	"sort"
	"time"

	"github.com/go-kit/kit/log"
	"github.com/go-kit/kit/log/level"
	"github.com/oklog/ulid"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/prometheus/prometheus/pkg/labels"
	"github.com/prometheus/prometheus/storage"
	"github.com/prometheus/prometheus/tsdb/chunkenc"
	"github.com/prometheus/prometheus/tsdb/chunks"
	tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
	"github.com/prometheus/prometheus/tsdb/fileutil"
	"github.com/prometheus/prometheus/tsdb/index"
	"github.com/prometheus/prometheus/tsdb/tombstones"
)

// ExponentialBlockRanges returns steps time ranges, starting at minSize and
// growing by a factor of stepSize at each step.
func ExponentialBlockRanges(minSize int64, steps, stepSize int) []int64 {
	ranges := make([]int64, 0, steps)
	curRange := minSize
	for i := 0; i < steps; i++ {
		ranges = append(ranges, curRange)
		curRange = curRange * int64(stepSize)
	}

	return ranges
}
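// As a rough illustration (the 2h base, 3 steps and step size 5 below mirror
// common TSDB defaults but are an assumption of this example, not of the
// function itself):
//
//	ExponentialBlockRanges(2*60*60*1000, 3, 5)
//	// => [7200000 36000000 180000000], i.e. 2h, 10h and 50h in milliseconds.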
// Compactor provides compaction against an underlying storage
// of time series data.
type Compactor interface {
	// Plan returns a set of directories that can be compacted concurrently.
	// The directories can be overlapping.
	// Results returned when compactions are in progress are undefined.
	Plan(dir string) ([]string, error)

	// Write persists a Block into a directory.
	// No Block is written when the resulting Block has 0 samples; an empty ulid.ULID{} is returned instead.
	Write(dest string, b BlockReader, mint, maxt int64, parent *BlockMeta) (ulid.ULID, error)

	// Compact runs compaction against the provided directories. Must
	// only be called concurrently with results of Plan().
	// Can optionally pass a list of already open blocks,
	// to avoid having to reopen them.
	// When the resulting Block has 0 samples:
	//  * No block is written.
	//  * The source dirs are marked Deletable.
	//  * An empty ulid.ULID{} is returned.
	Compact(dest string, dirs []string, open []*Block) (ulid.ULID, error)
}

// LeveledCompactor implements the Compactor interface.
type LeveledCompactor struct {
	metrics   *compactorMetrics
	logger    log.Logger
	ranges    []int64
	chunkPool chunkenc.Pool
	ctx       context.Context
}

type compactorMetrics struct {
	ran               prometheus.Counter
	populatingBlocks  prometheus.Gauge
	overlappingBlocks prometheus.Counter
	duration          prometheus.Histogram
	chunkSize         prometheus.Histogram
	chunkSamples      prometheus.Histogram
	chunkRange        prometheus.Histogram
}

func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics {
	m := &compactorMetrics{}

	m.ran = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_compactions_total",
		Help: "Total number of compactions that were executed for the partition.",
	})
	m.populatingBlocks = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "prometheus_tsdb_compaction_populating_block",
		Help: "Set to 1 when a block is currently being written to the disk.",
	})
	m.overlappingBlocks = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_vertical_compactions_total",
		Help: "Total number of compactions done on overlapping blocks.",
	})
	m.duration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "prometheus_tsdb_compaction_duration_seconds",
		Help:    "Duration of compaction runs.",
		Buckets: prometheus.ExponentialBuckets(1, 2, 10),
	})
	m.chunkSize = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "prometheus_tsdb_compaction_chunk_size_bytes",
		Help:    "Final size of chunks on their first compaction.",
		Buckets: prometheus.ExponentialBuckets(32, 1.5, 12),
	})
	m.chunkSamples = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "prometheus_tsdb_compaction_chunk_samples",
		Help:    "Final number of samples on their first compaction.",
		Buckets: prometheus.ExponentialBuckets(4, 1.5, 12),
	})
	m.chunkRange = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "prometheus_tsdb_compaction_chunk_range_seconds",
		Help:    "Final time range of chunks on their first compaction.",
		Buckets: prometheus.ExponentialBuckets(100, 4, 10),
	})

	if r != nil {
		r.MustRegister(
			m.ran,
			m.populatingBlocks,
			m.overlappingBlocks,
			m.duration,
			m.chunkRange,
			m.chunkSamples,
			m.chunkSize,
		)
	}
	return m
}

// NewLeveledCompactor returns a LeveledCompactor.
func NewLeveledCompactor(ctx context.Context, r prometheus.Registerer, l log.Logger, ranges []int64, pool chunkenc.Pool) (*LeveledCompactor, error) {
	if len(ranges) == 0 {
		return nil, errors.Errorf("at least one range must be provided")
	}
	if pool == nil {
		pool = chunkenc.NewPool()
	}
	if l == nil {
		l = log.NewNopLogger()
	}
	return &LeveledCompactor{
		ranges:    ranges,
		chunkPool: pool,
		logger:    l,
		metrics:   newCompactorMetrics(r),
		ctx:       ctx,
	}, nil
}

type dirMeta struct {
	dir  string
	meta *BlockMeta
}

// Plan returns a list of compactable blocks in the provided directory.
func (c *LeveledCompactor) Plan(dir string) ([]string, error) {
	dirs, err := blockDirs(dir)
	if err != nil {
		return nil, err
	}
	if len(dirs) < 1 {
		return nil, nil
	}

	var dms []dirMeta
	for _, dir := range dirs {
		meta, _, err := readMetaFile(dir)
		if err != nil {
			return nil, err
		}
		dms = append(dms, dirMeta{dir, meta})
	}
	return c.plan(dms)
}
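// For intuition on the planning below, a sketch assuming configured ranges of
// [20, 60, 180] (illustrative values, not defaults): given blocks covering
// [0-20), [20-40), [40-60) and [60-80), plan first sets aside the freshest
// block [60-80) and then selects the dirs of [0-20), [20-40) and [40-60),
// because together they span a full 60-wide range and can become one block.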
func (c *LeveledCompactor) plan(dms []dirMeta) ([]string, error) {
	sort.Slice(dms, func(i, j int) bool {
		return dms[i].meta.MinTime < dms[j].meta.MinTime
	})

	res := c.selectOverlappingDirs(dms)
	if len(res) > 0 {
		return res, nil
	}
	// No overlapping blocks, do compaction the usual way.
	// We do not include the recently created block with max(minTime), i.e. the block which was just created from the WAL.
	// This gives users a window of a full block size to back up new data piece-wise without having to care about data overlap.
	dms = dms[:len(dms)-1]

	for _, dm := range c.selectDirs(dms) {
		res = append(res, dm.dir)
	}
	if len(res) > 0 {
		return res, nil
	}

	// Compact any blocks with big enough time range that have >5% tombstones.
	for i := len(dms) - 1; i >= 0; i-- {
		meta := dms[i].meta
		if meta.MaxTime-meta.MinTime < c.ranges[len(c.ranges)/2] {
			break
		}
		if float64(meta.Stats.NumTombstones)/float64(meta.Stats.NumSeries+1) > 0.05 {
			return []string{dms[i].dir}, nil
		}
	}

	return nil, nil
}

// selectDirs returns the dir metas that should be compacted into a single new block.
// If only a single block range is configured, the result is always nil.
func (c *LeveledCompactor) selectDirs(ds []dirMeta) []dirMeta {
	if len(c.ranges) < 2 || len(ds) < 1 {
		return nil
	}

	highTime := ds[len(ds)-1].meta.MinTime

	for _, iv := range c.ranges[1:] {
		parts := splitByRange(ds, iv)
		if len(parts) == 0 {
			continue
		}

	Outer:
		for _, p := range parts {
			// Do not select the range if it has a block whose compaction failed.
			for _, dm := range p {
				if dm.meta.Compaction.Failed {
					continue Outer
				}
			}

			mint := p[0].meta.MinTime
			maxt := p[len(p)-1].meta.MaxTime
			// Pick the range of blocks if it spans the full range (potentially with gaps)
			// or is before the most recent block.
			// This ensures we don't compact blocks prematurely when another one of the same
			// size still fits in the range.
			if (maxt-mint == iv || maxt <= highTime) && len(p) > 1 {
				return p
			}
		}
	}

	return nil
}

// selectOverlappingDirs returns all dirs with overlapping time ranges.
// It expects input sorted by mint and returns the overlapping dirs in the same order as received.
func (c *LeveledCompactor) selectOverlappingDirs(ds []dirMeta) []string {
	if len(ds) < 2 {
		return nil
	}
	var overlappingDirs []string
	globalMaxt := ds[0].meta.MaxTime
	for i, d := range ds[1:] {
		if d.meta.MinTime < globalMaxt {
			if len(overlappingDirs) == 0 {
				// When it is the first overlap, we need to add the previous dir as well.
				overlappingDirs = append(overlappingDirs, ds[i].dir)
			}
			overlappingDirs = append(overlappingDirs, d.dir)
		} else if len(overlappingDirs) > 0 {
			break
		}
		if d.meta.MaxTime > globalMaxt {
			globalMaxt = d.meta.MaxTime
		}
	}
	return overlappingDirs
}
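// For example (illustrative): for sorted blocks covering [0-20), [10-30) and
// [30-40), selectOverlappingDirs returns the dirs of the first two blocks:
// [10-30) starts before the running max end time of 20, while [30-40) does
// not overlap and ends the scan.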
// splitByRange splits the directories by the time range. The range sequence starts at 0.
//
// For example, if we have blocks [0-10, 10-20, 50-60, 90-100] and the split range tr is 30
// it returns [0-10, 10-20], [50-60], [90-100].
func splitByRange(ds []dirMeta, tr int64) [][]dirMeta {
	var splitDirs [][]dirMeta

	for i := 0; i < len(ds); {
		var (
			group []dirMeta
			t0    int64
			m     = ds[i].meta
		)
		// Compute start of aligned time range of size tr closest to the current block's start.
		if m.MinTime >= 0 {
			t0 = tr * (m.MinTime / tr)
		} else {
			t0 = tr * ((m.MinTime - tr + 1) / tr)
		}
		// Skip blocks that don't fall into the range. This can happen via misalignment or
		// by being a multiple of the intended range.
		if m.MaxTime > t0+tr {
			i++
			continue
		}

		// Add all dirs to the current group that are within [t0, t0+tr].
		for ; i < len(ds); i++ {
			// Either the block falls into the next range or doesn't fit at all (checked above).
			if ds[i].meta.MaxTime > t0+tr {
				break
			}
			group = append(group, ds[i])
		}

		if len(group) > 0 {
			splitDirs = append(splitDirs, group)
		}
	}

	return splitDirs
}

func compactBlockMetas(uid ulid.ULID, blocks ...*BlockMeta) *BlockMeta {
	res := &BlockMeta{
		ULID:    uid,
		MinTime: blocks[0].MinTime,
	}

	sources := map[ulid.ULID]struct{}{}
	// For overlapping blocks, the max time can be
	// in any block, so we track it globally.
	maxt := int64(math.MinInt64)

	for _, b := range blocks {
		if b.MaxTime > maxt {
			maxt = b.MaxTime
		}
		if b.Compaction.Level > res.Compaction.Level {
			res.Compaction.Level = b.Compaction.Level
		}
		for _, s := range b.Compaction.Sources {
			sources[s] = struct{}{}
		}
		res.Compaction.Parents = append(res.Compaction.Parents, BlockDesc{
			ULID:    b.ULID,
			MinTime: b.MinTime,
			MaxTime: b.MaxTime,
		})
	}
	res.Compaction.Level++

	for s := range sources {
		res.Compaction.Sources = append(res.Compaction.Sources, s)
	}
	sort.Slice(res.Compaction.Sources, func(i, j int) bool {
		return res.Compaction.Sources[i].Compare(res.Compaction.Sources[j]) < 0
	})

	res.MaxTime = maxt
	return res
}
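// For example (illustrative): compacting two level-1 blocks with sources {A}
// and {B} produces a level-2 block whose sources are the union {A, B} and
// whose parents list both input blocks together with their time ranges.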
// Compact creates a new block in the compactor's directory from the blocks in the
// provided directories.
func (c *LeveledCompactor) Compact(dest string, dirs []string, open []*Block) (uid ulid.ULID, err error) {
	var (
		blocks []BlockReader
		bs     []*Block
		metas  []*BlockMeta
		uids   []string
	)
	start := time.Now()

	for _, d := range dirs {
		meta, _, err := readMetaFile(d)
		if err != nil {
			return uid, err
		}

		var b *Block

		// Use already open blocks if we can, to avoid
		// having the index data in memory twice.
		for _, o := range open {
			if meta.ULID == o.Meta().ULID {
				b = o
				break
			}
		}

		if b == nil {
			var err error
			b, err = OpenBlock(c.logger, d, c.chunkPool)
			if err != nil {
				return uid, err
			}
			defer b.Close()
		}

		metas = append(metas, meta)
		blocks = append(blocks, b)
		bs = append(bs, b)
		uids = append(uids, meta.ULID.String())
	}

	uid = ulid.MustNew(ulid.Now(), rand.Reader)

	meta := compactBlockMetas(uid, metas...)
	err = c.write(dest, meta, blocks...)
	if err == nil {
		if meta.Stats.NumSamples == 0 {
			for _, b := range bs {
				b.meta.Compaction.Deletable = true
				n, err := writeMetaFile(c.logger, b.dir, &b.meta)
				if err != nil {
					level.Error(c.logger).Log(
						"msg", "Failed to write 'Deletable' to meta file after compaction",
						"ulid", b.meta.ULID,
					)
				}
				b.numBytesMeta = n
			}
			uid = ulid.ULID{}
			level.Info(c.logger).Log(
				"msg", "compact blocks resulted in empty block",
				"count", len(blocks),
				"sources", fmt.Sprintf("%v", uids),
				"duration", time.Since(start),
			)
		} else {
			level.Info(c.logger).Log(
				"msg", "compact blocks",
				"count", len(blocks),
				"mint", meta.MinTime,
				"maxt", meta.MaxTime,
				"ulid", meta.ULID,
				"sources", fmt.Sprintf("%v", uids),
				"duration", time.Since(start),
			)
		}
		return uid, nil
	}

	var merr tsdb_errors.MultiError
	merr.Add(err)
	if err != context.Canceled {
		for _, b := range bs {
			if err := b.setCompactionFailed(); err != nil {
				merr.Add(errors.Wrapf(err, "setting compaction failed for block: %s", b.Dir()))
			}
		}
	}

	return uid, merr
}

func (c *LeveledCompactor) Write(dest string, b BlockReader, mint, maxt int64, parent *BlockMeta) (ulid.ULID, error) {
	start := time.Now()

	uid := ulid.MustNew(ulid.Now(), rand.Reader)

	meta := &BlockMeta{
		ULID:    uid,
		MinTime: mint,
		MaxTime: maxt,
	}
	meta.Compaction.Level = 1
	meta.Compaction.Sources = []ulid.ULID{uid}

	if parent != nil {
		meta.Compaction.Parents = []BlockDesc{
			{ULID: parent.ULID, MinTime: parent.MinTime, MaxTime: parent.MaxTime},
		}
	}

	err := c.write(dest, meta, b)
	if err != nil {
		return uid, err
	}

	if meta.Stats.NumSamples == 0 {
		return ulid.ULID{}, nil
	}

	level.Info(c.logger).Log(
		"msg", "write block",
		"mint", meta.MinTime,
		"maxt", meta.MaxTime,
		"ulid", meta.ULID,
		"duration", time.Since(start),
	)
	return uid, nil
}

// instrumentedChunkWriter is used for level 1 compactions to record statistics
// about compacted chunks.
type instrumentedChunkWriter struct {
	ChunkWriter

	size    prometheus.Histogram
	samples prometheus.Histogram
	trange  prometheus.Histogram
}

func (w *instrumentedChunkWriter) WriteChunks(chunks ...chunks.Meta) error {
	for _, c := range chunks {
		w.size.Observe(float64(len(c.Chunk.Bytes())))
		w.samples.Observe(float64(c.Chunk.NumSamples()))
		w.trange.Observe(float64(c.MaxTime - c.MinTime))
	}
	return w.ChunkWriter.WriteChunks(chunks...)
}
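// A minimal usage sketch for Write (compactor, db, head, mint and maxt are a
// hypothetical caller's variables, not part of this file): persisting an
// in-memory head block is a single Write with no parent meta.
//
//	uid, err := compactor.Write(db.Dir(), head, mint, maxt, nil)
//	if err == nil && uid == (ulid.ULID{}) {
//		// The range [mint, maxt] held no samples; no block was written.
//	}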
// write creates a new block in dest that is the union of the provided blocks.
// It cleans up all files of the old blocks after completing successfully.
func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blocks ...BlockReader) (err error) {
	dir := filepath.Join(dest, meta.ULID.String())
	tmp := dir + ".tmp"
	var closers []io.Closer
	defer func(t time.Time) {
		var merr tsdb_errors.MultiError
		merr.Add(err)
		merr.Add(closeAll(closers))
		err = merr.Err()

		// RemoveAll returns no error when tmp doesn't exist so it is safe to always run it.
		if err := os.RemoveAll(tmp); err != nil {
			level.Error(c.logger).Log("msg", "failed to remove tmp folder after failed compaction", "err", err.Error())
		}
		c.metrics.ran.Inc()
		c.metrics.duration.Observe(time.Since(t).Seconds())
	}(time.Now())

	if err = os.RemoveAll(tmp); err != nil {
		return err
	}

	if err = os.MkdirAll(tmp, 0777); err != nil {
		return err
	}

	// Populate chunk and index files into temporary directory with
	// data of all blocks.
	var chunkw ChunkWriter

	chunkw, err = chunks.NewWriter(chunkDir(tmp))
	if err != nil {
		return errors.Wrap(err, "open chunk writer")
	}
	closers = append(closers, chunkw)
	// Record written chunk sizes on level 1 compactions.
	if meta.Compaction.Level == 1 {
		chunkw = &instrumentedChunkWriter{
			ChunkWriter: chunkw,
			size:        c.metrics.chunkSize,
			samples:     c.metrics.chunkSamples,
			trange:      c.metrics.chunkRange,
		}
	}

	indexw, err := index.NewWriter(c.ctx, filepath.Join(tmp, indexFilename))
	if err != nil {
		return errors.Wrap(err, "open index writer")
	}
	closers = append(closers, indexw)

	if err := c.populateBlock(blocks, meta, indexw, chunkw); err != nil {
		return errors.Wrap(err, "populate block")
	}

	select {
	case <-c.ctx.Done():
		return c.ctx.Err()
	default:
	}

	// We explicitly close the writers here to check for errors even though
	// they are covered by the defer. This is because on Windows these files
	// cannot be deleted until they are closed; the defer only makes sure
	// they are closed if the function exits due to an error above.
	var merr tsdb_errors.MultiError
	for _, w := range closers {
		merr.Add(w.Close())
	}
	closers = closers[:0] // Avoid closing the writers twice in the defer.
	if merr.Err() != nil {
		return merr.Err()
	}

	// Populated block is empty, so exit early.
	if meta.Stats.NumSamples == 0 {
		return nil
	}

	if _, err = writeMetaFile(c.logger, tmp, meta); err != nil {
		return errors.Wrap(err, "write merged meta")
	}

	// Create an empty tombstones file.
	if _, err := tombstones.WriteFile(c.logger, tmp, tombstones.NewMemTombstones()); err != nil {
		return errors.Wrap(err, "write new tombstones file")
	}

	df, err := fileutil.OpenDir(tmp)
	if err != nil {
		return errors.Wrap(err, "open temporary block dir")
	}
	defer func() {
		if df != nil {
			df.Close()
		}
	}()

	if err := df.Sync(); err != nil {
		return errors.Wrap(err, "sync temporary dir file")
	}

	// Close the temp dir before renaming the block dir (required on Windows).
	if err = df.Close(); err != nil {
		return errors.Wrap(err, "close temporary dir")
	}
	df = nil

	// Block successfully written, make it visible and remove old ones.
	if err := fileutil.Replace(tmp, dir); err != nil {
		return errors.Wrap(err, "rename block dir")
	}

	return nil
}
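// Note that the tmp-dir-then-rename dance in write is what makes a compaction
// crash-safe: chunks, index, meta and tombstones are all staged under
// "<ulid>.tmp" and only published via fileutil.Replace, so readers observe
// either no block directory or a complete one, never a half-written one.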
// populateBlock fills the index and chunk writers with new data gathered as the union
// of the provided blocks. It returns meta information for the new block.
// It expects the input blocks to be sorted by mint.
func (c *LeveledCompactor) populateBlock(blocks []BlockReader, meta *BlockMeta, indexw IndexWriter, chunkw ChunkWriter) (err error) {
	if len(blocks) == 0 {
		return errors.New("cannot populate block from no readers")
	}

	var (
		sets        []storage.ChunkSeriesSet
		symbols     index.StringIter
		closers     []io.Closer
		overlapping bool
	)
	defer func() {
		var merr tsdb_errors.MultiError
		merr.Add(err)
		if cerr := closeAll(closers); cerr != nil {
			merr.Add(errors.Wrap(cerr, "close"))
		}
		err = merr.Err()
		c.metrics.populatingBlocks.Set(0)
	}()
	c.metrics.populatingBlocks.Set(1)

	globalMaxt := blocks[0].Meta().MaxTime
	for i, b := range blocks {
		select {
		case <-c.ctx.Done():
			return c.ctx.Err()
		default:
		}

		if !overlapping {
			if i > 0 && b.Meta().MinTime < globalMaxt {
				c.metrics.overlappingBlocks.Inc()
				overlapping = true
				level.Warn(c.logger).Log("msg", "Found overlapping blocks during compaction", "ulid", meta.ULID)
			}
			if b.Meta().MaxTime > globalMaxt {
				globalMaxt = b.Meta().MaxTime
			}
		}

		indexr, err := b.Index()
		if err != nil {
			return errors.Wrapf(err, "open index reader for block %+v", b.Meta())
		}
		closers = append(closers, indexr)

		chunkr, err := b.Chunks()
		if err != nil {
			return errors.Wrapf(err, "open chunk reader for block %+v", b.Meta())
		}
		closers = append(closers, chunkr)

		tombsr, err := b.Tombstones()
		if err != nil {
			return errors.Wrapf(err, "open tombstone reader for block %+v", b.Meta())
		}
		closers = append(closers, tombsr)

		k, v := index.AllPostingsKey()
		all, err := indexr.Postings(k, v)
		if err != nil {
			return err
		}
		all = indexr.SortedPostings(all)
		// Block metas are half-open: [min, max), so subtract 1 to ensure we don't hold
		// samples with exactly the meta.MaxTime timestamp.
		sets = append(sets, newBlockChunkSeriesSet(indexr, chunkr, tombsr, all, meta.MinTime, meta.MaxTime-1))
		syms := indexr.Symbols()
		if i == 0 {
			symbols = syms
			continue
		}
		symbols = newMergedStringIter(symbols, syms)
	}

	for symbols.Next() {
		if err := indexw.AddSymbol(symbols.At()); err != nil {
			return errors.Wrap(err, "add symbol")
		}
	}
	if symbols.Err() != nil {
		return errors.Wrap(symbols.Err(), "next symbol")
	}

	var (
		ref  = uint64(0)
		chks []chunks.Meta
	)

	set := sets[0]
	if len(sets) > 1 {
		// Merge series using the compacting chunk series merger.
		set = storage.NewMergeChunkSeriesSet(sets, storage.NewCompactingChunkSeriesMerger(storage.ChainedSeriesMerge))
	}

	// Iterate over all sorted chunk series.
	for set.Next() {
		select {
		case <-c.ctx.Done():
			return c.ctx.Err()
		default:
		}

		s := set.At()
		chksIter := s.Iterator()
		chks = chks[:0]
		for chksIter.Next() {
			// We are not iterating over the chunks in a streaming way because it's
			// more efficient to do bulk writes for the index and chunk files.
			chks = append(chks, chksIter.At())
		}
		if chksIter.Err() != nil {
			return errors.Wrap(chksIter.Err(), "chunk iter")
		}

		// Skip series with all deleted chunks.
		if len(chks) == 0 {
			continue
		}

		if err := chunkw.WriteChunks(chks...); err != nil {
			return errors.Wrap(err, "write chunks")
		}
		if err := indexw.AddSeries(ref, s.Labels(), chks...); err != nil {
			return errors.Wrap(err, "add series")
		}

		meta.Stats.NumChunks += uint64(len(chks))
		meta.Stats.NumSeries++
		for _, chk := range chks {
			meta.Stats.NumSamples += uint64(chk.Chunk.NumSamples())
		}

		for _, chk := range chks {
			if err := c.chunkPool.Put(chk.Chunk); err != nil {
				return errors.Wrap(err, "put chunk")
			}
		}
		ref++
	}
	if set.Err() != nil {
		return errors.Wrap(set.Err(), "iterate compaction set")
	}

	return nil
}
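// When input blocks overlap, the same series can appear in several of the
// per-block ChunkSeriesSets built above. NewMergeChunkSeriesSet with the
// compacting merger deduplicates such series and re-encodes their overlapping
// chunks, which is what makes vertical compaction of overlapping blocks work.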
// blockBaseSeriesSet allows iterating over all series in a single block.
// Iterated series are trimmed with the given min and max time as well as tombstones.
// See newBlockSeriesSet and newBlockChunkSeriesSet to use it for either sample or chunk iterating.
type blockBaseSeriesSet struct {
	p          index.Postings
	index      IndexReader
	chunks     ChunkReader
	tombstones tombstones.Reader
	mint, maxt int64

	currIterFn func() *populateWithDelGenericSeriesIterator
	currLabels labels.Labels

	bufChks []chunks.Meta
	err     error
}

func (b *blockBaseSeriesSet) Next() bool {
	var lbls labels.Labels

	for b.p.Next() {
		if err := b.index.Series(b.p.At(), &lbls, &b.bufChks); err != nil {
			// Postings may be stale. Skip if no underlying series exists.
			if errors.Cause(err) == storage.ErrNotFound {
				continue
			}
			b.err = errors.Wrapf(err, "get series %d", b.p.At())
			return false
		}

		if len(b.bufChks) == 0 {
			continue
		}

		intervals, err := b.tombstones.Get(b.p.At())
		if err != nil {
			b.err = errors.Wrap(err, "get tombstones")
			return false
		}

		// NOTE:
		// * block time range is half-open: [meta.MinTime, meta.MaxTime).
		// * chunks are both closed: [chk.MinTime, chk.MaxTime].
		// * requested time ranges are closed: [req.Start, req.End].

		var trimFront, trimBack bool

		// Copy chunks as the iterables are reusable.
		chks := make([]chunks.Meta, 0, len(b.bufChks))

		// Prefilter chunks and pick only those which are not entirely deleted
		// and not totally outside of the requested range.
		for _, chk := range b.bufChks {
			if chk.MaxTime < b.mint {
				continue
			}
			if chk.MinTime > b.maxt {
				continue
			}

			if !(tombstones.Interval{Mint: chk.MinTime, Maxt: chk.MaxTime}.IsSubrange(intervals)) {
				chks = append(chks, chk)
			}

			// If still not entirely deleted, check if trim is needed based on requested time range.
			if chk.MinTime < b.mint {
				trimFront = true
			}
			if chk.MaxTime > b.maxt {
				trimBack = true
			}
		}

		if len(chks) == 0 {
			continue
		}

		if trimFront {
			intervals = intervals.Add(tombstones.Interval{Mint: math.MinInt64, Maxt: b.mint - 1})
		}
		if trimBack {
			intervals = intervals.Add(tombstones.Interval{Mint: b.maxt + 1, Maxt: math.MaxInt64})
		}
		b.currLabels = lbls
		b.currIterFn = func() *populateWithDelGenericSeriesIterator {
			return newPopulateWithDelGenericSeriesIterator(b.chunks, chks, intervals)
		}
		return true
	}
	return false
}

func (b *blockBaseSeriesSet) Err() error {
	if b.err != nil {
		return b.err
	}
	return b.p.Err()
}

func (b *blockBaseSeriesSet) Warnings() storage.Warnings { return nil }
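// Trimming reuses the tombstone machinery. For example (illustrative): with a
// requested range of [1000, 2000] and a chunk covering [500, 1500], Next above
// adds a synthetic deletion interval [math.MinInt64, 999], so the downstream
// iterator drops the samples before 1000 without touching the chunk itself.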
// populateWithDelGenericSeriesIterator allows iterating over the given chunk metas. On each
// iteration it ensures that chunks are trimmed based on the given tombstone intervals, if any.
//
// populateWithDelGenericSeriesIterator assumes that chunks that would be fully removed by the
// intervals were filtered out in a previous phase.
//
// On each iteration currChkMeta is available. If currDelIter is not nil, the chunk iterator in
// currChkMeta is invalid and a chunk rewrite is needed, for which currDelIter should be used.
type populateWithDelGenericSeriesIterator struct {
	chunks ChunkReader
	// chks are expected to be sorted by minTime and should be related to the same, single series.
	chks []chunks.Meta

	i         int
	err       error
	bufIter   *deletedIterator
	intervals tombstones.Intervals

	currDelIter chunkenc.Iterator
	currChkMeta chunks.Meta
}

func newPopulateWithDelGenericSeriesIterator(
	chunks ChunkReader,
	chks []chunks.Meta,
	intervals tombstones.Intervals,
) *populateWithDelGenericSeriesIterator {
	return &populateWithDelGenericSeriesIterator{
		chunks:    chunks,
		chks:      chks,
		i:         -1,
		bufIter:   &deletedIterator{},
		intervals: intervals,
	}
}

func (p *populateWithDelGenericSeriesIterator) next() bool {
	if p.err != nil || p.i >= len(p.chks)-1 {
		return false
	}

	p.i++
	p.currChkMeta = p.chks[p.i]

	p.currChkMeta.Chunk, p.err = p.chunks.Chunk(p.currChkMeta.Ref)
	if p.err != nil {
		p.err = errors.Wrapf(p.err, "cannot populate chunk %d", p.currChkMeta.Ref)
		return false
	}

	p.bufIter.intervals = p.bufIter.intervals[:0]
	for _, interval := range p.intervals {
		if p.currChkMeta.OverlapsClosedInterval(interval.Mint, interval.Maxt) {
			p.bufIter.intervals = p.bufIter.intervals.Add(interval)
		}
	}

	// Re-encode head chunks that are still open (being appended to) or
	// outside the compacted MaxTime range.
	// The chunk.Bytes() method is not safe for open chunks, hence the re-encoding.
	// This happens when snapshotting the head block or just fetching chunks from TSDB.
	//
	// TODO: think about how to avoid the typecasting to verify when it is a head block.
	_, isSafeChunk := p.currChkMeta.Chunk.(*safeChunk)
	if len(p.bufIter.intervals) == 0 && !(isSafeChunk && p.currChkMeta.MaxTime == math.MaxInt64) {
		// If there is no overlap with deletion intervals AND it's NOT an "open" head chunk, we can take the chunk as it is.
		p.currDelIter = nil
		return true
	}

	// We don't want the whole chunk, or it's potentially still open; take just a part of it.
	p.bufIter.it = p.currChkMeta.Chunk.Iterator(nil)
	p.currDelIter = p.bufIter
	return true
}

func (p *populateWithDelGenericSeriesIterator) Err() error { return p.err }

func (p *populateWithDelGenericSeriesIterator) toSeriesIterator() chunkenc.Iterator {
	return &populateWithDelSeriesIterator{populateWithDelGenericSeriesIterator: p}
}
func (p *populateWithDelGenericSeriesIterator) toChunkSeriesIterator() chunks.Iterator {
	return &populateWithDelChunkSeriesIterator{populateWithDelGenericSeriesIterator: p}
}

// populateWithDelSeriesIterator allows iterating over samples for a single series.
type populateWithDelSeriesIterator struct {
	*populateWithDelGenericSeriesIterator

	curr chunkenc.Iterator
}

func (p *populateWithDelSeriesIterator) Next() bool {
	if p.curr != nil && p.curr.Next() {
		return true
	}

	for p.next() {
		if p.currDelIter != nil {
			p.curr = p.currDelIter
		} else {
			p.curr = p.currChkMeta.Chunk.Iterator(nil)
		}
		if p.curr.Next() {
			return true
		}
	}
	return false
}

func (p *populateWithDelSeriesIterator) Seek(t int64) bool {
	if p.curr != nil && p.curr.Seek(t) {
		return true
	}
	for p.Next() {
		if p.curr.Seek(t) {
			return true
		}
	}
	return false
}

func (p *populateWithDelSeriesIterator) At() (int64, float64) { return p.curr.At() }

func (p *populateWithDelSeriesIterator) Err() error {
	if err := p.populateWithDelGenericSeriesIterator.Err(); err != nil {
		return err
	}
	if p.curr != nil {
		return p.curr.Err()
	}
	return nil
}
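// The generic iterator above is deliberately split from its two adapters:
// populateWithDelSeriesIterator yields individual samples, while
// populateWithDelChunkSeriesIterator below yields whole chunks and only
// re-encodes a chunk when currDelIter signals that samples must be dropped.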
type populateWithDelChunkSeriesIterator struct {
	*populateWithDelGenericSeriesIterator

	curr chunks.Meta
}

func (p *populateWithDelChunkSeriesIterator) Next() bool {
	if !p.next() {
		return false
	}

	p.curr = p.currChkMeta
	if p.currDelIter == nil {
		return true
	}

	// Re-encode the chunk if an iterator is provided. This means that it has
	// some samples to be deleted or the chunk is still open.
	newChunk := chunkenc.NewXORChunk()
	app, err := newChunk.Appender()
	if err != nil {
		p.err = err
		return false
	}

	if !p.currDelIter.Next() {
		if err := p.currDelIter.Err(); err != nil {
			p.err = errors.Wrap(err, "iterate chunk while re-encoding")
			return false
		}

		// Empty chunk; this should not happen, as we assume full deletions were filtered before this iterator.
		p.err = errors.New("populateWithDelChunkSeriesIterator: unexpected empty chunk found while rewriting chunk")
		return false
	}

	t, v := p.currDelIter.At()
	p.curr.MinTime = t
	app.Append(t, v)

	for p.currDelIter.Next() {
		t, v = p.currDelIter.At()
		app.Append(t, v)
	}
	if err := p.currDelIter.Err(); err != nil {
		p.err = errors.Wrap(err, "iterate chunk while re-encoding")
		return false
	}

	p.curr.Chunk = newChunk
	p.curr.MaxTime = t
	return true
}

func (p *populateWithDelChunkSeriesIterator) At() chunks.Meta { return p.curr }

// blockSeriesSet allows iterating over sorted, populated series with applied tombstones.
// Series with all deleted chunks are still present as series with no samples.
// Samples from chunks are also trimmed to the requested min and max time.
type blockSeriesSet struct {
	blockBaseSeriesSet
}

func newBlockSeriesSet(i IndexReader, c ChunkReader, t tombstones.Reader, p index.Postings, mint, maxt int64) storage.SeriesSet {
	return &blockSeriesSet{
		blockBaseSeriesSet{
			index:      i,
			chunks:     c,
			tombstones: t,
			p:          p,
			mint:       mint,
			maxt:       maxt,
		},
	}
}

func (b *blockSeriesSet) At() storage.Series {
	// At can be looped over before iterating, so save the current value locally.
	currIterFn := b.currIterFn
	return &storage.SeriesEntry{
		Lset: b.currLabels,
		SampleIteratorFn: func() chunkenc.Iterator {
			return currIterFn().toSeriesIterator()
		},
	}
}

// blockChunkSeriesSet allows iterating over sorted, populated series with applied tombstones.
// Series with all deleted chunks are still present as labelled iterators with no chunks.
// Chunks are also trimmed to the requested [min, max] range (keeping samples with the min and max timestamps).
type blockChunkSeriesSet struct {
	blockBaseSeriesSet
}

func newBlockChunkSeriesSet(i IndexReader, c ChunkReader, t tombstones.Reader, p index.Postings, mint, maxt int64) storage.ChunkSeriesSet {
	return &blockChunkSeriesSet{
		blockBaseSeriesSet{
			index:      i,
			chunks:     c,
			tombstones: t,
			p:          p,
			mint:       mint,
			maxt:       maxt,
		},
	}
}

func (b *blockChunkSeriesSet) At() storage.ChunkSeries {
	// At can be looped over before iterating, so save the current value locally.
	currIterFn := b.currIterFn
	return &storage.ChunkSeriesEntry{
		Lset: b.currLabels,
		ChunkIteratorFn: func() chunks.Iterator {
			return currIterFn().toChunkSeriesIterator()
		},
	}
}

func newMergedStringIter(a index.StringIter, b index.StringIter) index.StringIter {
	return &mergedStringIter{a: a, b: b, aok: a.Next(), bok: b.Next()}
}

type mergedStringIter struct {
	a        index.StringIter
	b        index.StringIter
	aok, bok bool
	cur      string
}

func (m *mergedStringIter) Next() bool {
	if (!m.aok && !m.bok) || (m.Err() != nil) {
		return false
	}

	if !m.aok {
		m.cur = m.b.At()
		m.bok = m.b.Next()
	} else if !m.bok {
		m.cur = m.a.At()
		m.aok = m.a.Next()
	} else if m.b.At() > m.a.At() {
		m.cur = m.a.At()
		m.aok = m.a.Next()
	} else if m.a.At() > m.b.At() {
		m.cur = m.b.At()
		m.bok = m.b.Next()
	} else { // Equal.
		m.cur = m.b.At()
		m.aok = m.a.Next()
		m.bok = m.b.Next()
	}

	return true
}
func (m mergedStringIter) At() string { return m.cur }
func (m mergedStringIter) Err() error {
	if m.a.Err() != nil {
		return m.a.Err()
	}
	return m.b.Err()
}
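// For example (illustrative): merging sorted symbol iterators over
// ["a", "c"] and ["b", "c"] yields "a", "b", "c", each exactly once; the
// equal-case branch advancing both iterators is what keeps the merged
// symbol table of the new index sorted and deduplicated.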