prometheus/tsdb/compact.go
Ganesh Vernekar df59320886
Add out-of-order sample support to the TSDB (#269)
This implementation is based on this design doc:
https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing

This commit adds support to accept out-of-order ("OOO") sample into the TSDB
up to a configurable time allowance. If OOO is enabled, overlapping querying
are automatically enabled.

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

Co-authored-by: Jesus Vazquez <jesus.vazquez@grafana.com>
Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com>
Co-authored-by: Dieter Plaetinck <dieter@grafana.com>
2022-06-22 11:45:21 +00:00

1322 lines
40 KiB
Go

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"context"
"crypto/rand"
"fmt"
"io"
"os"
"path/filepath"
"sort"
"sync"
"time"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/oklog/ulid"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/atomic"
"golang.org/x/sync/semaphore"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/tsdb/chunkenc"
"github.com/prometheus/prometheus/tsdb/chunks"
tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
"github.com/prometheus/prometheus/tsdb/fileutil"
"github.com/prometheus/prometheus/tsdb/index"
"github.com/prometheus/prometheus/tsdb/tombstones"
)
// ExponentialBlockRanges returns the time ranges based on the stepSize.
func ExponentialBlockRanges(minSize int64, steps, stepSize int) []int64 {
ranges := make([]int64, 0, steps)
curRange := minSize
for i := 0; i < steps; i++ {
ranges = append(ranges, curRange)
curRange = curRange * int64(stepSize)
}
return ranges
}
// Compactor provides compaction against an underlying storage
// of time series data.
type Compactor interface {
// Plan returns a set of directories that can be compacted concurrently.
// The directories can be overlapping.
// Results returned when compactions are in progress are undefined.
Plan(dir string) ([]string, error)
// Write persists a Block into a directory.
// No Block is written when resulting Block has 0 samples, and returns empty ulid.ULID{}.
Write(dest string, b BlockReader, mint, maxt int64, parent *BlockMeta) (ulid.ULID, error)
// Compact runs compaction against the provided directories. Must
// only be called concurrently with results of Plan().
// Can optionally pass a list of already open blocks,
// to avoid having to reopen them.
// When resulting Block has 0 samples
// * No block is written.
// * The source dirs are marked Deletable.
// * Returns empty ulid.ULID{}.
Compact(dest string, dirs []string, open []*Block) (ulid.ULID, error)
// CompactOOO creates a new block per possible block range in the compactor's directory from the OOO Head given.
// Each ULID in the result corresponds to a block in a unique time range.
CompactOOO(dest string, oooHead *OOOCompactionHead) (result []ulid.ULID, err error)
}
// LeveledCompactor implements the Compactor interface.
type LeveledCompactor struct {
metrics *compactorMetrics
logger log.Logger
ranges []int64
chunkPool chunkenc.Pool
ctx context.Context
maxBlockChunkSegmentSize int64
mergeFunc storage.VerticalChunkSeriesMergeFunc
enableOverlappingCompaction bool
concurrencyOpts LeveledCompactorConcurrencyOptions
}
type compactorMetrics struct {
ran prometheus.Counter
populatingBlocks prometheus.Gauge
overlappingBlocks prometheus.Counter
duration prometheus.Histogram
chunkSize prometheus.Histogram
chunkSamples prometheus.Histogram
chunkRange prometheus.Histogram
}
func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics {
m := &compactorMetrics{}
m.ran = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_compactions_total",
Help: "Total number of compactions that were executed for the partition.",
})
m.populatingBlocks = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "prometheus_tsdb_compaction_populating_block",
Help: "Set to 1 when a block is currently being written to the disk.",
})
m.overlappingBlocks = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_vertical_compactions_total",
Help: "Total number of compactions done on overlapping blocks.",
})
m.duration = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "prometheus_tsdb_compaction_duration_seconds",
Help: "Duration of compaction runs",
Buckets: prometheus.ExponentialBuckets(1, 2, 14),
})
m.chunkSize = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "prometheus_tsdb_compaction_chunk_size_bytes",
Help: "Final size of chunks on their first compaction",
Buckets: prometheus.ExponentialBuckets(32, 1.5, 12),
})
m.chunkSamples = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "prometheus_tsdb_compaction_chunk_samples",
Help: "Final number of samples on their first compaction",
Buckets: prometheus.ExponentialBuckets(4, 1.5, 12),
})
m.chunkRange = prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "prometheus_tsdb_compaction_chunk_range_seconds",
Help: "Final time range of chunks on their first compaction",
Buckets: prometheus.ExponentialBuckets(100, 4, 10),
})
if r != nil {
r.MustRegister(
m.ran,
m.populatingBlocks,
m.overlappingBlocks,
m.duration,
m.chunkRange,
m.chunkSamples,
m.chunkSize,
)
}
return m
}
// NewLeveledCompactor returns a LeveledCompactor.
func NewLeveledCompactor(ctx context.Context, r prometheus.Registerer, l log.Logger, ranges []int64, pool chunkenc.Pool, mergeFunc storage.VerticalChunkSeriesMergeFunc, enableOverlappingCompaction bool) (*LeveledCompactor, error) {
return NewLeveledCompactorWithChunkSize(ctx, r, l, ranges, pool, chunks.DefaultChunkSegmentSize, mergeFunc, enableOverlappingCompaction)
}
func NewLeveledCompactorWithChunkSize(ctx context.Context, r prometheus.Registerer, l log.Logger, ranges []int64, pool chunkenc.Pool, maxBlockChunkSegmentSize int64, mergeFunc storage.VerticalChunkSeriesMergeFunc, enableOverlappingCompaction bool) (*LeveledCompactor, error) {
if len(ranges) == 0 {
return nil, errors.Errorf("at least one range must be provided")
}
if pool == nil {
pool = chunkenc.NewPool()
}
if l == nil {
l = log.NewNopLogger()
}
if mergeFunc == nil {
mergeFunc = storage.NewCompactingChunkSeriesMerger(storage.ChainedSeriesMerge)
}
return &LeveledCompactor{
ranges: ranges,
chunkPool: pool,
logger: l,
metrics: newCompactorMetrics(r),
ctx: ctx,
maxBlockChunkSegmentSize: maxBlockChunkSegmentSize,
mergeFunc: mergeFunc,
concurrencyOpts: DefaultLeveledCompactorConcurrencyOptions(),
enableOverlappingCompaction: enableOverlappingCompaction,
}, nil
}
// LeveledCompactorConcurrencyOptions is a collection of concurrency options used by LeveledCompactor.
type LeveledCompactorConcurrencyOptions struct {
MaxOpeningBlocks int // Number of goroutines opening blocks before compaction.
MaxClosingBlocks int // Max number of blocks that can be closed concurrently during split compaction. Note that closing of newly compacted block uses a lot of memory for writing index.
SymbolsFlushersCount int // Number of symbols flushers used when doing split compaction.
}
func DefaultLeveledCompactorConcurrencyOptions() LeveledCompactorConcurrencyOptions {
return LeveledCompactorConcurrencyOptions{
MaxClosingBlocks: 1,
SymbolsFlushersCount: 1,
MaxOpeningBlocks: 1,
}
}
func (c *LeveledCompactor) SetConcurrencyOptions(opts LeveledCompactorConcurrencyOptions) {
c.concurrencyOpts = opts
}
type dirMeta struct {
dir string
meta *BlockMeta
}
// Plan returns a list of compactable blocks in the provided directory.
func (c *LeveledCompactor) Plan(dir string) ([]string, error) {
dirs, err := blockDirs(dir)
if err != nil {
return nil, err
}
if len(dirs) < 1 {
return nil, nil
}
var dms []dirMeta
for _, dir := range dirs {
meta, _, err := readMetaFile(dir)
if err != nil {
return nil, err
}
dms = append(dms, dirMeta{dir, meta})
}
return c.plan(dms)
}
func (c *LeveledCompactor) plan(dms []dirMeta) ([]string, error) {
sort.Slice(dms, func(i, j int) bool {
return dms[i].meta.MinTime < dms[j].meta.MinTime
})
res := c.selectOverlappingDirs(dms)
if len(res) > 0 {
return res, nil
}
// No overlapping blocks or overlapping block compaction not allowed, do compaction the usual way.
// We do not include a recently created block with max(minTime), so the block which was just created from WAL.
// This gives users a window of a full block size to piece-wise backup new data without having to care about data overlap.
dms = dms[:len(dms)-1]
for _, dm := range c.selectDirs(dms) {
res = append(res, dm.dir)
}
if len(res) > 0 {
return res, nil
}
// Compact any blocks with big enough time range that have >5% tombstones.
for i := len(dms) - 1; i >= 0; i-- {
meta := dms[i].meta
if meta.MaxTime-meta.MinTime < c.ranges[len(c.ranges)/2] {
// If the block is entirely deleted, then we don't care about the block being big enough.
// TODO: This is assuming single tombstone is for distinct series, which might be no true.
if meta.Stats.NumTombstones > 0 && meta.Stats.NumTombstones >= meta.Stats.NumSeries {
return []string{dms[i].dir}, nil
}
break
}
if float64(meta.Stats.NumTombstones)/float64(meta.Stats.NumSeries+1) > 0.05 {
return []string{dms[i].dir}, nil
}
}
return nil, nil
}
// selectDirs returns the dir metas that should be compacted into a single new block.
// If only a single block range is configured, the result is always nil.
func (c *LeveledCompactor) selectDirs(ds []dirMeta) []dirMeta {
if len(c.ranges) < 2 || len(ds) < 1 {
return nil
}
highTime := ds[len(ds)-1].meta.MinTime
for _, iv := range c.ranges[1:] {
parts := splitByRange(ds, iv)
if len(parts) == 0 {
continue
}
Outer:
for _, p := range parts {
// Do not select the range if it has a block whose compaction failed.
for _, dm := range p {
if dm.meta.Compaction.Failed {
continue Outer
}
}
mint := p[0].meta.MinTime
maxt := p[len(p)-1].meta.MaxTime
// Pick the range of blocks if it spans the full range (potentially with gaps)
// or is before the most recent block.
// This ensures we don't compact blocks prematurely when another one of the same
// size still fits in the range.
if (maxt-mint == iv || maxt <= highTime) && len(p) > 1 {
return p
}
}
}
return nil
}
// selectOverlappingDirs returns all dirs with overlapping time ranges.
// It expects sorted input by mint and returns the overlapping dirs in the same order as received.
func (c *LeveledCompactor) selectOverlappingDirs(ds []dirMeta) []string {
if !c.enableOverlappingCompaction {
return nil
}
if len(ds) < 2 {
return nil
}
var overlappingDirs []string
globalMaxt := ds[0].meta.MaxTime
for i, d := range ds[1:] {
if d.meta.MinTime < globalMaxt {
if len(overlappingDirs) == 0 { // When it is the first overlap, need to add the last one as well.
overlappingDirs = append(overlappingDirs, ds[i].dir)
}
overlappingDirs = append(overlappingDirs, d.dir)
} else if len(overlappingDirs) > 0 {
break
}
if d.meta.MaxTime > globalMaxt {
globalMaxt = d.meta.MaxTime
}
}
return overlappingDirs
}
// splitByRange splits the directories by the time range. The range sequence starts at 0.
//
// For example, if we have blocks [0-10, 10-20, 50-60, 90-100] and the split range tr is 30
// it returns [0-10, 10-20], [50-60], [90-100].
func splitByRange(ds []dirMeta, tr int64) [][]dirMeta {
var splitDirs [][]dirMeta
for i := 0; i < len(ds); {
var (
group []dirMeta
t0 int64
m = ds[i].meta
)
// Compute start of aligned time range of size tr closest to the current block's start.
if m.MinTime >= 0 {
t0 = tr * (m.MinTime / tr)
} else {
t0 = tr * ((m.MinTime - tr + 1) / tr)
}
// Skip blocks that don't fall into the range. This can happen via mis-alignment or
// by being the multiple of the intended range.
if m.MaxTime > t0+tr {
i++
continue
}
// Add all dirs to the current group that are within [t0, t0+tr].
for ; i < len(ds); i++ {
// Either the block falls into the next range or doesn't fit at all (checked above).
if ds[i].meta.MaxTime > t0+tr {
break
}
group = append(group, ds[i])
}
if len(group) > 0 {
splitDirs = append(splitDirs, group)
}
}
return splitDirs
}
// CompactBlockMetas merges many block metas into one, combining it's source blocks together
// and adjusting compaction level. Min/Max time of result block meta covers all input blocks.
func CompactBlockMetas(uid ulid.ULID, blocks ...*BlockMeta) *BlockMeta {
res := &BlockMeta{
ULID: uid,
}
sources := map[ulid.ULID]struct{}{}
mint := blocks[0].MinTime
maxt := blocks[0].MaxTime
for _, b := range blocks {
if b.MinTime < mint {
mint = b.MinTime
}
if b.MaxTime > maxt {
maxt = b.MaxTime
}
if b.Compaction.Level > res.Compaction.Level {
res.Compaction.Level = b.Compaction.Level
}
for _, s := range b.Compaction.Sources {
sources[s] = struct{}{}
}
res.Compaction.Parents = append(res.Compaction.Parents, BlockDesc{
ULID: b.ULID,
MinTime: b.MinTime,
MaxTime: b.MaxTime,
})
}
res.Compaction.Level++
for s := range sources {
res.Compaction.Sources = append(res.Compaction.Sources, s)
}
sort.Slice(res.Compaction.Sources, func(i, j int) bool {
return res.Compaction.Sources[i].Compare(res.Compaction.Sources[j]) < 0
})
res.MinTime = mint
res.MaxTime = maxt
return res
}
// CompactWithSplitting merges and splits the input blocks into shardCount number of output blocks,
// and returns slice of block IDs. Position of returned block ID in the result slice corresponds to the shard index.
// If given output block has no series, corresponding block ID will be zero ULID value.
func (c *LeveledCompactor) CompactWithSplitting(dest string, dirs []string, open []*Block, shardCount uint64) (result []ulid.ULID, _ error) {
return c.compact(dest, dirs, open, shardCount)
}
// Compact creates a new block in the compactor's directory from the blocks in the
// provided directories.
func (c *LeveledCompactor) Compact(dest string, dirs []string, open []*Block) (uid ulid.ULID, err error) {
ulids, err := c.compact(dest, dirs, open, 1)
if err != nil {
return ulid.ULID{}, err
}
return ulids[0], nil
}
// shardedBlock describes single *output* block during compaction. This struct is passed between
// compaction methods to wrap output block details, index and chunk writer together.
// Shard index is determined by the position of this structure in the slice of output blocks.
type shardedBlock struct {
meta *BlockMeta
blockDir string
tmpDir string // Temp directory used when block is being built (= blockDir + temp suffix)
chunkw ChunkWriter
indexw IndexWriter
}
func (c *LeveledCompactor) compact(dest string, dirs []string, open []*Block, shardCount uint64) (_ []ulid.ULID, err error) {
if shardCount == 0 {
shardCount = 1
}
start := time.Now()
bs, blocksToClose, err := openBlocksForCompaction(dirs, open, c.logger, c.chunkPool, c.concurrencyOpts.MaxOpeningBlocks)
for _, b := range blocksToClose {
defer b.Close()
}
if err != nil {
return nil, err
}
var (
blocks []BlockReader
metas []*BlockMeta
uids []string
)
for _, b := range bs {
blocks = append(blocks, b)
m := b.Meta()
metas = append(metas, &m)
uids = append(uids, b.meta.ULID.String())
}
outBlocks := make([]shardedBlock, shardCount)
outBlocksTime := ulid.Now() // Make all out blocks share the same timestamp in the ULID.
for ix := range outBlocks {
outBlocks[ix] = shardedBlock{meta: CompactBlockMetas(ulid.MustNew(outBlocksTime, rand.Reader), metas...)}
}
err = c.write(dest, outBlocks, blocks...)
if err == nil {
ulids := make([]ulid.ULID, len(outBlocks))
allOutputBlocksAreEmpty := true
for ix := range outBlocks {
meta := outBlocks[ix].meta
if meta.Stats.NumSamples == 0 {
level.Info(c.logger).Log(
"msg", "compact blocks resulted in empty block",
"count", len(blocks),
"sources", fmt.Sprintf("%v", uids),
"duration", time.Since(start),
"shard", fmt.Sprintf("%d_of_%d", ix+1, shardCount),
)
} else {
allOutputBlocksAreEmpty = false
ulids[ix] = outBlocks[ix].meta.ULID
level.Info(c.logger).Log(
"msg", "compact blocks",
"count", len(blocks),
"mint", meta.MinTime,
"maxt", meta.MaxTime,
"ulid", meta.ULID,
"sources", fmt.Sprintf("%v", uids),
"duration", time.Since(start),
"shard", fmt.Sprintf("%d_of_%d", ix+1, shardCount),
)
}
}
if allOutputBlocksAreEmpty {
// Mark source blocks as deletable.
for _, b := range bs {
b.meta.Compaction.Deletable = true
n, err := writeMetaFile(c.logger, b.dir, &b.meta)
if err != nil {
level.Error(c.logger).Log(
"msg", "Failed to write 'Deletable' to meta file after compaction",
"ulid", b.meta.ULID,
)
}
b.numBytesMeta = n
}
}
return ulids, nil
}
errs := tsdb_errors.NewMulti(err)
if err != context.Canceled {
for _, b := range bs {
if err := b.setCompactionFailed(); err != nil {
errs.Add(errors.Wrapf(err, "setting compaction failed for block: %s", b.Dir()))
}
}
}
return nil, errs.Err()
}
// CompactOOOWithSplitting splits the input OOO Head into shardCount number of output blocks
// per possible block range, and returns slice of block IDs. In result[i][j],
// 'i' corresponds to a single time range of blocks while 'j' corresponds to the shard index.
// If given output block has no series, corresponding block ID will be zero ULID value.
// TODO: write tests for this.
func (c *LeveledCompactor) CompactOOOWithSplitting(dest string, oooHead *OOOCompactionHead, shardCount uint64) (result [][]ulid.ULID, _ error) {
return c.compactOOO(dest, oooHead, shardCount)
}
// CompactOOO creates a new block per possible block range in the compactor's directory from the OOO Head given.
// Each ULID in the result corresponds to a block in a unique time range.
func (c *LeveledCompactor) CompactOOO(dest string, oooHead *OOOCompactionHead) (result []ulid.ULID, err error) {
ulids, err := c.compactOOO(dest, oooHead, 1)
if err != nil {
return nil, err
}
for _, s := range ulids {
if s[0].Compare(ulid.ULID{}) != 0 {
result = append(result, s[0])
}
}
return result, err
}
func (c *LeveledCompactor) compactOOO(dest string, oooHead *OOOCompactionHead, shardCount uint64) (_ [][]ulid.ULID, err error) {
if shardCount == 0 {
shardCount = 1
}
start := time.Now()
if err != nil {
return nil, err
}
// The first dimension of outBlocks determines the time based splitting (i.e. outBlocks[i] has blocks all for the same time range).
// The second dimension of outBlocks determines the label based shard (i.e. outBlocks[i][j] is the (j+1)th shard.
// During ingestion of samples we can identify which ooo blocks will exists so that
// we dont have to prefill symbols and etc for the blocks that will be empty.
// With this, len(outBlocks[x]) will still be the same for all x so that we can pick blocks easily.
// Just that, only some of the outBlocks[x][y] will be valid and populated based on preexisting knowledge of
// which blocks to expect.
// In case we see a sample that is not present in the estimated block ranges, we will create them on flight.
outBlocks := make([][]shardedBlock, 0)
outBlocksTime := ulid.Now() // Make all out blocks share the same timestamp in the ULID.
blockSize := oooHead.ChunkRange()
oooHeadMint, oooHeadMaxt := oooHead.MinTime(), oooHead.MaxTime()
ulids := make([][]ulid.ULID, 0)
for t := blockSize * (oooHeadMint / blockSize); t <= oooHeadMaxt; t = t + blockSize {
mint, maxt := t, t+blockSize
outBlocks = append(outBlocks, make([]shardedBlock, shardCount))
ulids = append(ulids, make([]ulid.ULID, shardCount))
ix := len(outBlocks) - 1
for jx := range outBlocks[ix] {
uid := ulid.MustNew(outBlocksTime, rand.Reader)
meta := &BlockMeta{
ULID: uid,
MinTime: mint,
MaxTime: maxt,
}
meta.Compaction.Level = 1
meta.Compaction.Sources = []ulid.ULID{uid}
outBlocks[ix][jx] = shardedBlock{
meta: meta,
}
ulids[ix][jx] = meta.ULID
}
// Block intervals are half-open: [b.MinTime, b.MaxTime). Block intervals are always +1 than the total samples it includes.
err := c.write(dest, outBlocks[ix], oooHead.CloneForTimeRange(mint, maxt-1))
if err != nil {
// We need to delete all blocks in case there was an error.
for _, obs := range outBlocks {
for _, ob := range obs {
if ob.tmpDir != "" {
if removeErr := os.RemoveAll(ob.tmpDir); removeErr != nil {
level.Error(c.logger).Log("msg", "Failed to remove temp folder after failed compaction", "dir", ob.tmpDir, "err", removeErr.Error())
}
}
if ob.blockDir != "" {
if removeErr := os.RemoveAll(ob.blockDir); removeErr != nil {
level.Error(c.logger).Log("msg", "Failed to remove block folder after failed compaction", "dir", ob.blockDir, "err", removeErr.Error())
}
}
}
}
return nil, err
}
}
noOOOBlock := true
for ix, obs := range outBlocks {
for jx := range obs {
meta := outBlocks[ix][jx].meta
if meta.Stats.NumSamples != 0 {
noOOOBlock = false
level.Info(c.logger).Log(
"msg", "compact ooo head",
"mint", meta.MinTime,
"maxt", meta.MaxTime,
"ulid", meta.ULID,
"duration", time.Since(start),
"shard", fmt.Sprintf("%d_of_%d", jx+1, shardCount),
)
} else {
// This block did not get any data. So clear out the ulid to signal this.
ulids[ix][jx] = ulid.ULID{}
}
}
}
if noOOOBlock {
level.Info(c.logger).Log(
"msg", "compact ooo head resulted in no blocks",
"duration", time.Since(start),
)
return nil, nil
}
return ulids, nil
}
func (c *LeveledCompactor) Write(dest string, b BlockReader, mint, maxt int64, parent *BlockMeta) (ulid.ULID, error) {
start := time.Now()
uid := ulid.MustNew(ulid.Now(), rand.Reader)
meta := &BlockMeta{
ULID: uid,
MinTime: mint,
MaxTime: maxt,
}
meta.Compaction.Level = 1
meta.Compaction.Sources = []ulid.ULID{uid}
if parent != nil {
meta.Compaction.Parents = []BlockDesc{
{ULID: parent.ULID, MinTime: parent.MinTime, MaxTime: parent.MaxTime},
}
}
err := c.write(dest, []shardedBlock{{meta: meta}}, b)
if err != nil {
return uid, err
}
if meta.Stats.NumSamples == 0 {
level.Info(c.logger).Log(
"msg", "write block resulted in empty block",
"mint", meta.MinTime,
"maxt", meta.MaxTime,
"duration", time.Since(start),
)
return ulid.ULID{}, nil
}
level.Info(c.logger).Log(
"msg", "write block",
"mint", meta.MinTime,
"maxt", meta.MaxTime,
"ulid", meta.ULID,
"duration", time.Since(start),
)
return uid, nil
}
// instrumentedChunkWriter is used for level 1 compactions to record statistics
// about compacted chunks.
type instrumentedChunkWriter struct {
ChunkWriter
size prometheus.Histogram
samples prometheus.Histogram
trange prometheus.Histogram
}
func (w *instrumentedChunkWriter) WriteChunks(chunks ...chunks.Meta) error {
for _, c := range chunks {
w.size.Observe(float64(len(c.Chunk.Bytes())))
w.samples.Observe(float64(c.Chunk.NumSamples()))
w.trange.Observe(float64(c.MaxTime - c.MinTime))
}
return w.ChunkWriter.WriteChunks(chunks...)
}
// write creates new output blocks that are the union of the provided blocks into dir.
func (c *LeveledCompactor) write(dest string, outBlocks []shardedBlock, blocks ...BlockReader) (err error) {
var closers []io.Closer
defer func(t time.Time) {
err = tsdb_errors.NewMulti(err, tsdb_errors.CloseAll(closers)).Err()
for _, ob := range outBlocks {
if ob.tmpDir != "" {
// RemoveAll returns no error when tmp doesn't exist so it is safe to always run it.
if removeErr := os.RemoveAll(ob.tmpDir); removeErr != nil {
level.Error(c.logger).Log("msg", "Failed to remove temp folder after failed compaction", "dir", ob.tmpDir, "err", removeErr.Error())
}
}
// If there was any error, and we have multiple output blocks, some blocks may have been generated, or at
// least have existing blockDir. In such case, we want to remove them.
// BlockDir may also not be set yet, if preparation for some previous blocks have failed.
if err != nil && ob.blockDir != "" {
// RemoveAll returns no error when tmp doesn't exist so it is safe to always run it.
if removeErr := os.RemoveAll(ob.blockDir); removeErr != nil {
level.Error(c.logger).Log("msg", "Failed to remove block folder after failed compaction", "dir", ob.blockDir, "err", removeErr.Error())
}
}
}
c.metrics.ran.Inc()
c.metrics.duration.Observe(time.Since(t).Seconds())
}(time.Now())
for ix := range outBlocks {
dir := filepath.Join(dest, outBlocks[ix].meta.ULID.String())
tmp := dir + tmpForCreationBlockDirSuffix
outBlocks[ix].blockDir = dir
outBlocks[ix].tmpDir = tmp
if err = os.RemoveAll(tmp); err != nil {
return err
}
if err = os.MkdirAll(tmp, 0o777); err != nil {
return err
}
// Populate chunk and index files into temporary directory with
// data of all blocks.
var chunkw ChunkWriter
chunkw, err = chunks.NewWriterWithSegSize(chunkDir(tmp), c.maxBlockChunkSegmentSize)
if err != nil {
return errors.Wrap(err, "open chunk writer")
}
chunkw = newPreventDoubleCloseChunkWriter(chunkw) // We now close chunkWriter in populateBlock, but keep it in the closers here as well.
closers = append(closers, chunkw)
// Record written chunk sizes on level 1 compactions.
if outBlocks[ix].meta.Compaction.Level == 1 {
chunkw = &instrumentedChunkWriter{
ChunkWriter: chunkw,
size: c.metrics.chunkSize,
samples: c.metrics.chunkSamples,
trange: c.metrics.chunkRange,
}
}
outBlocks[ix].chunkw = chunkw
var indexw IndexWriter
indexw, err = index.NewWriter(c.ctx, filepath.Join(tmp, indexFilename))
if err != nil {
return errors.Wrap(err, "open index writer")
}
indexw = newPreventDoubleCloseIndexWriter(indexw) // We now close indexWriter in populateBlock, but keep it in the closers here as well.
closers = append(closers, indexw)
outBlocks[ix].indexw = indexw
}
// We use MinTime and MaxTime from first output block, because ALL output blocks have the same min/max times set.
if err := c.populateBlock(blocks, outBlocks[0].meta.MinTime, outBlocks[0].meta.MaxTime, outBlocks); err != nil {
return errors.Wrap(err, "populate block")
}
select {
case <-c.ctx.Done():
return c.ctx.Err()
default:
}
// We are explicitly closing them here to check for error even
// though these are covered under defer. This is because in Windows,
// you cannot delete these unless they are closed and the defer is to
// make sure they are closed if the function exits due to an error above.
errs := tsdb_errors.NewMulti()
for _, w := range closers {
errs.Add(w.Close())
}
closers = closers[:0] // Avoid closing the writers twice in the defer.
if errs.Err() != nil {
return errs.Err()
}
for _, ob := range outBlocks {
// Populated block is empty, don't write meta file for it.
if ob.meta.Stats.NumSamples == 0 {
continue
}
if _, err = writeMetaFile(c.logger, ob.tmpDir, ob.meta); err != nil {
return errors.Wrap(err, "write merged meta")
}
// Create an empty tombstones file.
if _, err := tombstones.WriteFile(c.logger, ob.tmpDir, tombstones.NewMemTombstones()); err != nil {
return errors.Wrap(err, "write new tombstones file")
}
df, err := fileutil.OpenDir(ob.tmpDir)
if err != nil {
return errors.Wrap(err, "open temporary block dir")
}
defer func() {
if df != nil {
df.Close()
}
}()
if err := df.Sync(); err != nil {
return errors.Wrap(err, "sync temporary dir file")
}
// Close temp dir before rename block dir (for windows platform).
if err = df.Close(); err != nil {
return errors.Wrap(err, "close temporary dir")
}
df = nil
// Block successfully written, make it visible in destination dir by moving it from tmp one.
if err := fileutil.Replace(ob.tmpDir, ob.blockDir); err != nil {
return errors.Wrap(err, "rename block dir")
}
}
return nil
}
func debugOutOfOrderChunks(chks []chunks.Meta, logger log.Logger) {
if len(chks) <= 1 {
return
}
prevChk := chks[0]
for i := 1; i < len(chks); i++ {
currChk := chks[i]
if currChk.MinTime > prevChk.MaxTime {
// Not out of order.
continue
}
// Looks like the chunk is out of order.
prevSafeChk, prevIsSafeChk := prevChk.Chunk.(*safeChunk)
currSafeChk, currIsSafeChk := currChk.Chunk.(*safeChunk)
// Get info out of safeChunk (if possible).
prevHeadChunkID := chunks.HeadChunkID(0)
currHeadChunkID := chunks.HeadChunkID(0)
prevLabels := labels.Labels{}
currLabels := labels.Labels{}
if prevSafeChk != nil {
prevHeadChunkID = prevSafeChk.cid
prevLabels = prevSafeChk.s.lset
}
if currSafeChk != nil {
currHeadChunkID = currSafeChk.cid
currLabels = currSafeChk.s.lset
}
level.Warn(logger).Log(
"msg", "found out-of-order chunk when compacting",
"prev_ref", prevChk.Ref,
"curr_ref", currChk.Ref,
"prev_min_time", timeFromMillis(prevChk.MinTime).UTC().String(),
"prev_max_time", timeFromMillis(prevChk.MaxTime).UTC().String(),
"curr_min_time", timeFromMillis(currChk.MinTime).UTC().String(),
"curr_max_time", timeFromMillis(currChk.MaxTime).UTC().String(),
"prev_samples", prevChk.Chunk.NumSamples(),
"curr_samples", currChk.Chunk.NumSamples(),
"prev_is_safe_chunk", prevIsSafeChk,
"curr_is_safe_chunk", currIsSafeChk,
"prev_head_chunk_id", prevHeadChunkID,
"curr_head_chunk_id", currHeadChunkID,
"prev_labelset", prevLabels.String(),
"curr_labelset", currLabels.String(),
"num_chunks_for_series", len(chks),
)
}
}
func timeFromMillis(ms int64) time.Time {
return time.Unix(0, ms*int64(time.Millisecond))
}
// populateBlock fills the index and chunk writers of output blocks with new data gathered as the union
// of the provided blocks.
// It expects sorted blocks input by mint.
// If there is more than 1 output block, each output block will only contain series that hash into its shard
// (based on total number of output blocks).
func (c *LeveledCompactor) populateBlock(blocks []BlockReader, minT, maxT int64, outBlocks []shardedBlock) (err error) {
if len(blocks) == 0 {
return errors.New("cannot populate block(s) from no readers")
}
var (
sets []storage.ChunkSeriesSet
symbolsSets []storage.ChunkSeriesSet // series sets used for finding symbols. Only used when doing sharding.
symbols index.StringIter
closers []io.Closer
overlapping bool
)
defer func() {
errs := tsdb_errors.NewMulti(err)
if cerr := tsdb_errors.CloseAll(closers); cerr != nil {
errs.Add(errors.Wrap(cerr, "close"))
}
err = errs.Err()
c.metrics.populatingBlocks.Set(0)
}()
c.metrics.populatingBlocks.Set(1)
globalMaxt := blocks[0].Meta().MaxTime
for i, b := range blocks {
select {
case <-c.ctx.Done():
return c.ctx.Err()
default:
}
if !overlapping {
if i > 0 && b.Meta().MinTime < globalMaxt {
c.metrics.overlappingBlocks.Inc()
overlapping = true
level.Info(c.logger).Log("msg", "Found overlapping blocks during compaction")
}
if b.Meta().MaxTime > globalMaxt {
globalMaxt = b.Meta().MaxTime
}
}
indexr, err := b.Index()
if err != nil {
return errors.Wrapf(err, "open index reader for block %+v", b.Meta())
}
closers = append(closers, indexr)
chunkr, err := b.Chunks()
if err != nil {
return errors.Wrapf(err, "open chunk reader for block %+v", b.Meta())
}
closers = append(closers, chunkr)
tombsr, err := b.Tombstones()
if err != nil {
return errors.Wrapf(err, "open tombstone reader for block %+v", b.Meta())
}
closers = append(closers, tombsr)
k, v := index.AllPostingsKey()
all, err := indexr.Postings(k, v)
if err != nil {
return err
}
all = indexr.SortedPostings(all)
// Blocks meta is half open: [min, max), so subtract 1 to ensure we don't hold samples with exact meta.MaxTime timestamp.
sets = append(sets, newBlockChunkSeriesSet(indexr, chunkr, tombsr, all, minT, maxT-1, false))
if len(outBlocks) > 1 {
// To iterate series when populating symbols, we cannot reuse postings we just got, but need to get a new copy.
// Postings can only be iterated once.
k, v = index.AllPostingsKey()
all, err = indexr.Postings(k, v)
if err != nil {
return err
}
all = indexr.SortedPostings(all)
// Blocks meta is half open: [min, max), so subtract 1 to ensure we don't hold samples with exact meta.MaxTime timestamp.
symbolsSets = append(symbolsSets, newBlockChunkSeriesSet(indexr, chunkr, tombsr, all, minT, maxT-1, false))
} else {
syms := indexr.Symbols()
if i == 0 {
symbols = syms
continue
}
symbols = NewMergedStringIter(symbols, syms)
}
}
if len(outBlocks) == 1 {
for symbols.Next() {
if err := outBlocks[0].indexw.AddSymbol(symbols.At()); err != nil {
return errors.Wrap(err, "add symbol")
}
}
if symbols.Err() != nil {
return errors.Wrap(symbols.Err(), "next symbol")
}
} else {
if err := c.populateSymbols(symbolsSets, outBlocks); err != nil {
return err
}
}
// Semaphore for number of blocks that can be closed at once.
sema := semaphore.NewWeighted(int64(c.concurrencyOpts.MaxClosingBlocks))
blockWriters := make([]*asyncBlockWriter, len(outBlocks))
for ix := range outBlocks {
blockWriters[ix] = newAsyncBlockWriter(c.chunkPool, outBlocks[ix].chunkw, outBlocks[ix].indexw, sema)
defer blockWriters[ix].closeAsync() // Make sure to close writer to stop goroutine.
}
set := sets[0]
if len(sets) > 1 {
// Merge series using specified chunk series merger.
// The default one is the compacting series merger.
set = storage.NewMergeChunkSeriesSet(sets, c.mergeFunc)
}
// Iterate over all sorted chunk series.
for set.Next() {
select {
case <-c.ctx.Done():
return c.ctx.Err()
default:
}
s := set.At()
chksIter := s.Iterator()
var chks []chunks.Meta
for chksIter.Next() {
// We are not iterating in streaming way over chunk as it's more efficient to do bulk write for index and
// chunk file purposes.
chks = append(chks, chksIter.At())
}
if chksIter.Err() != nil {
return errors.Wrap(chksIter.Err(), "chunk iter")
}
// Skip the series with all deleted chunks.
if len(chks) == 0 {
continue
}
debugOutOfOrderChunks(chks, c.logger)
obIx := uint64(0)
if len(outBlocks) > 1 {
obIx = s.Labels().Hash() % uint64(len(outBlocks))
}
err := blockWriters[obIx].addSeries(s.Labels(), chks)
if err != nil {
return errors.Wrap(err, "adding series")
}
}
if set.Err() != nil {
return errors.Wrap(set.Err(), "iterate compaction set")
}
for ix := range blockWriters {
blockWriters[ix].closeAsync()
}
for ix := range blockWriters {
stats, err := blockWriters[ix].waitFinished()
if err != nil {
return errors.Wrap(err, "writing block")
}
outBlocks[ix].meta.Stats = stats
}
return nil
}
// How many symbols we buffer in memory per output block.
const inMemorySymbolsLimit = 1_000_000
// populateSymbols writes symbols to output blocks. We need to iterate through all series to find
// which series belongs to what block. We collect symbols per sharded block, and then add sorted symbols to
// block's index.
func (c *LeveledCompactor) populateSymbols(sets []storage.ChunkSeriesSet, outBlocks []shardedBlock) error {
if len(outBlocks) == 0 {
return errors.New("no output block")
}
flushers := newSymbolFlushers(c.concurrencyOpts.SymbolsFlushersCount)
defer flushers.close() // Make sure to stop flushers before exiting to avoid leaking goroutines.
batchers := make([]*symbolsBatcher, len(outBlocks))
for ix := range outBlocks {
batchers[ix] = newSymbolsBatcher(inMemorySymbolsLimit, outBlocks[ix].tmpDir, flushers)
// Always include empty symbol. Blocks created from Head always have it in the symbols table,
// and if we only include symbols from series, we would skip it.
// It may not be required, but it's small and better be safe than sorry.
if err := batchers[ix].addSymbol(""); err != nil {
return errors.Wrap(err, "addSymbol to batcher")
}
}
seriesSet := sets[0]
if len(sets) > 1 {
seriesSet = storage.NewMergeChunkSeriesSet(sets, c.mergeFunc)
}
for seriesSet.Next() {
if err := c.ctx.Err(); err != nil {
return err
}
s := seriesSet.At()
obIx := s.Labels().Hash() % uint64(len(outBlocks))
for _, l := range s.Labels() {
if err := batchers[obIx].addSymbol(l.Name); err != nil {
return errors.Wrap(err, "addSymbol to batcher")
}
if err := batchers[obIx].addSymbol(l.Value); err != nil {
return errors.Wrap(err, "addSymbol to batcher")
}
}
}
for ix := range outBlocks {
// Flush the batcher to write remaining symbols.
if err := batchers[ix].flushSymbols(true); err != nil {
return errors.Wrap(err, "flushing batcher")
}
}
err := flushers.close()
if err != nil {
return errors.Wrap(err, "closing flushers")
}
for ix := range outBlocks {
if err := c.ctx.Err(); err != nil {
return err
}
symbolFiles := batchers[ix].getSymbolFiles()
it, err := newSymbolsIterator(symbolFiles)
if err != nil {
return errors.Wrap(err, "opening symbols iterator")
}
// Each symbols iterator must be closed to close underlying files.
closeIt := it
defer func() {
if closeIt != nil {
_ = closeIt.Close()
}
}()
var sym string
for sym, err = it.NextSymbol(); err == nil; sym, err = it.NextSymbol() {
err = outBlocks[ix].indexw.AddSymbol(sym)
if err != nil {
return errors.Wrap(err, "AddSymbol")
}
}
if err != io.EOF {
return errors.Wrap(err, "iterating symbols")
}
// if err == io.EOF, we have iterated through all symbols. We can close underlying
// files now.
closeIt = nil
_ = it.Close()
// Delete symbol files from symbolsBatcher. We don't need to perform the cleanup if populateSymbols
// or compaction fails, because in that case compactor already removes entire (temp) output block directory.
for _, fn := range symbolFiles {
if err := os.Remove(fn); err != nil {
return errors.Wrap(err, "deleting symbols file")
}
}
}
return nil
}
// Returns opened blocks, and blocks that should be closed (also returned in case of error).
func openBlocksForCompaction(dirs []string, open []*Block, logger log.Logger, pool chunkenc.Pool, concurrency int) (blocks, blocksToClose []*Block, _ error) {
blocks = make([]*Block, 0, len(dirs))
blocksToClose = make([]*Block, 0, len(dirs))
toOpenCh := make(chan string, len(dirs))
for _, d := range dirs {
meta, _, err := readMetaFile(d)
if err != nil {
return nil, blocksToClose, err
}
var b *Block
// Use already open blocks if we can, to avoid
// having the index data in memory twice.
for _, o := range open {
if meta.ULID == o.Meta().ULID {
b = o
break
}
}
if b != nil {
blocks = append(blocks, b)
} else {
toOpenCh <- d
}
}
close(toOpenCh)
type openResult struct {
b *Block
err error
}
openResultCh := make(chan openResult, len(toOpenCh))
// Signals to all opening goroutines that there was an error opening some block, and they can stop early.
// If openingError is true, at least one error is sent to openResultCh.
openingError := atomic.NewBool(false)
wg := sync.WaitGroup{}
if len(dirs) < concurrency {
concurrency = len(dirs)
}
for i := 0; i < concurrency; i++ {
wg.Add(1)
go func() {
defer wg.Done()
for d := range toOpenCh {
if openingError.Load() {
return
}
b, err := OpenBlock(logger, d, pool)
openResultCh <- openResult{b: b, err: err}
if err != nil {
openingError.Store(true)
return
}
}
}()
}
wg.Wait()
// All writers to openResultCh have stopped, we can close the output channel, so we can range over it.
close(openResultCh)
var firstErr error
for or := range openResultCh {
if or.err != nil {
// Don't stop on error, but iterate over all opened blocks to collect blocksToClose.
if firstErr == nil {
firstErr = or.err
}
} else {
blocks = append(blocks, or.b)
blocksToClose = append(blocksToClose, or.b)
}
}
return blocks, blocksToClose, firstErr
}