From c1b669bf9b0b1286ece53c977262089119783105 Mon Sep 17 00:00:00 2001 From: Jesus Vazquez Date: Tue, 20 Sep 2022 19:05:50 +0200 Subject: [PATCH] Add out-of-order sample support to the TSDB (#11075) * Introduce out-of-order TSDB support This implementation is based on this design doc: https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing This commit adds support to accept out-of-order ("OOO") samples into the TSDB up to a configurable time allowance. If OOO is enabled, overlapping queries are automatically enabled. Most of the additions have been borrowed from https://github.com/grafana/mimir-prometheus/ Here is the list of the original commits cherry picked from mimir-prometheus into this branch: - 4b2198d7ec47d50989b7c2df66b7b207c32f7f6e - 2836e5513f1bc591535a859f5d41154a75e7c6bc - 00b379c3a5b1ec3799699b6242f300a2b3ea30f0 - ff0dc757587cada63ca948d2d5eb00bf090d63e0 - a632c73352a7e39d60b445700beb47d691549c3e - c6f3d4ab339ab80bbbce74c9946237ced01f0509 - 5e8406a1d4a50d0052bbee83e28ca3b3371408aa - abde1e0ba128936b9eb0224ee1551e56216ebd4a - e70e7698897bb03860bee0467c733fa44e14c9bd - df59320886e03a555d379ac4b0b3130f661407e0 Co-authored-by: Jesus Vazquez Co-authored-by: Ganesh Vernekar Co-authored-by: Dieter Plaetinck Signed-off-by: Jesus Vazquez * gofumpt files Signed-off-by: Jesus Vazquez * Add license header to missing files Signed-off-by: Jesus Vazquez * Fix OOO tests due to existing chunk disk mapper implementation Signed-off-by: Jesus Vazquez * Fix truncate int overflow Signed-off-by: Jesus Vazquez * Add Sync method to the WAL and update tests Signed-off-by: Jesus Vazquez * remove useless sync Signed-off-by: Jesus Vazquez * Update minOOOTime after truncating Head * Update minOOOTime after truncating Head Signed-off-by: Ganesh Vernekar * Fix lint Signed-off-by: Ganesh Vernekar * Add a unit test Signed-off-by: Ganesh Vernekar Signed-off-by: Jesus Vazquez * Load OutOfOrderTimeWindow only once per appender Signed-off-by: 
Jesus Vazquez * Fix OOO Head LabelValues and PostingsForMatchers Signed-off-by: Jesus Vazquez * Fix replay of OOO mmap chunks Signed-off-by: Ganesh Vernekar * Remove unnecessary err check Signed-off-by: Jesus Vazquez * Prevent panic with ApplyConfig Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez * Run OOO compaction after restart if there is OOO data from WBL Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez * Apply Bartek's suggestions Co-authored-by: Bartlomiej Plotka Signed-off-by: Jesus Vazquez * Refactor OOO compaction Signed-off-by: Ganesh Vernekar * Address comments and TODOs - Added a comment explaining why we need the allow overlapping compaction toggle - Clarified TSDBConfig OutOfOrderTimeWindow doc - Added an owner to all the TODOs in the code Signed-off-by: Jesus Vazquez * Run go format Signed-off-by: Jesus Vazquez * Fix remaining review comments Signed-off-by: Ganesh Vernekar * Fix tests Signed-off-by: Ganesh Vernekar * Change wbl reference when truncating ooo in TestHeadMinOOOTimeUpdate Signed-off-by: Jesus Vazquez * Fix TestWBLAndMmapReplay test failure on windows Signed-off-by: Ganesh Vernekar * Address most of the feedback Signed-off-by: Ganesh Vernekar * Refactor the block meta for out of order Signed-off-by: Ganesh Vernekar * Fix windows error Signed-off-by: Ganesh Vernekar * Fix review comments Signed-off-by: Ganesh Vernekar Signed-off-by: Jesus Vazquez Signed-off-by: Ganesh Vernekar Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Co-authored-by: Ganesh Vernekar Co-authored-by: Dieter Plaetinck Co-authored-by: Oleg Zaytsev Co-authored-by: Bartlomiej Plotka --- cmd/prometheus/main.go | 8 +- cmd/promtool/rules_test.go | 6 +- cmd/promtool/tsdb.go | 2 +- config/config.go | 28 + storage/interface.go | 11 +- storage/merge.go | 53 + 
storage/merge_test.go | 134 ++ tsdb/agent/db.go | 3 +- tsdb/block.go | 29 +- tsdb/block_test.go | 63 +- tsdb/blockwriter.go | 4 +- tsdb/chunkenc/chunk.go | 21 +- tsdb/chunkenc/xor.go | 9 + tsdb/chunks/chunks.go | 13 +- tsdb/chunks/chunks_test.go | 2 +- tsdb/chunks/head_chunks.go | 33 +- tsdb/chunks/head_chunks_test.go | 50 +- tsdb/compact_test.go | 2 +- tsdb/db.go | 394 +++++- tsdb/db_test.go | 2020 ++++++++++++++++++++++++++++++- tsdb/head.go | 489 ++++++-- tsdb/head_append.go | 293 ++++- tsdb/head_bench_test.go | 6 +- tsdb/head_read.go | 269 +++- tsdb/head_read_test.go | 178 +++ tsdb/head_test.go | 510 +++++++- tsdb/head_wal.go | 315 ++++- tsdb/ooo_head.go | 159 +++ tsdb/ooo_head_read.go | 433 +++++++ tsdb/ooo_head_read_test.go | 1207 ++++++++++++++++++ tsdb/ooo_head_test.go | 93 ++ tsdb/querier.go | 4 +- tsdb/querier_bench_test.go | 4 +- tsdb/querier_test.go | 56 +- tsdb/record/record.go | 52 +- tsdb/wal/wal.go | 65 +- tsdb/wal/watcher_test.go | 15 +- web/api/v1/api_test.go | 2 +- 38 files changed, 6655 insertions(+), 380 deletions(-) create mode 100644 tsdb/head_read_test.go create mode 100644 tsdb/ooo_head.go create mode 100644 tsdb/ooo_head_read.go create mode 100644 tsdb/ooo_head_read_test.go create mode 100644 tsdb/ooo_head_test.go diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index ba267dca17..596e962fd2 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -463,6 +463,9 @@ func main() { } cfg.tsdb.MaxExemplars = int64(cfgFile.StorageConfig.ExemplarsConfig.MaxExemplars) } + if cfgFile.StorageConfig.TSDBConfig != nil { + cfg.tsdb.OutOfOrderTimeWindow = cfgFile.StorageConfig.TSDBConfig.OutOfOrderTimeWindow + } // Now that the validity of the config is established, set the config // success metrics accordingly, although the config isn't really loaded @@ -1537,6 +1540,7 @@ type tsdbOptions struct { StripeSize int MinBlockDuration model.Duration MaxBlockDuration model.Duration + OutOfOrderTimeWindow int64 EnableExemplarStorage bool 
MaxExemplars int64 EnableMemorySnapshotOnShutdown bool @@ -1549,7 +1553,8 @@ func (opts tsdbOptions) ToTSDBOptions() tsdb.Options { RetentionDuration: int64(time.Duration(opts.RetentionDuration) / time.Millisecond), MaxBytes: int64(opts.MaxBytes), NoLockfile: opts.NoLockfile, - AllowOverlappingBlocks: opts.AllowOverlappingBlocks, + AllowOverlappingCompaction: opts.AllowOverlappingBlocks, + AllowOverlappingQueries: opts.AllowOverlappingBlocks, WALCompression: opts.WALCompression, HeadChunksWriteQueueSize: opts.HeadChunksWriteQueueSize, StripeSize: opts.StripeSize, @@ -1558,6 +1563,7 @@ func (opts tsdbOptions) ToTSDBOptions() tsdb.Options { EnableExemplarStorage: opts.EnableExemplarStorage, MaxExemplars: opts.MaxExemplars, EnableMemorySnapshotOnShutdown: opts.EnableMemorySnapshotOnShutdown, + OutOfOrderTimeWindow: opts.OutOfOrderTimeWindow, } } diff --git a/cmd/promtool/rules_test.go b/cmd/promtool/rules_test.go index 1248c26bb0..a184311e5a 100644 --- a/cmd/promtool/rules_test.go +++ b/cmd/promtool/rules_test.go @@ -117,7 +117,8 @@ func TestBackfillRuleIntegration(t *testing.T) { } opts := tsdb.DefaultOptions() - opts.AllowOverlappingBlocks = true + opts.AllowOverlappingQueries = true + opts.AllowOverlappingCompaction = true db, err := tsdb.Open(tmpDir, nil, nil, opts, nil) require.NoError(t, err) @@ -245,7 +246,8 @@ func TestBackfillLabels(t *testing.T) { } opts := tsdb.DefaultOptions() - opts.AllowOverlappingBlocks = true + opts.AllowOverlappingQueries = true + opts.AllowOverlappingCompaction = true db, err := tsdb.Open(tmpDir, nil, nil, opts, nil) require.NoError(t, err) diff --git a/cmd/promtool/tsdb.go b/cmd/promtool/tsdb.go index 7707a99043..7c7c8f6ec0 100644 --- a/cmd/promtool/tsdb.go +++ b/cmd/promtool/tsdb.go @@ -597,7 +597,7 @@ func analyzeCompaction(block tsdb.BlockReader, indexr tsdb.IndexReader) (err err for _, chk := range chks { // Load the actual data of the chunk. 
- chk, err := chunkr.Chunk(chk.Ref) + chk, err := chunkr.Chunk(chk) if err != nil { return err } diff --git a/config/config.go b/config/config.go index 036faaeef7..a13f397f81 100644 --- a/config/config.go +++ b/config/config.go @@ -501,9 +501,37 @@ func (c *ScrapeConfig) MarshalYAML() (interface{}, error) { // StorageConfig configures runtime reloadable configuration options. type StorageConfig struct { + TSDBConfig *TSDBConfig `yaml:"tsdb,omitempty"` ExemplarsConfig *ExemplarsConfig `yaml:"exemplars,omitempty"` } +// TSDBConfig configures runtime reloadable configuration options. +type TSDBConfig struct { + // OutOfOrderTimeWindow sets how long back in time an out-of-order sample can be inserted + // into the TSDB. This flag is typically set while unmarshaling the configuration file and translating + // OutOfOrderTimeWindowFlag's duration. The unit of this flag is expected to be the same as any + // other timestamp in the TSDB. + OutOfOrderTimeWindow int64 + + // OutOfOrderTimeWindowFlag holds the parsed duration from the config file. + // During unmarshall, this is converted into milliseconds and stored in OutOfOrderTimeWindow. + // This should not be used directly and must be converted into OutOfOrderTimeWindow. + OutOfOrderTimeWindowFlag model.Duration `yaml:"out_of_order_time_window,omitempty"` +} + +// UnmarshalYAML implements the yaml.Unmarshaler interface. +func (t *TSDBConfig) UnmarshalYAML(unmarshal func(interface{}) error) error { + *t = TSDBConfig{} + type plain TSDBConfig + if err := unmarshal((*plain)(t)); err != nil { + return err + } + + t.OutOfOrderTimeWindow = time.Duration(t.OutOfOrderTimeWindowFlag).Milliseconds() + + return nil +} + type TracingClientType string const ( diff --git a/storage/interface.go b/storage/interface.go index f5af49eb73..d73ec72203 100644 --- a/storage/interface.go +++ b/storage/interface.go @@ -27,10 +27,15 @@ import ( // The errors exposed. 
var ( - ErrNotFound = errors.New("not found") - ErrOutOfOrderSample = errors.New("out of order sample") + ErrNotFound = errors.New("not found") + // ErrOutOfOrderSample is when out of order support is disabled and the sample is out of order. + ErrOutOfOrderSample = errors.New("out of order sample") + // ErrOutOfBounds is when out of order support is disabled and the sample is older than the min valid time for the append. + ErrOutOfBounds = errors.New("out of bounds") + // ErrTooOldSample is when out of order support is enabled but the sample is outside the time window allowed. + ErrTooOldSample = errors.New("too old sample") + // ErrDuplicateSampleForTimestamp is when the sample has same timestamp but different value. ErrDuplicateSampleForTimestamp = errors.New("duplicate sample for timestamp") - ErrOutOfBounds = errors.New("out of bounds") ErrOutOfOrderExemplar = errors.New("out of order exemplar") ErrDuplicateExemplar = errors.New("duplicate exemplar") ErrExemplarLabelLength = fmt.Errorf("label length for exemplar exceeds maximum of %d UTF-8 characters", exemplar.ExemplarMaxLabelSetLength) diff --git a/storage/merge.go b/storage/merge.go index 7726f9bdc9..2f175d3e7e 100644 --- a/storage/merge.go +++ b/storage/merge.go @@ -717,3 +717,56 @@ func (h *chunkIteratorHeap) Pop() interface{} { *h = old[0 : n-1] return x } + +// NewConcatenatingChunkSeriesMerger returns a VerticalChunkSeriesMergeFunc that simply concatenates the +// chunks from the series. The resultant stream of chunks for a series might be overlapping and unsorted. 
+func NewConcatenatingChunkSeriesMerger() VerticalChunkSeriesMergeFunc { + return func(series ...ChunkSeries) ChunkSeries { + if len(series) == 0 { + return nil + } + return &ChunkSeriesEntry{ + Lset: series[0].Labels(), + ChunkIteratorFn: func() chunks.Iterator { + iterators := make([]chunks.Iterator, 0, len(series)) + for _, s := range series { + iterators = append(iterators, s.Iterator()) + } + return &concatenatingChunkIterator{ + iterators: iterators, + } + }, + } + } +} + +type concatenatingChunkIterator struct { + iterators []chunks.Iterator + idx int + + curr chunks.Meta +} + +func (c *concatenatingChunkIterator) At() chunks.Meta { + return c.curr +} + +func (c *concatenatingChunkIterator) Next() bool { + if c.idx >= len(c.iterators) { + return false + } + if c.iterators[c.idx].Next() { + c.curr = c.iterators[c.idx].At() + return true + } + c.idx++ + return c.Next() +} + +func (c *concatenatingChunkIterator) Err() error { + errs := tsdb_errors.NewMulti() + for _, iter := range c.iterators { + errs.Add(iter.Err()) + } + return errs.Err() +} diff --git a/storage/merge_test.go b/storage/merge_test.go index 90bc1f9d0e..36ce726b1c 100644 --- a/storage/merge_test.go +++ b/storage/merge_test.go @@ -499,6 +499,140 @@ func TestCompactingChunkSeriesMerger(t *testing.T) { } } +func TestConcatenatingChunkSeriesMerger(t *testing.T) { + m := NewConcatenatingChunkSeriesMerger() + + for _, tc := range []struct { + name string + input []ChunkSeries + expected ChunkSeries + }{ + { + name: "single empty series", + input: []ChunkSeries{ + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil), + }, + expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil), + }, + { + name: "single series", + input: []ChunkSeries{ + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}}), + }, + expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), 
[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}}), + }, + { + name: "two empty series", + input: []ChunkSeries{ + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil), + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil), + }, + expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil, nil), + }, + { + name: "two non overlapping", + input: []ChunkSeries{ + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{5, 5}}), + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}), + }, + expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{5, 5}}, []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}), + }, + { + name: "two overlapping", + input: []ChunkSeries{ + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{8, 8}}), + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}), + }, + expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), + []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{8, 8}}, + []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}, + ), + }, + { + name: "two duplicated", + input: []ChunkSeries{ + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}), + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{5, 5}}), + }, + expected: 
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), + []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}, + []tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{5, 5}}, + ), + }, + { + name: "three overlapping", + input: []ChunkSeries{ + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}), + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{6, 6}}), + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{0, 0}, sample{4, 4}}), + }, + expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), + []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}, + []tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{6, 6}}, + []tsdbutil.Sample{sample{0, 0}, sample{4, 4}}, + ), + }, + { + name: "three in chained overlap", + input: []ChunkSeries{ + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}), + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{4, 4}, sample{6, 66}}), + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{6, 6}, sample{10, 10}}), + }, + expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), + []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}, + []tsdbutil.Sample{sample{4, 4}, sample{6, 66}}, + []tsdbutil.Sample{sample{6, 6}, sample{10, 10}}, + ), + }, + { + name: "three in chained overlap complex", + input: []ChunkSeries{ + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{0, 0}, sample{5, 5}}, []tsdbutil.Sample{sample{10, 10}, sample{15, 15}}), + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{2, 2}, sample{20, 20}}, 
[]tsdbutil.Sample{sample{25, 25}, sample{30, 30}}), + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{18, 18}, sample{26, 26}}, []tsdbutil.Sample{sample{31, 31}, sample{35, 35}}), + }, + expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), + []tsdbutil.Sample{sample{0, 0}, sample{5, 5}}, []tsdbutil.Sample{sample{10, 10}, sample{15, 15}}, + []tsdbutil.Sample{sample{2, 2}, sample{20, 20}}, []tsdbutil.Sample{sample{25, 25}, sample{30, 30}}, + []tsdbutil.Sample{sample{18, 18}, sample{26, 26}}, []tsdbutil.Sample{sample{31, 31}, sample{35, 35}}, + ), + }, + { + name: "110 overlapping", + input: []ChunkSeries{ + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(0, 110)), // [0 - 110) + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(60, 50)), // [60 - 110) + }, + expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), + tsdbutil.GenerateSamples(0, 110), + tsdbutil.GenerateSamples(60, 50), + ), + }, + { + name: "150 overlapping samples, simply concatenated and no splits", + input: []ChunkSeries{ + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(0, 90)), // [0 - 90) + NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(60, 90)), // [90 - 150) + }, + expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), + tsdbutil.GenerateSamples(0, 90), + tsdbutil.GenerateSamples(60, 90), + ), + }, + } { + t.Run(tc.name, func(t *testing.T) { + merged := m(tc.input...) 
+ require.Equal(t, tc.expected.Labels(), merged.Labels()) + actChks, actErr := ExpandChunks(merged.Iterator()) + expChks, expErr := ExpandChunks(tc.expected.Iterator()) + + require.Equal(t, expErr, actErr) + require.Equal(t, expChks, actChks) + }) + } +} + type mockQuerier struct { LabelQuerier diff --git a/tsdb/agent/db.go b/tsdb/agent/db.go index 3feb55623a..e210cdc99e 100644 --- a/tsdb/agent/db.go +++ b/tsdb/agent/db.go @@ -567,8 +567,7 @@ func (db *DB) truncate(mint int64) error { // Start a new segment so low ingestion volume instances don't have more WAL // than needed. - err = db.wal.NextSegment() - if err != nil { + if _, err := db.wal.NextSegment(); err != nil { return errors.Wrap(err, "next segment") } diff --git a/tsdb/block.go b/tsdb/block.go index 6b8b65dda7..8fd1066ba2 100644 --- a/tsdb/block.go +++ b/tsdb/block.go @@ -116,7 +116,7 @@ type ChunkWriter interface { // ChunkReader provides reading access of serialized time series data. type ChunkReader interface { // Chunk returns the series data chunk with the given reference. - Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error) + Chunk(meta chunks.Meta) (chunkenc.Chunk, error) // Close releases all underlying resources of the reader. Close() error @@ -189,12 +189,39 @@ type BlockMetaCompaction struct { // this block. Parents []BlockDesc `json:"parents,omitempty"` Failed bool `json:"failed,omitempty"` + // Additional information about the compaction, for example, block created from out-of-order chunks. 
+ Hints []string `json:"hints,omitempty"` +} + +func (bm *BlockMetaCompaction) SetOutOfOrder() { + if bm.containsHint(CompactionHintFromOutOfOrder) { + return + } + bm.Hints = append(bm.Hints, CompactionHintFromOutOfOrder) + sort.Strings(bm.Hints) +} + +func (bm *BlockMetaCompaction) FromOutOfOrder() bool { + return bm.containsHint(CompactionHintFromOutOfOrder) +} + +func (bm *BlockMetaCompaction) containsHint(hint string) bool { + for _, h := range bm.Hints { + if h == hint { + return true + } + } + return false } const ( indexFilename = "index" metaFilename = "meta.json" metaVersion1 = 1 + + // CompactionHintFromOutOfOrder is a hint noting that the block + // was created from out-of-order chunks. + CompactionHintFromOutOfOrder = "from-out-of-order" ) func chunkDir(dir string) string { return filepath.Join(dir, "chunks") } diff --git a/tsdb/block_test.go b/tsdb/block_test.go index cf208caf1b..9ebd823d31 100644 --- a/tsdb/block_test.go +++ b/tsdb/block_test.go @@ -27,6 +27,7 @@ import ( "testing" "github.com/go-kit/log" + prom_testutil "github.com/prometheus/client_golang/prometheus/testutil" "github.com/stretchr/testify/require" "github.com/prometheus/prometheus/model/labels" @@ -487,7 +488,7 @@ func createBlockFromHead(tb testing.TB, dir string, head *Head) string { func createHead(tb testing.TB, w *wal.WAL, series []storage.Series, chunkDir string) *Head { opts := DefaultHeadOptions() opts.ChunkDirRoot = chunkDir - head, err := NewHead(nil, nil, w, opts, nil) + head, err := NewHead(nil, nil, w, nil, opts, nil) require.NoError(tb, err) app := head.Appender(context.Background()) @@ -506,6 +507,66 @@ func createHead(tb testing.TB, w *wal.WAL, series []storage.Series, chunkDir str return head } +func createHeadWithOOOSamples(tb testing.TB, w *wal.WAL, series []storage.Series, chunkDir string, oooSampleFrequency int) *Head { + opts := DefaultHeadOptions() + opts.ChunkDirRoot = chunkDir + opts.OutOfOrderTimeWindow.Store(10000000000) + head, err := NewHead(nil, nil, w, 
nil, opts, nil) + require.NoError(tb, err) + + oooSampleLabels := make([]labels.Labels, 0, len(series)) + oooSamples := make([]tsdbutil.SampleSlice, 0, len(series)) + + totalSamples := 0 + app := head.Appender(context.Background()) + for _, s := range series { + ref := storage.SeriesRef(0) + it := s.Iterator() + lset := s.Labels() + os := tsdbutil.SampleSlice{} + count := 0 + for it.Next() { + totalSamples++ + count++ + t, v := it.At() + if count%oooSampleFrequency == 0 { + os = append(os, sample{t: t, v: v}) + continue + } + ref, err = app.Append(ref, lset, t, v) + require.NoError(tb, err) + } + require.NoError(tb, it.Err()) + if len(os) > 0 { + oooSampleLabels = append(oooSampleLabels, lset) + oooSamples = append(oooSamples, os) + } + } + require.NoError(tb, app.Commit()) + + oooSamplesAppended := 0 + require.Equal(tb, float64(0), prom_testutil.ToFloat64(head.metrics.outOfOrderSamplesAppended)) + + app = head.Appender(context.Background()) + for i, lset := range oooSampleLabels { + ref := storage.SeriesRef(0) + for _, sample := range oooSamples[i] { + ref, err = app.Append(ref, lset, sample.T(), sample.V()) + require.NoError(tb, err) + oooSamplesAppended++ + } + } + require.NoError(tb, app.Commit()) + + actOOOAppended := prom_testutil.ToFloat64(head.metrics.outOfOrderSamplesAppended) + require.GreaterOrEqual(tb, actOOOAppended, float64(oooSamplesAppended-len(series))) + require.LessOrEqual(tb, actOOOAppended, float64(oooSamplesAppended)) + + require.Equal(tb, float64(totalSamples), prom_testutil.ToFloat64(head.metrics.samplesAppended)) + + return head +} + const ( defaultLabelName = "labelName" defaultLabelValue = "labelValue" diff --git a/tsdb/blockwriter.go b/tsdb/blockwriter.go index 09b355368d..4db3079975 100644 --- a/tsdb/blockwriter.go +++ b/tsdb/blockwriter.go @@ -39,7 +39,7 @@ type BlockWriter struct { } // ErrNoSeriesAppended is returned if the series count is zero while flushing blocks. 
-var ErrNoSeriesAppended error = errors.New("no series appended, aborting") +var ErrNoSeriesAppended = errors.New("no series appended, aborting") // NewBlockWriter create a new block writer. // @@ -71,7 +71,7 @@ func (w *BlockWriter) initHead() error { opts := DefaultHeadOptions() opts.ChunkRange = w.blockSize opts.ChunkDirRoot = w.chunkDir - h, err := NewHead(nil, w.logger, nil, opts, NewHeadStats()) + h, err := NewHead(nil, w.logger, nil, nil, opts, NewHeadStats()) if err != nil { return errors.Wrap(err, "tsdb.NewHead") } diff --git a/tsdb/chunkenc/chunk.go b/tsdb/chunkenc/chunk.go index bffb7e75ab..c5f8036a71 100644 --- a/tsdb/chunkenc/chunk.go +++ b/tsdb/chunkenc/chunk.go @@ -39,6 +39,21 @@ const ( EncXOR ) +// Chunk encodings for out-of-order chunks. +// These encodings must be only used by the Head block for its internal bookkeeping. +const ( + OutOfOrderMask = 0b10000000 + EncOOOXOR = EncXOR | OutOfOrderMask +) + +func IsOutOfOrderChunk(e Encoding) bool { + return (e & OutOfOrderMask) != 0 +} + +func IsValidEncoding(e Encoding) bool { + return e == EncXOR || e == EncOOOXOR +} + // Chunk holds a sequence of sample pairs that can be iterated over and appended to. type Chunk interface { // Bytes returns the underlying byte slice of the chunk. @@ -155,7 +170,7 @@ func NewPool() Pool { func (p *pool) Get(e Encoding, b []byte) (Chunk, error) { switch e { - case EncXOR: + case EncXOR, EncOOOXOR: c := p.xor.Get().(*XORChunk) c.b.stream = b c.b.count = 0 @@ -166,7 +181,7 @@ func (p *pool) Get(e Encoding, b []byte) (Chunk, error) { func (p *pool) Put(c Chunk) error { switch c.Encoding() { - case EncXOR: + case EncXOR, EncOOOXOR: xc, ok := c.(*XORChunk) // This may happen often with wrapped chunks. Nothing we can really do about // it but returning an error would cause a lot of allocations again. Thus, @@ -188,7 +203,7 @@ func (p *pool) Put(c Chunk) error { // bytes. 
func FromData(e Encoding, d []byte) (Chunk, error) { switch e { - case EncXOR: + case EncXOR, EncOOOXOR: return &XORChunk{b: bstream{count: 0, stream: d}}, nil } return nil, errors.Errorf("invalid chunk encoding %q", e) diff --git a/tsdb/chunkenc/xor.go b/tsdb/chunkenc/xor.go index ba00a6e811..716f0698f0 100644 --- a/tsdb/chunkenc/xor.go +++ b/tsdb/chunkenc/xor.go @@ -457,3 +457,12 @@ func (it *xorIterator) readValue() bool { it.numRead++ return true } + +// OOOXORChunk holds a XORChunk and overrides the Encoding() method. +type OOOXORChunk struct { + *XORChunk +} + +func (c *OOOXORChunk) Encoding() Encoding { + return EncOOOXOR +} diff --git a/tsdb/chunks/chunks.go b/tsdb/chunks/chunks.go index a88884a2e6..6d04998e80 100644 --- a/tsdb/chunks/chunks.go +++ b/tsdb/chunks/chunks.go @@ -121,6 +121,15 @@ type Meta struct { // Time range the data covers. // When MaxTime == math.MaxInt64 the chunk is still open and being appended to. MinTime, MaxTime int64 + + // OOOLastRef, OOOLastMinTime and OOOLastMaxTime are kept as markers for + // overlapping chunks. + // These fields point to the last created out of order Chunk (the head) that existed + // when Series() was called and was overlapping. + // Series() and Chunk() method responses should be consistent for the same + // query even if new data is added in between the calls. + OOOLastRef ChunkRef + OOOLastMinTime, OOOLastMaxTime int64 } // Iterator iterates over the chunks of a single time series. @@ -556,8 +565,8 @@ func (s *Reader) Size() int64 { } // Chunk returns a chunk from a given reference. 
-func (s *Reader) Chunk(ref ChunkRef) (chunkenc.Chunk, error) { - sgmIndex, chkStart := BlockChunkRef(ref).Unpack() +func (s *Reader) Chunk(meta Meta) (chunkenc.Chunk, error) { + sgmIndex, chkStart := BlockChunkRef(meta.Ref).Unpack() if sgmIndex >= len(s.bs) { return nil, errors.Errorf("segment index %d out of range", sgmIndex) diff --git a/tsdb/chunks/chunks_test.go b/tsdb/chunks/chunks_test.go index 6a4d13db82..affaa4b9f1 100644 --- a/tsdb/chunks/chunks_test.go +++ b/tsdb/chunks/chunks_test.go @@ -23,6 +23,6 @@ func TestReaderWithInvalidBuffer(t *testing.T) { b := realByteSlice([]byte{0x81, 0x81, 0x81, 0x81, 0x81, 0x81}) r := &Reader{bs: []ByteSlice{b}} - _, err := r.Chunk(0) + _, err := r.Chunk(Meta{Ref: 0}) require.Error(t, err) } diff --git a/tsdb/chunks/head_chunks.go b/tsdb/chunks/head_chunks.go index edd7dd5419..dce874a35f 100644 --- a/tsdb/chunks/head_chunks.go +++ b/tsdb/chunks/head_chunks.go @@ -87,6 +87,18 @@ func (ref ChunkDiskMapperRef) Unpack() (seq, offset int) { return seq, offset } +func (ref ChunkDiskMapperRef) GreaterThanOrEqualTo(r ChunkDiskMapperRef) bool { + s1, o1 := ref.Unpack() + s2, o2 := r.Unpack() + return s1 > s2 || (s1 == s2 && o1 >= o2) +} + +func (ref ChunkDiskMapperRef) GreaterThan(r ChunkDiskMapperRef) bool { + s1, o1 := ref.Unpack() + s2, o2 := r.Unpack() + return s1 > s2 || (s1 == s2 && o1 > o2) +} + // CorruptionErr is an error that's returned when corruption is encountered. type CorruptionErr struct { Dir string @@ -736,7 +748,7 @@ func (cdm *ChunkDiskMapper) Chunk(ref ChunkDiskMapperRef) (chunkenc.Chunk, error // and runs the provided function with information about each chunk. It returns on the first error encountered. // NOTE: This method needs to be called at least once after creating ChunkDiskMapper // to set the maxt of all the file. 
-func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16) error) (err error) { +func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding) error) (err error) { cdm.writePathMtx.Lock() defer cdm.writePathMtx.Unlock() @@ -799,7 +811,8 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu break } - idx += ChunkEncodingSize // Skip encoding. + chkEnc := chunkenc.Encoding(mmapFile.byteSlice.Range(idx, idx+ChunkEncodingSize)[0]) + idx += ChunkEncodingSize dataLen, n := binary.Uvarint(mmapFile.byteSlice.Range(idx, idx+MaxChunkLengthFieldSize)) idx += n @@ -834,7 +847,7 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu mmapFile.maxt = maxt } - if err := f(seriesRef, chunkRef, mint, maxt, numSamples); err != nil { + if err := f(seriesRef, chunkRef, mint, maxt, numSamples, chkEnc); err != nil { if cerr, ok := err.(*CorruptionErr); ok { cerr.Dir = cdm.dir.Name() cerr.FileIndex = segID @@ -857,12 +870,8 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu return nil } -// Truncate deletes the head chunk files which are strictly below the mint. -// mint should be in milliseconds. -func (cdm *ChunkDiskMapper) Truncate(mint int64) error { - if !cdm.fileMaxtSet { - return errors.New("maxt of the files are not set") - } +// Truncate deletes the head chunk files whose file number is less than given fileNo. 
+func (cdm *ChunkDiskMapper) Truncate(fileNo uint32) error { cdm.readPathMtx.RLock() // Sort the file indices, else if files deletion fails in between, @@ -875,12 +884,10 @@ func (cdm *ChunkDiskMapper) Truncate(mint int64) error { var removedFiles []int for _, seq := range chkFileIndices { - if seq == cdm.curFileSequence || cdm.mmappedChunkFiles[seq].maxt >= mint { + if seq == cdm.curFileSequence || uint32(seq) >= fileNo { break } - if cdm.mmappedChunkFiles[seq].maxt < mint { - removedFiles = append(removedFiles, seq) - } + removedFiles = append(removedFiles, seq) } cdm.readPathMtx.RUnlock() diff --git a/tsdb/chunks/head_chunks_test.go b/tsdb/chunks/head_chunks_test.go index cc4fc2c09f..68a44479a8 100644 --- a/tsdb/chunks/head_chunks_test.go +++ b/tsdb/chunks/head_chunks_test.go @@ -58,6 +58,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) { mint, maxt int64 numSamples uint16 chunk chunkenc.Chunk + isOOO bool } expectedData := []expectedDataType{} @@ -67,7 +68,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) { for hrw.curFileSequence < 3 || hrw.chkWriter.Buffered() == 0 { addChunks := func(numChunks int) { for i := 0; i < numChunks; i++ { - seriesRef, chkRef, mint, maxt, chunk := createChunk(t, totalChunks, hrw) + seriesRef, chkRef, mint, maxt, chunk, isOOO := createChunk(t, totalChunks, hrw) totalChunks++ expectedData = append(expectedData, expectedDataType{ seriesRef: seriesRef, @@ -76,6 +77,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) { chunkRef: chkRef, chunk: chunk, numSamples: uint16(chunk.NumSamples()), + isOOO: isOOO, }) if hrw.curFileSequence != 1 { @@ -147,7 +149,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) { hrw = createChunkDiskMapper(t, dir) idx := 0 - require.NoError(t, hrw.IterateAllChunks(func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16) error { + require.NoError(t, 
hrw.IterateAllChunks(func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding) error { t.Helper() expData := expectedData[idx] @@ -156,6 +158,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) { require.Equal(t, expData.maxt, maxt) require.Equal(t, expData.maxt, maxt) require.Equal(t, expData.numSamples, numSamples) + require.Equal(t, expData.isOOO, chunkenc.IsOutOfOrderChunk(encoding)) actChunk, err := hrw.Chunk(expData.chunkRef) require.NoError(t, err) @@ -178,9 +181,7 @@ func TestChunkDiskMapper_Truncate(t *testing.T) { }() timeRange := 0 - fileTimeStep := 100 - var thirdFileMinT, sixthFileMinT int64 - addChunk := func() int { + addChunk := func() { t.Helper() step := 100 @@ -194,8 +195,6 @@ func TestChunkDiskMapper_Truncate(t *testing.T) { <-awaitCb require.NoError(t, err) timeRange += step - - return mint } verifyFiles := func(remainingFiles []int) { @@ -216,17 +215,12 @@ func TestChunkDiskMapper_Truncate(t *testing.T) { // Create segments 1 to 7. for i := 1; i <= 7; i++ { hrw.CutNewFile() - mint := int64(addChunk()) - if i == 3 { - thirdFileMinT = mint - } else if i == 6 { - sixthFileMinT = mint - } + addChunk() } verifyFiles([]int{1, 2, 3, 4, 5, 6, 7}) // Truncating files. - require.NoError(t, hrw.Truncate(thirdFileMinT)) + require.NoError(t, hrw.Truncate(3)) // Add a chunk to trigger cutting of new file. addChunk() @@ -245,11 +239,11 @@ func TestChunkDiskMapper_Truncate(t *testing.T) { verifyFiles([]int{3, 4, 5, 6, 7, 8, 9}) // Truncating files after restart. - require.NoError(t, hrw.Truncate(sixthFileMinT)) + require.NoError(t, hrw.Truncate(6)) verifyFiles([]int{6, 7, 8, 9}) // Truncating a second time without adding a chunk shouldn't create a new file. - require.NoError(t, hrw.Truncate(sixthFileMinT+1)) + require.NoError(t, hrw.Truncate(6)) verifyFiles([]int{6, 7, 8, 9}) // Add a chunk to trigger cutting of new file. 
@@ -257,8 +251,12 @@ func TestChunkDiskMapper_Truncate(t *testing.T) { verifyFiles([]int{6, 7, 8, 9, 10}) + // Truncation by file number. + require.NoError(t, hrw.Truncate(8)) + verifyFiles([]int{8, 9, 10}) + // Truncating till current time should not delete the current active file. - require.NoError(t, hrw.Truncate(int64(timeRange+(2*fileTimeStep)))) + require.NoError(t, hrw.Truncate(10)) // Add a chunk to trigger cutting of new file. addChunk() @@ -335,8 +333,7 @@ func TestChunkDiskMapper_Truncate_PreservesFileSequence(t *testing.T) { // Truncating files till 2. It should not delete anything after 3 (inclusive) // though files 4 and 6 are empty. - file2Maxt := hrw.mmappedChunkFiles[2].maxt - require.NoError(t, hrw.Truncate(file2Maxt+1)) + require.NoError(t, hrw.Truncate(3)) verifyFiles([]int{3, 4, 5, 6}) // Add chunk, so file 6 is not empty anymore. @@ -344,8 +341,7 @@ func TestChunkDiskMapper_Truncate_PreservesFileSequence(t *testing.T) { verifyFiles([]int{3, 4, 5, 6}) // Truncating till file 3 should also delete file 4, because it is empty. - file3Maxt := hrw.mmappedChunkFiles[3].maxt - require.NoError(t, hrw.Truncate(file3Maxt+1)) + require.NoError(t, hrw.Truncate(5)) addChunk() verifyFiles([]int{5, 6, 7}) @@ -381,7 +377,7 @@ func TestHeadReadWriter_TruncateAfterFailedIterateChunks(t *testing.T) { hrw = createChunkDiskMapper(t, dir) // Forcefully failing IterateAllChunks. 
- require.Error(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16) error { + require.Error(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16, _ chunkenc.Encoding) error { return errors.New("random error") })) @@ -471,7 +467,9 @@ func createChunkDiskMapper(t *testing.T, dir string) *ChunkDiskMapper { hrw, err := NewChunkDiskMapper(nil, dir, chunkenc.NewPool(), DefaultWriteBufferSize, writeQueueSize) require.NoError(t, err) require.False(t, hrw.fileMaxtSet) - require.NoError(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16) error { return nil })) + require.NoError(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16, _ chunkenc.Encoding) error { + return nil + })) require.True(t, hrw.fileMaxtSet) return hrw @@ -488,13 +486,17 @@ func randomChunk(t *testing.T) chunkenc.Chunk { return chunk } -func createChunk(t *testing.T, idx int, hrw *ChunkDiskMapper) (seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, chunk chunkenc.Chunk) { +func createChunk(t *testing.T, idx int, hrw *ChunkDiskMapper) (seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, chunk chunkenc.Chunk, isOOO bool) { var err error seriesRef = HeadSeriesRef(rand.Int63()) mint = int64((idx)*1000 + 1) maxt = int64((idx + 1) * 1000) chunk = randomChunk(t) awaitCb := make(chan struct{}) + if rand.Intn(2) == 0 { + isOOO = true + chunk = &chunkenc.OOOXORChunk{XORChunk: chunk.(*chunkenc.XORChunk)} + } chunkRef = hrw.WriteChunk(seriesRef, mint, maxt, chunk, func(cbErr error) { require.NoError(t, err) close(awaitCb) diff --git a/tsdb/compact_test.go b/tsdb/compact_test.go index 9b55131bef..9f24a81428 100644 --- a/tsdb/compact_test.go +++ b/tsdb/compact_test.go @@ -1080,7 +1080,7 @@ func BenchmarkCompactionFromHead(b *testing.B) { opts := DefaultHeadOptions() opts.ChunkRange = 1000 opts.ChunkDirRoot = chunkDir - h, err := 
NewHead(nil, nil, nil, opts, nil) + h, err := NewHead(nil, nil, nil, nil, opts, nil) require.NoError(b, err) for ln := 0; ln < labelNames; ln++ { app := h.Appender(context.Background()) diff --git a/tsdb/db.go b/tsdb/db.go index 00c1bceedf..7cf70bcc2d 100644 --- a/tsdb/db.go +++ b/tsdb/db.go @@ -33,6 +33,7 @@ import ( "github.com/oklog/ulid" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" + "go.uber.org/atomic" "golang.org/x/sync/errgroup" "github.com/prometheus/prometheus/config" @@ -69,18 +70,19 @@ var ErrNotReady = errors.New("TSDB not ready") // millisecond precision timestamps. func DefaultOptions() *Options { return &Options{ - WALSegmentSize: wal.DefaultSegmentSize, - MaxBlockChunkSegmentSize: chunks.DefaultChunkSegmentSize, - RetentionDuration: int64(15 * 24 * time.Hour / time.Millisecond), - MinBlockDuration: DefaultBlockDuration, - MaxBlockDuration: DefaultBlockDuration, - NoLockfile: false, - AllowOverlappingBlocks: false, - WALCompression: false, - StripeSize: DefaultStripeSize, - HeadChunksWriteBufferSize: chunks.DefaultWriteBufferSize, - IsolationDisabled: defaultIsolationDisabled, - HeadChunksWriteQueueSize: chunks.DefaultWriteQueueSize, + WALSegmentSize: wal.DefaultSegmentSize, + MaxBlockChunkSegmentSize: chunks.DefaultChunkSegmentSize, + RetentionDuration: int64(15 * 24 * time.Hour / time.Millisecond), + MinBlockDuration: DefaultBlockDuration, + MaxBlockDuration: DefaultBlockDuration, + NoLockfile: false, + AllowOverlappingCompaction: false, + AllowOverlappingQueries: false, + WALCompression: false, + StripeSize: DefaultStripeSize, + HeadChunksWriteBufferSize: chunks.DefaultWriteBufferSize, + IsolationDisabled: defaultIsolationDisabled, + OutOfOrderCapMax: DefaultOutOfOrderCapMax, } } @@ -112,9 +114,19 @@ type Options struct { // NoLockfile disables creation and consideration of a lock file. NoLockfile bool - // Overlapping blocks are allowed if AllowOverlappingBlocks is true. 
- // This in-turn enables vertical compaction and vertical query merge. - AllowOverlappingBlocks bool + // Querying on overlapping blocks are allowed if AllowOverlappingQueries is true. + // Since querying is a required operation for TSDB, if there are going to be + // overlapping blocks, then this should be set to true. + // NOTE: Do not use this directly in DB. Use it via DB.AllowOverlappingQueries(). + AllowOverlappingQueries bool + + // Compaction of overlapping blocks are allowed if AllowOverlappingCompaction is true. + // This is an optional flag for overlapping blocks. + // The reason why this flag exists is because there are various users of the TSDB + // that do not want vertical compaction happening on ingest time. Instead, + // they'd rather keep overlapping blocks and let another component do the overlapping compaction later. + // For Prometheus, this will always be enabled if overlapping queries is enabled. + AllowOverlappingCompaction bool // WALCompression will turn on Snappy compression for records on the WAL. WALCompression bool @@ -160,6 +172,15 @@ type Options struct { // Disables isolation between reads and in-flight appends. IsolationDisabled bool + + // OutOfOrderTimeWindow specifies how much out of order is allowed, if any. + // This can change during run-time, so this value from here should only be used + // while initialising. + OutOfOrderTimeWindow int64 + + // OutOfOrderCapMax is maximum capacity for OOO chunks (in samples). + // If it is <=0, the default value is assumed. + OutOfOrderCapMax int64 } type BlocksToDeleteFunc func(blocks []*Block) map[ulid.ULID]struct{} @@ -197,6 +218,13 @@ type DB struct { // Cancel a running compaction when a shutdown is initiated. compactCancel context.CancelFunc + + // oooWasEnabled is true if out of order support was enabled at least one time + // during the time TSDB was up. In which case we need to keep supporting + // out-of-order compaction and vertical queries. 
+ oooWasEnabled atomic.Bool + + registerer prometheus.Registerer } type dbMetrics struct { @@ -372,9 +400,17 @@ func (db *DBReadOnly) FlushWAL(dir string) (returnErr error) { if err != nil { return err } + var wbl *wal.WAL + wblDir := filepath.Join(db.dir, wal.WblDirName) + if _, err := os.Stat(wblDir); !os.IsNotExist(err) { + wbl, err = wal.Open(db.logger, wblDir) + if err != nil { + return err + } + } opts := DefaultHeadOptions() opts.ChunkDirRoot = db.dir - head, err := NewHead(nil, db.logger, w, opts, NewHeadStats()) + head, err := NewHead(nil, db.logger, w, wbl, opts, NewHeadStats()) if err != nil { return err } @@ -430,7 +466,7 @@ func (db *DBReadOnly) loadDataAsQueryable(maxt int64) (storage.SampleAndChunkQue opts := DefaultHeadOptions() opts.ChunkDirRoot = db.dir - head, err := NewHead(nil, db.logger, nil, opts, NewHeadStats()) + head, err := NewHead(nil, db.logger, nil, nil, opts, NewHeadStats()) if err != nil { return nil, err } @@ -448,9 +484,17 @@ func (db *DBReadOnly) loadDataAsQueryable(maxt int64) (storage.SampleAndChunkQue if err != nil { return nil, err } + var wbl *wal.WAL + wblDir := filepath.Join(db.dir, wal.WblDirName) + if _, err := os.Stat(wblDir); !os.IsNotExist(err) { + wbl, err = wal.Open(db.logger, wblDir) + if err != nil { + return nil, err + } + } opts := DefaultHeadOptions() opts.ChunkDirRoot = db.dir - head, err = NewHead(nil, db.logger, w, opts, NewHeadStats()) + head, err = NewHead(nil, db.logger, w, wbl, opts, NewHeadStats()) if err != nil { return nil, err } @@ -598,6 +642,15 @@ func validateOpts(opts *Options, rngs []int64) (*Options, []int64) { if opts.MinBlockDuration > opts.MaxBlockDuration { opts.MaxBlockDuration = opts.MinBlockDuration } + if opts.OutOfOrderTimeWindow > 0 { + opts.AllowOverlappingQueries = true + } + if opts.OutOfOrderCapMax <= 0 { + opts.OutOfOrderCapMax = DefaultOutOfOrderCapMax + } + if opts.OutOfOrderTimeWindow < 0 { + opts.OutOfOrderTimeWindow = 0 + } if len(rngs) == 0 { // Start with smallest block 
duration and create exponential buckets until the exceed the @@ -634,6 +687,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs } walDir := filepath.Join(dir, "wal") + wblDir := filepath.Join(dir, wal.WblDirName) // Migrate old WAL if one exists. if err := MigrateWAL(l, walDir); err != nil { @@ -656,6 +710,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs autoCompact: true, chunkPool: chunkenc.NewPool(), blocksToDelete: opts.BlocksToDelete, + registerer: r, } defer func() { // Close files if startup fails somewhere. @@ -694,7 +749,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs } db.compactCancel = cancel - var wlog *wal.WAL + var wlog, wblog *wal.WAL segmentSize := wal.DefaultSegmentSize // Wal is enabled. if opts.WALSegmentSize >= 0 { @@ -706,8 +761,19 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs if err != nil { return nil, err } + // Check if there is a WBL on disk, in which case we should replay that data. 
+ wblSize, err := fileutil.DirSize(wblDir) + if err != nil && !os.IsNotExist(err) { + return nil, err + } + if opts.OutOfOrderTimeWindow > 0 || wblSize > 0 { + wblog, err = wal.NewSize(l, r, wblDir, segmentSize, opts.WALCompression) + if err != nil { + return nil, err + } + } } - + db.oooWasEnabled.Store(opts.OutOfOrderTimeWindow > 0) headOpts := DefaultHeadOptions() headOpts.ChunkRange = rngs[0] headOpts.ChunkDirRoot = dir @@ -719,11 +785,13 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs headOpts.EnableExemplarStorage = opts.EnableExemplarStorage headOpts.MaxExemplars.Store(opts.MaxExemplars) headOpts.EnableMemorySnapshotOnShutdown = opts.EnableMemorySnapshotOnShutdown + headOpts.OutOfOrderTimeWindow.Store(opts.OutOfOrderTimeWindow) + headOpts.OutOfOrderCapMax.Store(opts.OutOfOrderCapMax) if opts.IsolationDisabled { // We only override this flag if isolation is disabled at DB level. We use the default otherwise. headOpts.IsolationDisabled = opts.IsolationDisabled } - db.head, err = NewHead(r, l, wlog, headOpts, stats.Head) + db.head, err = NewHead(r, l, wlog, wblog, headOpts, stats.Head) if err != nil { return nil, err } @@ -741,20 +809,36 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs } // Set the min valid time for the ingested samples // to be no lower than the maxt of the last block. - blocks := db.Blocks() minValidTime := int64(math.MinInt64) - if len(blocks) > 0 { - minValidTime = blocks[len(blocks)-1].Meta().MaxTime + // We do not consider blocks created from out-of-order samples for Head's minValidTime + // since minValidTime is only for the in-order data and we do not want to discard unnecessary + // samples from the Head. 
+ inOrderMaxTime, ok := db.inOrderBlocksMaxTime() + if ok { + minValidTime = inOrderMaxTime } if initErr := db.head.Init(minValidTime); initErr != nil { db.head.metrics.walCorruptionsTotal.Inc() - level.Warn(db.logger).Log("msg", "Encountered WAL read error, attempting repair", "err", initErr) - if err := wlog.Repair(initErr); err != nil { - return nil, errors.Wrap(err, "repair corrupted WAL") + isOOOErr := isErrLoadOOOWal(initErr) + if isOOOErr { + level.Warn(db.logger).Log("msg", "Encountered OOO WAL read error, attempting repair", "err", initErr) + if err := wblog.Repair(initErr); err != nil { + return nil, errors.Wrap(err, "repair corrupted OOO WAL") + } + } else { + level.Warn(db.logger).Log("msg", "Encountered WAL read error, attempting repair", "err", initErr) + if err := wlog.Repair(initErr); err != nil { + return nil, errors.Wrap(err, "repair corrupted WAL") + } } } + if db.head.MinOOOTime() != int64(math.MaxInt64) { + // Some OOO data was replayed from the disk that needs compaction and cleanup. + db.oooWasEnabled.Store(true) + } + go db.run() return db, nil @@ -846,8 +930,58 @@ func (db *DB) Appender(ctx context.Context) storage.Appender { return dbAppender{db: db, Appender: db.head.Appender(ctx)} } +// ApplyConfig applies a new config to the DB. +// Behaviour of 'OutOfOrderTimeWindow' is as follows: +// OOO enabled = oooTimeWindow > 0. OOO disabled = oooTimeWindow is 0. +// 1) Before: OOO disabled, Now: OOO enabled => +// - A new WBL is created for the head block. +// - OOO compaction is enabled. +// - Overlapping queries are enabled. +// +// 2) Before: OOO enabled, Now: OOO enabled => +// - Only the time window is updated. +// +// 3) Before: OOO enabled, Now: OOO disabled => +// - Time Window set to 0. So no new OOO samples will be allowed. +// - OOO WBL will stay and will be eventually cleaned up. +// - OOO Compaction and overlapping queries will remain enabled until a restart or until all OOO samples are compacted. 
+// +// 4) Before: OOO disabled, Now: OOO disabled => no-op. func (db *DB) ApplyConfig(conf *config.Config) error { - return db.head.ApplyConfig(conf) + oooTimeWindow := int64(0) + if conf.StorageConfig.TSDBConfig != nil { + oooTimeWindow = conf.StorageConfig.TSDBConfig.OutOfOrderTimeWindow + } + if oooTimeWindow < 0 { + oooTimeWindow = 0 + } + + // Create WBL if it was not present and if OOO is enabled with WAL enabled. + var wblog *wal.WAL + var err error + if db.head.wbl != nil { + // The existing WBL from the disk might have been replayed while OOO was disabled. + wblog = db.head.wbl + } else if !db.oooWasEnabled.Load() && oooTimeWindow > 0 && db.opts.WALSegmentSize >= 0 { + segmentSize := wal.DefaultSegmentSize + // Wal is set to a custom size. + if db.opts.WALSegmentSize > 0 { + segmentSize = db.opts.WALSegmentSize + } + oooWalDir := filepath.Join(db.dir, wal.WblDirName) + wblog, err = wal.NewSize(db.logger, db.registerer, oooWalDir, segmentSize, db.opts.WALCompression) + if err != nil { + return err + } + } + + db.opts.OutOfOrderTimeWindow = oooTimeWindow + db.head.ApplyConfig(conf, wblog) + + if !db.oooWasEnabled.Load() { + db.oooWasEnabled.Store(oooTimeWindow > 0) + } + return nil } // dbAppender wraps the DB's head appender and triggers compactions on commit @@ -946,6 +1080,14 @@ func (db *DB) Compact() (returnErr error) { "block_range", db.head.chunkRange.Load(), ) } + + if lastBlockMaxt != math.MinInt64 { + // The head was compacted, so we compact OOO head as well. + if err := db.compactOOOHead(); err != nil { + return errors.Wrap(err, "compact ooo head") + } + } + return db.compactBlocks() } @@ -964,6 +1106,102 @@ func (db *DB) CompactHead(head *RangeHead) error { return nil } +// CompactOOOHead compacts the OOO Head. 
+func (db *DB) CompactOOOHead() error { + db.cmtx.Lock() + defer db.cmtx.Unlock() + + return db.compactOOOHead() +} + +func (db *DB) compactOOOHead() error { + if !db.oooWasEnabled.Load() { + return nil + } + oooHead, err := NewOOOCompactionHead(db.head) + if err != nil { + return errors.Wrap(err, "get ooo compaction head") + } + + ulids, err := db.compactOOO(db.dir, oooHead) + if err != nil { + return errors.Wrap(err, "compact ooo head") + } + if err := db.reloadBlocks(); err != nil { + errs := tsdb_errors.NewMulti(err) + for _, uid := range ulids { + if errRemoveAll := os.RemoveAll(filepath.Join(db.dir, uid.String())); errRemoveAll != nil { + errs.Add(errRemoveAll) + } + } + return errors.Wrap(errs.Err(), "reloadBlocks blocks after failed compact ooo head") + } + + lastWBLFile, minOOOMmapRef := oooHead.LastWBLFile(), oooHead.LastMmapRef() + if lastWBLFile != 0 || minOOOMmapRef != 0 { + if err := db.head.truncateOOO(lastWBLFile, minOOOMmapRef); err != nil { + return errors.Wrap(err, "truncate ooo wbl") + } + } + + return nil +} + +// compactOOO creates a new block per possible block range in the compactor's directory from the OOO Head given. +// Each ULID in the result corresponds to a block in a unique time range. +func (db *DB) compactOOO(dest string, oooHead *OOOCompactionHead) (_ []ulid.ULID, err error) { + start := time.Now() + + blockSize := oooHead.ChunkRange() + oooHeadMint, oooHeadMaxt := oooHead.MinTime(), oooHead.MaxTime() + ulids := make([]ulid.ULID, 0) + defer func() { + if err != nil { + // Best effort removal of created block on any error. + for _, uid := range ulids { + _ = os.RemoveAll(filepath.Join(db.dir, uid.String())) + } + } + }() + + for t := blockSize * (oooHeadMint / blockSize); t <= oooHeadMaxt; t = t + blockSize { + mint, maxt := t, t+blockSize + // Block intervals are half-open: [b.MinTime, b.MaxTime). Block intervals are always +1 than the total samples it includes. 
+ uid, err := db.compactor.Write(dest, oooHead.CloneForTimeRange(mint, maxt-1), mint, maxt, nil) + if err != nil { + return nil, err + } + if uid.Compare(ulid.ULID{}) != 0 { + ulids = append(ulids, uid) + blockDir := filepath.Join(dest, uid.String()) + meta, _, err := readMetaFile(blockDir) + if err != nil { + return ulids, errors.Wrap(err, "read meta") + } + meta.Compaction.SetOutOfOrder() + _, err = writeMetaFile(db.logger, blockDir, meta) + if err != nil { + return ulids, errors.Wrap(err, "write meta") + } + } + } + + if len(ulids) == 0 { + level.Info(db.logger).Log( + "msg", "compact ooo head resulted in no blocks", + "duration", time.Since(start), + ) + return nil, nil + } + + level.Info(db.logger).Log( + "msg", "out-of-order compaction completed", + "duration", time.Since(start), + "ulids", fmt.Sprintf("%v", ulids), + ) + return ulids, nil +} + // compactHead compacts the given RangeHead. // The compaction mutex should be held before calling this method. func (db *DB) compactHead(head *RangeHead) error { @@ -1038,10 +1276,11 @@ func (db *DB) reload() error { if err := db.reloadBlocks(); err != nil { return errors.Wrap(err, "reloadBlocks") } - if len(db.blocks) == 0 { + maxt, ok := db.inOrderBlocksMaxTime() + if !ok { return nil } - if err := db.head.Truncate(db.blocks[len(db.blocks)-1].MaxTime()); err != nil { + if err := db.head.Truncate(maxt); err != nil { return errors.Wrap(err, "head truncate") } return nil @@ -1121,7 +1360,7 @@ func (db *DB) reloadBlocks() (err error) { sort.Slice(toLoad, func(i, j int) bool { return toLoad[i].Meta().MinTime < toLoad[j].Meta().MinTime }) - if !db.opts.AllowOverlappingBlocks { + if !db.AllowOverlappingQueries() { if err := validateBlockSequence(toLoad); err != nil { return errors.Wrap(err, "invalid block sequence") } @@ -1151,6 +1390,10 @@ func (db *DB) reloadBlocks() (err error) { return nil } +func (db *DB) AllowOverlappingQueries() bool { + return db.opts.AllowOverlappingQueries || db.oooWasEnabled.Load() +} + func 
openBlocks(l log.Logger, dir string, loaded []*Block, chunkPool chunkenc.Pool) (blocks []*Block, corrupted map[ulid.ULID]error, err error) { bDirs, err := blockDirs(dir) if err != nil { @@ -1428,6 +1671,21 @@ func (db *DB) Blocks() []*Block { return db.blocks } +// inOrderBlocksMaxTime returns the max time among the blocks that were not totally created +// out of out-of-order data. If the returned boolean is true, it means there is at least +// one such block. +func (db *DB) inOrderBlocksMaxTime() (maxt int64, ok bool) { + maxt, ok = int64(math.MinInt64), false + // If blocks are overlapping, last block might not have the max time. So check all blocks. + for _, b := range db.Blocks() { + if !b.meta.Compaction.FromOutOfOrder() && b.meta.MaxTime > maxt { + ok = true + maxt = b.meta.MaxTime + } + } + return maxt, ok +} + // Head returns the databases's head. func (db *DB) Head() *Head { return db.head @@ -1526,13 +1784,13 @@ func (db *DB) Querier(_ context.Context, mint, maxt int64) (storage.Querier, err blocks = append(blocks, b) } } - var headQuerier storage.Querier + var inOrderHeadQuerier storage.Querier if maxt >= db.head.MinTime() { rh := NewRangeHead(db.head, mint, maxt) var err error - headQuerier, err = NewBlockQuerier(rh, mint, maxt) + inOrderHeadQuerier, err = NewBlockQuerier(rh, mint, maxt) if err != nil { - return nil, errors.Wrapf(err, "open querier for head %s", rh) + return nil, errors.Wrapf(err, "open block querier for head %s", rh) } // Getting the querier above registers itself in the queue that the truncation waits on. @@ -1540,20 +1798,30 @@ func (db *DB) Querier(_ context.Context, mint, maxt int64) (storage.Querier, err // won't run into a race later since any truncation that comes after will wait on this querier if it overlaps. 
shouldClose, getNew, newMint := db.head.IsQuerierCollidingWithTruncation(mint, maxt) if shouldClose { - if err := headQuerier.Close(); err != nil { - return nil, errors.Wrapf(err, "closing head querier %s", rh) + if err := inOrderHeadQuerier.Close(); err != nil { + return nil, errors.Wrapf(err, "closing head block querier %s", rh) } - headQuerier = nil + inOrderHeadQuerier = nil } if getNew { rh := NewRangeHead(db.head, newMint, maxt) - headQuerier, err = NewBlockQuerier(rh, newMint, maxt) + inOrderHeadQuerier, err = NewBlockQuerier(rh, newMint, maxt) if err != nil { - return nil, errors.Wrapf(err, "open querier for head while getting new querier %s", rh) + return nil, errors.Wrapf(err, "open block querier for head while getting new querier %s", rh) } } } + var outOfOrderHeadQuerier storage.Querier + if overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime()) { + rh := NewOOORangeHead(db.head, mint, maxt) + var err error + outOfOrderHeadQuerier, err = NewBlockQuerier(rh, mint, maxt) + if err != nil { + return nil, errors.Wrapf(err, "open block querier for ooo head %s", rh) + } + } + blockQueriers := make([]storage.Querier, 0, len(blocks)) for _, b := range blocks { q, err := NewBlockQuerier(b, mint, maxt) @@ -1568,14 +1836,18 @@ func (db *DB) Querier(_ context.Context, mint, maxt int64) (storage.Querier, err } return nil, errors.Wrapf(err, "open querier for block %s", b) } - if headQuerier != nil { - blockQueriers = append(blockQueriers, headQuerier) + if inOrderHeadQuerier != nil { + blockQueriers = append(blockQueriers, inOrderHeadQuerier) + } + if outOfOrderHeadQuerier != nil { + blockQueriers = append(blockQueriers, outOfOrderHeadQuerier) } return storage.NewMergeQuerier(blockQueriers, nil, storage.ChainedSeriesMerge), nil } -// ChunkQuerier returns a new chunk querier over the data partition for the given time range. 
-func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQuerier, error) { +// blockQueriersForRange returns individual block chunk queriers from the persistent blocks, in-order head block, and the +// out-of-order head block, overlapping with the given time range. +func (db *DB) blockChunkQuerierForRange(mint, maxt int64) ([]storage.ChunkQuerier, error) { var blocks []BlockReader db.mtx.RLock() @@ -1586,11 +1858,11 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu blocks = append(blocks, b) } } - var headQuerier storage.ChunkQuerier + var inOrderHeadQuerier storage.ChunkQuerier if maxt >= db.head.MinTime() { rh := NewRangeHead(db.head, mint, maxt) var err error - headQuerier, err = NewBlockChunkQuerier(rh, mint, maxt) + inOrderHeadQuerier, err = NewBlockChunkQuerier(rh, mint, maxt) if err != nil { return nil, errors.Wrapf(err, "open querier for head %s", rh) } @@ -1600,20 +1872,30 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu // won't run into a race later since any truncation that comes after will wait on this querier if it overlaps. 
shouldClose, getNew, newMint := db.head.IsQuerierCollidingWithTruncation(mint, maxt) if shouldClose { - if err := headQuerier.Close(); err != nil { + if err := inOrderHeadQuerier.Close(); err != nil { return nil, errors.Wrapf(err, "closing head querier %s", rh) } - headQuerier = nil + inOrderHeadQuerier = nil } if getNew { rh := NewRangeHead(db.head, newMint, maxt) - headQuerier, err = NewBlockChunkQuerier(rh, newMint, maxt) + inOrderHeadQuerier, err = NewBlockChunkQuerier(rh, newMint, maxt) if err != nil { return nil, errors.Wrapf(err, "open querier for head while getting new querier %s", rh) } } } + var outOfOrderHeadQuerier storage.ChunkQuerier + if overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime()) { + rh := NewOOORangeHead(db.head, mint, maxt) + var err error + outOfOrderHeadQuerier, err = NewBlockChunkQuerier(rh, mint, maxt) + if err != nil { + return nil, errors.Wrapf(err, "open block chunk querier for ooo head %s", rh) + } + } + blockQueriers := make([]storage.ChunkQuerier, 0, len(blocks)) for _, b := range blocks { q, err := NewBlockChunkQuerier(b, mint, maxt) @@ -1628,10 +1910,22 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu } return nil, errors.Wrapf(err, "open querier for block %s", b) } - if headQuerier != nil { - blockQueriers = append(blockQueriers, headQuerier) + if inOrderHeadQuerier != nil { + blockQueriers = append(blockQueriers, inOrderHeadQuerier) + } + if outOfOrderHeadQuerier != nil { + blockQueriers = append(blockQueriers, outOfOrderHeadQuerier) } + return blockQueriers, nil +} + +// ChunkQuerier returns a new chunk querier over the data partition for the given time range. 
+func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQuerier, error) { + blockQueriers, err := db.blockChunkQuerierForRange(mint, maxt) + if err != nil { + return nil, err + } return storage.NewMergeChunkQuerier(blockQueriers, nil, storage.NewCompactingChunkSeriesMerger(storage.ChainedSeriesMerge)), nil } diff --git a/tsdb/db_test.go b/tsdb/db_test.go index 53e6e824af..f996c423f7 100644 --- a/tsdb/db_test.go +++ b/tsdb/db_test.go @@ -40,6 +40,7 @@ import ( "github.com/stretchr/testify/require" "go.uber.org/goleak" + "github.com/prometheus/prometheus/config" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/model/metadata" "github.com/prometheus/prometheus/storage" @@ -1335,6 +1336,7 @@ func intersection(oldBlocks, actualBlocks []string) (intersection []string) { } // mockCompactorFailing creates a new empty block on every write and fails when reached the max allowed total. +// For CompactOOO, it always fails. type mockCompactorFailing struct { t *testing.T blocks []*Block @@ -1373,6 +1375,10 @@ func (*mockCompactorFailing) Compact(string, []string, []*Block) (ulid.ULID, err return ulid.ULID{}, nil } +func (*mockCompactorFailing) CompactOOO(dest string, oooHead *OOOCompactionHead) (result []ulid.ULID, err error) { + return nil, fmt.Errorf("mock compaction failing CompactOOO") +} + func TestTimeRetention(t *testing.T) { db := openTestDB(t, nil, []int64{1000}) defer func() { @@ -1405,7 +1411,9 @@ func TestTimeRetention(t *testing.T) { } func TestSizeRetention(t *testing.T) { - db := openTestDB(t, nil, []int64{100}) + opts := DefaultOptions() + opts.OutOfOrderTimeWindow = 100 + db := openTestDB(t, opts, []int64{100}) defer func() { require.NoError(t, db.Close()) }() @@ -1428,9 +1436,11 @@ func TestSizeRetention(t *testing.T) { // Add some data to the WAL. 
headApp := db.Head().Appender(context.Background()) + var aSeries labels.Labels for _, m := range headBlocks { series := genSeries(100, 10, m.MinTime, m.MaxTime+1) for _, s := range series { + aSeries = s.Labels() it := s.Iterator() for it.Next() { tim, v := it.At() @@ -1488,6 +1498,26 @@ func TestSizeRetention(t *testing.T) { require.NoError(t, err) require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + // Add some out of order samples to check the size of WBL. + headApp = db.Head().Appender(context.Background()) + for ts := int64(750); ts < 800; ts++ { + _, err := headApp.Append(0, aSeries, ts, float64(ts)) + require.NoError(t, err) + } + require.NoError(t, headApp.Commit()) + + walSize, err = db.Head().wal.Size() + require.NoError(t, err) + wblSize, err := db.Head().wbl.Size() + require.NoError(t, err) + require.NotZero(t, wblSize) + cdmSize, err = db.Head().chunkDiskMapper.Size() + require.NoError(t, err) + expSize = blockSize + walSize + wblSize + cdmSize + actSize, err = fileutil.DirSize(db.Dir()) + require.NoError(t, err) + require.Equal(t, expSize, actSize, "registered size doesn't match actual disk size") + // Decrease the max bytes limit so that a delete is triggered. // Check total size, total count and check that the oldest block was deleted. 
firstBlockSize := db.Blocks()[0].Size() @@ -1503,8 +1533,8 @@ func TestSizeRetention(t *testing.T) { cdmSize, err = db.Head().chunkDiskMapper.Size() require.NoError(t, err) require.NotZero(t, cdmSize) - // Expected size should take into account block size + WAL size - expSize = blockSize + walSize + cdmSize + // Expected size should take into account block size + WAL size + WBL size + expSize = blockSize + walSize + wblSize + cdmSize actRetentionCount := int(prom_testutil.ToFloat64(db.metrics.sizeRetentionCount)) actSize, err = fileutil.DirSize(db.Dir()) require.NoError(t, err) @@ -2753,7 +2783,7 @@ func TestChunkWriter_ReadAfterWrite(t *testing.T) { for _, chks := range test.chks { for _, chkExp := range chks { - chkAct, err := r.Chunk(chkExp.Ref) + chkAct, err := r.Chunk(chkExp) require.NoError(t, err) require.Equal(t, chkExp.Chunk.Bytes(), chkAct.Bytes()) } @@ -2813,7 +2843,7 @@ func TestChunkReader_ConcurrentReads(t *testing.T) { go func(chunk chunks.Meta) { defer wg.Done() - chkAct, err := r.Chunk(chunk.Ref) + chkAct, err := r.Chunk(chunk) require.NoError(t, err) require.Equal(t, chunk.Chunk.Bytes(), chkAct.Bytes()) }(chk) @@ -3053,7 +3083,8 @@ func TestOneCheckpointPerCompactCall(t *testing.T) { _, err = app.Append(0, lbls, (blockRange*i)+blockRange/2, rand.Float64()) require.NoError(t, err) // Rotate the WAL file so that there is >3 files for checkpoint to happen. 
- require.NoError(t, db.head.wal.NextSegment()) + _, err = db.head.wal.NextSegment() + require.NoError(t, err) } require.NoError(t, app.Commit()) @@ -3437,6 +3468,196 @@ func newTestDB(t *testing.T) *DB { return db } +func TestOOOWALWrite(t *testing.T) { + dir := t.TempDir() + + opts := DefaultOptions() + opts.OutOfOrderCapMax = 2 + opts.OutOfOrderTimeWindow = 30 * time.Minute.Milliseconds() + + db, err := Open(dir, nil, nil, opts, nil) + require.NoError(t, err) + + t.Cleanup(func() { + require.NoError(t, db.Close()) + }) + + s1, s2 := labels.FromStrings("l", "v1"), labels.FromStrings("l", "v2") + minutes := func(m int64) int64 { return m * time.Minute.Milliseconds() } + + appendSample := func(app storage.Appender, l labels.Labels, mins int64) { + _, err = app.Append(0, l, minutes(mins), float64(mins)) + require.NoError(t, err) + } + + // Ingest sample at 1h. + app := db.Appender(context.Background()) + appendSample(app, s1, 60) + appendSample(app, s2, 60) + require.NoError(t, app.Commit()) + + // OOO for s1. + app = db.Appender(context.Background()) + appendSample(app, s1, 40) + require.NoError(t, app.Commit()) + + // OOO for s2. + app = db.Appender(context.Background()) + appendSample(app, s2, 42) + require.NoError(t, app.Commit()) + + // OOO for both s1 and s2 in the same commit. + app = db.Appender(context.Background()) + appendSample(app, s2, 45) + appendSample(app, s1, 35) + appendSample(app, s1, 36) // m-maps. + appendSample(app, s1, 37) + require.NoError(t, app.Commit()) + + // OOO for s1 but not for s2 in the same commit. + app = db.Appender(context.Background()) + appendSample(app, s1, 50) // m-maps. + appendSample(app, s2, 65) + require.NoError(t, app.Commit()) + + // Single commit has 2 times m-mapping and more samples after m-map. + app = db.Appender(context.Background()) + appendSample(app, s2, 50) // m-maps. + appendSample(app, s2, 51) + appendSample(app, s2, 52) // m-maps. 
+ appendSample(app, s2, 53) + require.NoError(t, app.Commit()) + + // The MmapRef in this are not hand calculated, and instead taken from the test run. + // What is important here is the order of records, and that MmapRef increases for each record. + oooRecords := []interface{}{ + []record.RefMmapMarker{ + {Ref: 1}, + }, + []record.RefSample{ + {Ref: 1, T: minutes(40), V: 40}, + }, + + []record.RefMmapMarker{ + {Ref: 2}, + }, + []record.RefSample{ + {Ref: 2, T: minutes(42), V: 42}, + }, + + []record.RefSample{ + {Ref: 2, T: minutes(45), V: 45}, + {Ref: 1, T: minutes(35), V: 35}, + }, + []record.RefMmapMarker{ // 3rd sample, hence m-mapped. + {Ref: 1, MmapRef: 4294967304}, + }, + []record.RefSample{ + {Ref: 1, T: minutes(36), V: 36}, + {Ref: 1, T: minutes(37), V: 37}, + }, + + []record.RefMmapMarker{ // 3rd sample, hence m-mapped. + {Ref: 1, MmapRef: 4294967354}, + }, + []record.RefSample{ // Does not contain the in-order sample here. + {Ref: 1, T: minutes(50), V: 50}, + }, + + // Single commit but multiple OOO records. + []record.RefMmapMarker{ + {Ref: 2, MmapRef: 4294967403}, + }, + []record.RefSample{ + {Ref: 2, T: minutes(50), V: 50}, + {Ref: 2, T: minutes(51), V: 51}, + }, + []record.RefMmapMarker{ + {Ref: 2, MmapRef: 4294967452}, + }, + []record.RefSample{ + {Ref: 2, T: minutes(52), V: 52}, + {Ref: 2, T: minutes(53), V: 53}, + }, + } + + inOrderRecords := []interface{}{ + []record.RefSeries{ + {Ref: 1, Labels: s1}, + {Ref: 2, Labels: s2}, + }, + []record.RefSample{ + {Ref: 1, T: minutes(60), V: 60}, + {Ref: 2, T: minutes(60), V: 60}, + }, + []record.RefSample{ + {Ref: 1, T: minutes(40), V: 40}, + }, + []record.RefSample{ + {Ref: 2, T: minutes(42), V: 42}, + }, + []record.RefSample{ + {Ref: 2, T: minutes(45), V: 45}, + {Ref: 1, T: minutes(35), V: 35}, + {Ref: 1, T: minutes(36), V: 36}, + {Ref: 1, T: minutes(37), V: 37}, + }, + []record.RefSample{ // Contains both in-order and ooo sample. 
+ {Ref: 1, T: minutes(50), V: 50}, + {Ref: 2, T: minutes(65), V: 65}, + }, + []record.RefSample{ + {Ref: 2, T: minutes(50), V: 50}, + {Ref: 2, T: minutes(51), V: 51}, + {Ref: 2, T: minutes(52), V: 52}, + {Ref: 2, T: minutes(53), V: 53}, + }, + } + + getRecords := func(walDir string) []interface{} { + sr, err := wal.NewSegmentsReader(walDir) + require.NoError(t, err) + r := wal.NewReader(sr) + defer func() { + require.NoError(t, sr.Close()) + }() + + var ( + records []interface{} + dec record.Decoder + ) + for r.Next() { + rec := r.Record() + switch typ := dec.Type(rec); typ { + case record.Series: + series, err := dec.Series(rec, nil) + require.NoError(t, err) + records = append(records, series) + case record.Samples: + samples, err := dec.Samples(rec, nil) + require.NoError(t, err) + records = append(records, samples) + case record.MmapMarkers: + markers, err := dec.MmapMarkers(rec, nil) + require.NoError(t, err) + records = append(records, markers) + default: + t.Fatalf("got a WAL record that is not series or samples: %v", typ) + } + } + + return records + } + + // The normal WAL. + actRecs := getRecords(path.Join(dir, "wal")) + require.Equal(t, inOrderRecords, actRecs) + + // The OOO WAL. + actRecs = getRecords(path.Join(dir, wal.WblDirName)) + require.Equal(t, oooRecords, actRecs) +} + // Tests https://github.com/prometheus/prometheus/issues/10291#issuecomment-1044373110. func TestDBPanicOnMmappingHeadChunk(t *testing.T) { dir := t.TempDir() @@ -3568,7 +3789,7 @@ func TestMetadataCheckpointingOnlyKeepsLatestEntry(t *testing.T) { ctx := context.Background() numSamples := 10000 - hb, w := newTestHead(t, int64(numSamples)*10, false) + hb, w := newTestHead(t, int64(numSamples)*10, false, false) // Add some series so we can append metadata to them. 
app := hb.Appender(ctx) @@ -3745,3 +3966,1788 @@ func TestMetadataAssertInMemoryData(t *testing.T) { require.Equal(t, *reopenDB.head.series.getByHash(s3.Hash(), s3).meta, m3) require.Equal(t, *reopenDB.head.series.getByHash(s4.Hash(), s4).meta, m4) } + +// TODO(codesome): test more samples incoming once compaction has started. To verify new samples after the start +// +// are not included in this compaction. +func TestOOOCompaction(t *testing.T) { + dir := t.TempDir() + + opts := DefaultOptions() + opts.OutOfOrderCapMax = 30 + opts.OutOfOrderTimeWindow = 300 * time.Minute.Milliseconds() + opts.AllowOverlappingQueries = true + opts.AllowOverlappingCompaction = true + + db, err := Open(dir, nil, nil, opts, nil) + require.NoError(t, err) + db.DisableCompactions() // We want to manually call it. + t.Cleanup(func() { + require.NoError(t, db.Close()) + }) + + series1 := labels.FromStrings("foo", "bar1") + series2 := labels.FromStrings("foo", "bar2") + + addSample := func(fromMins, toMins int64) { + app := db.Appender(context.Background()) + for min := fromMins; min <= toMins; min++ { + ts := min * time.Minute.Milliseconds() + _, err := app.Append(0, series1, ts, float64(ts)) + require.NoError(t, err) + _, err = app.Append(0, series2, ts, float64(2*ts)) + require.NoError(t, err) + } + require.NoError(t, app.Commit()) + } + + // Add an in-order samples. + addSample(250, 350) + + // Verify that the in-memory ooo chunk is empty. + checkEmptyOOOChunk := func(lbls labels.Labels) { + ms, created, err := db.head.getOrCreate(lbls.Hash(), lbls) + require.NoError(t, err) + require.False(t, created) + require.Nil(t, ms.oooHeadChunk) + require.Equal(t, 0, len(ms.oooMmappedChunks)) + } + checkEmptyOOOChunk(series1) + checkEmptyOOOChunk(series2) + + // Add ooo samples that creates multiple chunks. + // 90 to 300 spans across 3 block ranges: [0, 120), [120, 240), [240, 360) + addSample(90, 310) + // Adding same samples to create overlapping chunks. 
+ // Since the active chunk won't start at 90 again, all the new + // chunks will have different time ranges than the previous chunks. + addSample(90, 310) + + verifyDBSamples := func() { + var series1Samples, series2Samples []tsdbutil.Sample + for _, r := range [][2]int64{{90, 119}, {120, 239}, {240, 350}} { + fromMins, toMins := r[0], r[1] + for min := fromMins; min <= toMins; min++ { + ts := min * time.Minute.Milliseconds() + series1Samples = append(series1Samples, sample{ts, float64(ts)}) + series2Samples = append(series2Samples, sample{ts, float64(2 * ts)}) + } + } + expRes := map[string][]tsdbutil.Sample{ + series1.String(): series1Samples, + series2.String(): series2Samples, + } + + q, err := db.Querier(context.Background(), math.MinInt64, math.MaxInt64) + require.NoError(t, err) + + actRes := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) + require.Equal(t, expRes, actRes) + } + + verifyDBSamples() // Before any compaction. + + // Verify that the in-memory ooo chunk is not empty. + checkNonEmptyOOOChunk := func(lbls labels.Labels) { + ms, created, err := db.head.getOrCreate(lbls.Hash(), lbls) + require.NoError(t, err) + require.False(t, created) + require.Greater(t, ms.oooHeadChunk.chunk.NumSamples(), 0) + require.Equal(t, 14, len(ms.oooMmappedChunks)) // 7 original, 7 duplicate. + } + checkNonEmptyOOOChunk(series1) + checkNonEmptyOOOChunk(series2) + + // No blocks before compaction. + require.Equal(t, len(db.Blocks()), 0) + + // There is a 0th WBL file. + require.NoError(t, db.head.wbl.Sync()) // syncing to make sure wbl is flushed in windows + files, err := os.ReadDir(db.head.wbl.Dir()) + require.NoError(t, err) + require.Len(t, files, 1) + require.Equal(t, "00000000", files[0].Name()) + f, err := files[0].Info() + require.NoError(t, err) + require.Greater(t, f.Size(), int64(100)) + + // OOO compaction happens here. + require.NoError(t, db.CompactOOOHead()) + + // 3 blocks exist now. 
[0, 120), [120, 240), [240, 360) + require.Equal(t, len(db.Blocks()), 3) + + verifyDBSamples() // Blocks created out of OOO head now. + + // 0th WBL file will be deleted and 1st will be the only present. + files, err = os.ReadDir(db.head.wbl.Dir()) + require.NoError(t, err) + require.Len(t, files, 1) + require.Equal(t, "00000001", files[0].Name()) + f, err = files[0].Info() + require.NoError(t, err) + require.Equal(t, int64(0), f.Size()) + + // OOO stuff should not be present in the Head now. + checkEmptyOOOChunk(series1) + checkEmptyOOOChunk(series2) + + verifySamples := func(block *Block, fromMins, toMins int64) { + series1Samples := make([]tsdbutil.Sample, 0, toMins-fromMins+1) + series2Samples := make([]tsdbutil.Sample, 0, toMins-fromMins+1) + for min := fromMins; min <= toMins; min++ { + ts := min * time.Minute.Milliseconds() + series1Samples = append(series1Samples, sample{ts, float64(ts)}) + series2Samples = append(series2Samples, sample{ts, float64(2 * ts)}) + } + expRes := map[string][]tsdbutil.Sample{ + series1.String(): series1Samples, + series2.String(): series2Samples, + } + + q, err := NewBlockQuerier(block, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + + actRes := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) + require.Equal(t, expRes, actRes) + } + + // Checking for expected data in the blocks. + verifySamples(db.Blocks()[0], 90, 119) + verifySamples(db.Blocks()[1], 120, 239) + verifySamples(db.Blocks()[2], 240, 310) + + // There should be a single m-map file. + mmapDir := mmappedChunksDir(db.head.opts.ChunkDirRoot) + files, err = os.ReadDir(mmapDir) + require.NoError(t, err) + require.Len(t, files, 1) + + // Compact the in-order head and expect another block. + // Since this is a forced compaction, this block is not aligned with 2h. 
+ err = db.CompactHead(NewRangeHead(db.head, 250*time.Minute.Milliseconds(), 350*time.Minute.Milliseconds())) + require.NoError(t, err) + require.Equal(t, len(db.Blocks()), 4) // [0, 120), [120, 240), [240, 360), [250, 351) + verifySamples(db.Blocks()[3], 250, 350) + + verifyDBSamples() // Blocks created out of normal and OOO head now. But not merged. + + // The compaction also clears out the old m-map files. Including + // the file that has ooo chunks. + files, err = os.ReadDir(mmapDir) + require.NoError(t, err) + require.Len(t, files, 1) + require.Equal(t, "000001", files[0].Name()) + + // This will merge overlapping block. + require.NoError(t, db.Compact()) + + require.Equal(t, len(db.Blocks()), 3) // [0, 120), [120, 240), [240, 360) + verifySamples(db.Blocks()[0], 90, 119) + verifySamples(db.Blocks()[1], 120, 239) + verifySamples(db.Blocks()[2], 240, 350) // Merged block. + + verifyDBSamples() // Final state. Blocks from normal and OOO head are merged. +} + +// TestOOOCompactionWithNormalCompaction tests if OOO compaction is performed +// when the normal head's compaction is done. +func TestOOOCompactionWithNormalCompaction(t *testing.T) { + dir := t.TempDir() + + opts := DefaultOptions() + opts.OutOfOrderCapMax = 30 + opts.OutOfOrderTimeWindow = 300 * time.Minute.Milliseconds() + opts.AllowOverlappingQueries = true + opts.AllowOverlappingCompaction = true + + db, err := Open(dir, nil, nil, opts, nil) + require.NoError(t, err) + db.DisableCompactions() // We want to manually call it. 
+ t.Cleanup(func() { + require.NoError(t, db.Close()) + }) + + series1 := labels.FromStrings("foo", "bar1") + series2 := labels.FromStrings("foo", "bar2") + + addSamples := func(fromMins, toMins int64) { + app := db.Appender(context.Background()) + for min := fromMins; min <= toMins; min++ { + ts := min * time.Minute.Milliseconds() + _, err := app.Append(0, series1, ts, float64(ts)) + require.NoError(t, err) + _, err = app.Append(0, series2, ts, float64(2*ts)) + require.NoError(t, err) + } + require.NoError(t, app.Commit()) + } + + // Add an in-order samples. + addSamples(250, 350) + + // Add ooo samples that will result into a single block. + addSamples(90, 110) + + // Checking that ooo chunk is not empty. + for _, lbls := range []labels.Labels{series1, series2} { + ms, created, err := db.head.getOrCreate(lbls.Hash(), lbls) + require.NoError(t, err) + require.False(t, created) + require.Greater(t, ms.oooHeadChunk.chunk.NumSamples(), 0) + } + + // If the normal Head is not compacted, the OOO head compaction does not take place. + require.NoError(t, db.Compact()) + require.Equal(t, len(db.Blocks()), 0) + + // Add more in-order samples in future that would trigger the compaction. + addSamples(400, 450) + + // No blocks before compaction. + require.Equal(t, len(db.Blocks()), 0) + + // Compacts normal and OOO head. + require.NoError(t, db.Compact()) + + // 2 blocks exist now. [0, 120), [250, 360) + require.Equal(t, len(db.Blocks()), 2) + require.Equal(t, int64(0), db.Blocks()[0].MinTime()) + require.Equal(t, 120*time.Minute.Milliseconds(), db.Blocks()[0].MaxTime()) + require.Equal(t, 250*time.Minute.Milliseconds(), db.Blocks()[1].MinTime()) + require.Equal(t, 360*time.Minute.Milliseconds(), db.Blocks()[1].MaxTime()) + + // Checking that ooo chunk is empty. 
+ for _, lbls := range []labels.Labels{series1, series2} { + ms, created, err := db.head.getOrCreate(lbls.Hash(), lbls) + require.NoError(t, err) + require.False(t, created) + require.Nil(t, ms.oooHeadChunk) + require.Equal(t, 0, len(ms.oooMmappedChunks)) + } + + verifySamples := func(block *Block, fromMins, toMins int64) { + series1Samples := make([]tsdbutil.Sample, 0, toMins-fromMins+1) + series2Samples := make([]tsdbutil.Sample, 0, toMins-fromMins+1) + for min := fromMins; min <= toMins; min++ { + ts := min * time.Minute.Milliseconds() + series1Samples = append(series1Samples, sample{ts, float64(ts)}) + series2Samples = append(series2Samples, sample{ts, float64(2 * ts)}) + } + expRes := map[string][]tsdbutil.Sample{ + series1.String(): series1Samples, + series2.String(): series2Samples, + } + + q, err := NewBlockQuerier(block, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + + actRes := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) + require.Equal(t, expRes, actRes) + } + + // Checking for expected data in the blocks. 
+ verifySamples(db.Blocks()[0], 90, 110) + verifySamples(db.Blocks()[1], 250, 350) +} + +func Test_Querier_OOOQuery(t *testing.T) { + opts := DefaultOptions() + opts.OutOfOrderCapMax = 30 + opts.OutOfOrderTimeWindow = 24 * time.Hour.Milliseconds() + opts.AllowOverlappingQueries = true + opts.AllowOverlappingCompaction = false + + series1 := labels.FromStrings("foo", "bar1") + + minutes := func(m int64) int64 { return m * time.Minute.Milliseconds() } + addSample := func(db *DB, fromMins, toMins, queryMinT, queryMaxT int64, expSamples []tsdbutil.Sample) ([]tsdbutil.Sample, int) { + app := db.Appender(context.Background()) + totalAppended := 0 + for min := fromMins; min <= toMins; min += time.Minute.Milliseconds() { + _, err := app.Append(0, series1, min, float64(min)) + if min >= queryMinT && min <= queryMaxT { + expSamples = append(expSamples, sample{t: min, v: float64(min)}) + } + require.NoError(t, err) + totalAppended++ + } + require.NoError(t, app.Commit()) + return expSamples, totalAppended + } + + tests := []struct { + name string + queryMinT int64 + queryMaxT int64 + inOrderMinT int64 + inOrderMaxT int64 + oooMinT int64 + oooMaxT int64 + }{ + { + name: "query interval covering ooomint and inordermaxt returns all ingested samples", + queryMinT: minutes(0), + queryMaxT: minutes(200), + inOrderMinT: minutes(100), + inOrderMaxT: minutes(200), + oooMinT: minutes(0), + oooMaxT: minutes(99), + }, + { + name: "partial query interval returns only samples within interval", + queryMinT: minutes(20), + queryMaxT: minutes(180), + inOrderMinT: minutes(100), + inOrderMaxT: minutes(200), + oooMinT: minutes(0), + oooMaxT: minutes(99), + }, + } + for _, tc := range tests { + t.Run(fmt.Sprintf("name=%s", tc.name), func(t *testing.T) { + db := openTestDB(t, opts, nil) + db.DisableCompactions() + defer func() { + require.NoError(t, db.Close()) + }() + + var expSamples []tsdbutil.Sample + + // Add in-order samples. 
+ expSamples, _ = addSample(db, tc.inOrderMinT, tc.inOrderMaxT, tc.queryMinT, tc.queryMaxT, expSamples) + + // Add out-of-order samples. + expSamples, oooSamples := addSample(db, tc.oooMinT, tc.oooMaxT, tc.queryMinT, tc.queryMaxT, expSamples) + + sort.Slice(expSamples, func(i, j int) bool { + return expSamples[i].T() < expSamples[j].T() + }) + + querier, err := db.Querier(context.TODO(), tc.queryMinT, tc.queryMaxT) + require.NoError(t, err) + defer querier.Close() + + seriesSet := query(t, querier, labels.MustNewMatcher(labels.MatchEqual, "foo", "bar1")) + require.NotNil(t, seriesSet[series1.String()]) + require.Equal(t, 1, len(seriesSet)) + require.Equal(t, expSamples, seriesSet[series1.String()]) + require.GreaterOrEqual(t, float64(oooSamples), prom_testutil.ToFloat64(db.head.metrics.outOfOrderSamplesAppended), "number of ooo appended samples mismatch") + }) + } +} + +func Test_ChunkQuerier_OOOQuery(t *testing.T) { + opts := DefaultOptions() + opts.OutOfOrderCapMax = 30 + opts.OutOfOrderTimeWindow = 24 * time.Hour.Milliseconds() + opts.AllowOverlappingQueries = true + opts.AllowOverlappingCompaction = false + + series1 := labels.FromStrings("foo", "bar1") + + minutes := func(m int64) int64 { return m * time.Minute.Milliseconds() } + addSample := func(db *DB, fromMins, toMins, queryMinT, queryMaxT int64, expSamples []tsdbutil.Sample) ([]tsdbutil.Sample, int) { + app := db.Appender(context.Background()) + totalAppended := 0 + for min := fromMins; min <= toMins; min += time.Minute.Milliseconds() { + _, err := app.Append(0, series1, min, float64(min)) + if min >= queryMinT && min <= queryMaxT { + expSamples = append(expSamples, sample{t: min, v: float64(min)}) + } + require.NoError(t, err) + totalAppended++ + } + require.NoError(t, app.Commit()) + return expSamples, totalAppended + } + + tests := []struct { + name string + queryMinT int64 + queryMaxT int64 + inOrderMinT int64 + inOrderMaxT int64 + oooMinT int64 + oooMaxT int64 + }{ + { + name: "query interval 
covering ooomint and inordermaxt returns all ingested samples", + queryMinT: minutes(0), + queryMaxT: minutes(200), + inOrderMinT: minutes(100), + inOrderMaxT: minutes(200), + oooMinT: minutes(0), + oooMaxT: minutes(99), + }, + { + name: "partial query interval returns only samples within interval", + queryMinT: minutes(20), + queryMaxT: minutes(180), + inOrderMinT: minutes(100), + inOrderMaxT: minutes(200), + oooMinT: minutes(0), + oooMaxT: minutes(99), + }, + } + for _, tc := range tests { + t.Run(fmt.Sprintf("name=%s", tc.name), func(t *testing.T) { + db := openTestDB(t, opts, nil) + db.DisableCompactions() + defer func() { + require.NoError(t, db.Close()) + }() + + var expSamples []tsdbutil.Sample + + // Add in-order samples. + expSamples, _ = addSample(db, tc.inOrderMinT, tc.inOrderMaxT, tc.queryMinT, tc.queryMaxT, expSamples) + + // Add out-of-order samples. + expSamples, oooSamples := addSample(db, tc.oooMinT, tc.oooMaxT, tc.queryMinT, tc.queryMaxT, expSamples) + + sort.Slice(expSamples, func(i, j int) bool { + return expSamples[i].T() < expSamples[j].T() + }) + + querier, err := db.ChunkQuerier(context.TODO(), tc.queryMinT, tc.queryMaxT) + require.NoError(t, err) + defer querier.Close() + + chks := queryChunks(t, querier, labels.MustNewMatcher(labels.MatchEqual, "foo", "bar1")) + require.NotNil(t, chks[series1.String()]) + require.Equal(t, 1, len(chks)) + require.Equal(t, float64(oooSamples), prom_testutil.ToFloat64(db.head.metrics.outOfOrderSamplesAppended), "number of ooo appended samples mismatch") + var gotSamples []tsdbutil.Sample + for _, chunk := range chks[series1.String()] { + it := chunk.Chunk.Iterator(nil) + for it.Next() { + ts, v := it.At() + gotSamples = append(gotSamples, sample{t: ts, v: v}) + } + } + require.Equal(t, expSamples, gotSamples) + }) + } +} + +func TestOOOAppendAndQuery(t *testing.T) { + opts := DefaultOptions() + opts.OutOfOrderCapMax = 30 + opts.OutOfOrderTimeWindow = 4 * time.Hour.Milliseconds() + opts.AllowOverlappingQueries 
= true + + db := openTestDB(t, opts, nil) + db.DisableCompactions() + t.Cleanup(func() { + require.NoError(t, db.Close()) + }) + + s1 := labels.FromStrings("foo", "bar1") + s2 := labels.FromStrings("foo", "bar2") + + minutes := func(m int64) int64 { return m * time.Minute.Milliseconds() } + appendedSamples := make(map[string][]tsdbutil.Sample) + totalSamples := 0 + addSample := func(lbls labels.Labels, fromMins, toMins int64, faceError bool) { + app := db.Appender(context.Background()) + key := lbls.String() + from, to := minutes(fromMins), minutes(toMins) + for min := from; min <= to; min += time.Minute.Milliseconds() { + val := rand.Float64() + _, err := app.Append(0, lbls, min, val) + if faceError { + require.Error(t, err) + } else { + require.NoError(t, err) + appendedSamples[key] = append(appendedSamples[key], sample{t: min, v: val}) + totalSamples++ + } + } + if faceError { + require.NoError(t, app.Rollback()) + } else { + require.NoError(t, app.Commit()) + } + } + + testQuery := func(from, to int64) { + querier, err := db.Querier(context.TODO(), from, to) + require.NoError(t, err) + + seriesSet := query(t, querier, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.")) + + for k, v := range appendedSamples { + sort.Slice(v, func(i, j int) bool { + return v[i].T() < v[j].T() + }) + appendedSamples[k] = v + } + + expSamples := make(map[string][]tsdbutil.Sample) + for k, samples := range appendedSamples { + for _, s := range samples { + if s.T() < from { + continue + } + if s.T() > to { + continue + } + expSamples[k] = append(expSamples[k], s) + } + } + require.Equal(t, expSamples, seriesSet) + require.Equal(t, float64(totalSamples-2), prom_testutil.ToFloat64(db.head.metrics.outOfOrderSamplesAppended), "number of ooo appended samples mismatch") + } + + verifyOOOMinMaxTimes := func(expMin, expMax int64) { + require.Equal(t, minutes(expMin), db.head.MinOOOTime()) + require.Equal(t, minutes(expMax), db.head.MaxOOOTime()) + } + + // In-order samples. 
+ addSample(s1, 300, 300, false)
+ addSample(s2, 290, 290, false)
+ require.Equal(t, float64(2), prom_testutil.ToFloat64(db.head.metrics.chunksCreated))
+ testQuery(math.MinInt64, math.MaxInt64)
+
+ // Some ooo samples.
+ addSample(s1, 250, 260, false)
+ addSample(s2, 255, 265, false)
+ verifyOOOMinMaxTimes(250, 265)
+ testQuery(math.MinInt64, math.MaxInt64)
+ testQuery(minutes(250), minutes(265)) // Test querying ooo data time range
+ testQuery(minutes(290), minutes(300)) // Test querying in-order data time range
+ testQuery(minutes(250), minutes(300)) // Test querying the entire range
+
+ // Out of time window.
+ addSample(s1, 59, 59, true)
+ addSample(s2, 49, 49, true)
+ verifyOOOMinMaxTimes(250, 265)
+ testQuery(math.MinInt64, math.MaxInt64)
+
+ // At the edge of time window, also it would be "out of bound" without the ooo support.
+ addSample(s1, 60, 65, false)
+ verifyOOOMinMaxTimes(60, 265)
+ testQuery(math.MinInt64, math.MaxInt64)
+
+ // This sample is not within the time window w.r.t. the head's maxt, but it is within the window
+ // w.r.t. the series' maxt. But we consider only head's maxt.
+ addSample(s2, 59, 59, true)
+ verifyOOOMinMaxTimes(60, 265)
+ testQuery(math.MinInt64, math.MaxInt64)
+
+ // Now the sample is within time window w.r.t. the head's maxt.
+ addSample(s2, 60, 65, false)
+ verifyOOOMinMaxTimes(60, 265)
+ testQuery(math.MinInt64, math.MaxInt64)
+
+ // Out of time window again.
+ addSample(s1, 59, 59, true)
+ addSample(s2, 49, 49, true)
+ testQuery(math.MinInt64, math.MaxInt64)
+
+ // Generating some m-map chunks. The m-map chunks here are in such a way
+ // that when sorted w.r.t. mint, the last chunk's maxt is not the overall maxt
+ // of the merged chunk. This tests a bug fixed in https://github.com/grafana/mimir-prometheus/pull/238/.
+ require.Equal(t, float64(4), prom_testutil.ToFloat64(db.head.metrics.chunksCreated)) + addSample(s1, 180, 249, false) + require.Equal(t, float64(6), prom_testutil.ToFloat64(db.head.metrics.chunksCreated)) + verifyOOOMinMaxTimes(60, 265) + testQuery(math.MinInt64, math.MaxInt64) +} + +func TestOOODisabled(t *testing.T) { + opts := DefaultOptions() + opts.OutOfOrderTimeWindow = 0 + db := openTestDB(t, opts, nil) + db.DisableCompactions() + t.Cleanup(func() { + require.NoError(t, db.Close()) + }) + + s1 := labels.FromStrings("foo", "bar1") + minutes := func(m int64) int64 { return m * time.Minute.Milliseconds() } + expSamples := make(map[string][]tsdbutil.Sample) + totalSamples := 0 + failedSamples := 0 + addSample := func(lbls labels.Labels, fromMins, toMins int64, faceError bool) { + app := db.Appender(context.Background()) + key := lbls.String() + from, to := minutes(fromMins), minutes(toMins) + for min := from; min <= to; min += time.Minute.Milliseconds() { + val := rand.Float64() + _, err := app.Append(0, lbls, min, val) + if faceError { + require.Error(t, err) + failedSamples++ + } else { + require.NoError(t, err) + expSamples[key] = append(expSamples[key], sample{t: min, v: val}) + totalSamples++ + } + } + if faceError { + require.NoError(t, app.Rollback()) + } else { + require.NoError(t, app.Commit()) + } + } + + addSample(s1, 300, 300, false) // In-order samples. + addSample(s1, 250, 260, true) // Some ooo samples. + addSample(s1, 59, 59, true) // Out of time window. + addSample(s1, 60, 65, true) // At the edge of time window, also it would be "out of bound" without the ooo support. + addSample(s1, 59, 59, true) // Out of time window again. + addSample(s1, 301, 310, false) // More in-order samples. 
+ + querier, err := db.Querier(context.TODO(), math.MinInt64, math.MaxInt64) + require.NoError(t, err) + + seriesSet := query(t, querier, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.")) + require.Equal(t, expSamples, seriesSet) + require.Equal(t, float64(0), prom_testutil.ToFloat64(db.head.metrics.outOfOrderSamplesAppended), "number of ooo appended samples mismatch") + require.Equal(t, float64(failedSamples), + prom_testutil.ToFloat64(db.head.metrics.outOfOrderSamples)+prom_testutil.ToFloat64(db.head.metrics.outOfBoundSamples), + "number of ooo/oob samples mismatch") + + // Verifying that no OOO artifacts were generated. + _, err = os.ReadDir(path.Join(db.Dir(), wal.WblDirName)) + require.True(t, os.IsNotExist(err)) + + ms, created, err := db.head.getOrCreate(s1.Hash(), s1) + require.NoError(t, err) + require.False(t, created) + require.NotNil(t, ms) + require.Nil(t, ms.oooHeadChunk) + require.Len(t, ms.oooMmappedChunks, 0) +} + +func TestWBLAndMmapReplay(t *testing.T) { + opts := DefaultOptions() + opts.OutOfOrderCapMax = 30 + opts.OutOfOrderTimeWindow = 4 * time.Hour.Milliseconds() + opts.AllowOverlappingQueries = true + + db := openTestDB(t, opts, nil) + db.DisableCompactions() + t.Cleanup(func() { + require.NoError(t, db.Close()) + }) + + s1 := labels.FromStrings("foo", "bar1") + + minutes := func(m int64) int64 { return m * time.Minute.Milliseconds() } + expSamples := make(map[string][]tsdbutil.Sample) + totalSamples := 0 + addSample := func(lbls labels.Labels, fromMins, toMins int64) { + app := db.Appender(context.Background()) + key := lbls.String() + from, to := minutes(fromMins), minutes(toMins) + for min := from; min <= to; min += time.Minute.Milliseconds() { + val := rand.Float64() + _, err := app.Append(0, lbls, min, val) + require.NoError(t, err) + expSamples[key] = append(expSamples[key], sample{t: min, v: val}) + totalSamples++ + } + require.NoError(t, app.Commit()) + } + + testQuery := func(exp map[string][]tsdbutil.Sample) { + querier, 
err := db.Querier(context.TODO(), math.MinInt64, math.MaxInt64) + require.NoError(t, err) + + seriesSet := query(t, querier, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.")) + + for k, v := range exp { + sort.Slice(v, func(i, j int) bool { + return v[i].T() < v[j].T() + }) + exp[k] = v + } + require.Equal(t, exp, seriesSet) + } + + // In-order samples. + addSample(s1, 300, 300) + require.Equal(t, float64(1), prom_testutil.ToFloat64(db.head.metrics.chunksCreated)) + + // Some ooo samples. + addSample(s1, 250, 260) + addSample(s1, 195, 249) // This creates some m-map chunks. + require.Equal(t, float64(4), prom_testutil.ToFloat64(db.head.metrics.chunksCreated)) + testQuery(expSamples) + oooMint, oooMaxt := minutes(195), minutes(260) + + // Collect the samples only present in the ooo m-map chunks. + ms, created, err := db.head.getOrCreate(s1.Hash(), s1) + require.False(t, created) + require.NoError(t, err) + var s1MmapSamples []tsdbutil.Sample + for _, mc := range ms.oooMmappedChunks { + chk, err := db.head.chunkDiskMapper.Chunk(mc.ref) + require.NoError(t, err) + it := chk.Iterator(nil) + for it.Next() { + ts, val := it.At() + s1MmapSamples = append(s1MmapSamples, sample{t: ts, v: val}) + } + } + require.Greater(t, len(s1MmapSamples), 0) + + require.NoError(t, db.Close()) + + // Making a copy of original state of WBL and Mmap files to use it later. 
+ mmapDir := mmappedChunksDir(db.head.opts.ChunkDirRoot) + wblDir := db.head.wbl.Dir() + originalWblDir := filepath.Join(t.TempDir(), "original_wbl") + originalMmapDir := filepath.Join(t.TempDir(), "original_mmap") + require.NoError(t, fileutil.CopyDirs(wblDir, originalWblDir)) + require.NoError(t, fileutil.CopyDirs(mmapDir, originalMmapDir)) + resetWBLToOriginal := func() { + require.NoError(t, os.RemoveAll(wblDir)) + require.NoError(t, fileutil.CopyDirs(originalWblDir, wblDir)) + } + resetMmapToOriginal := func() { + require.NoError(t, os.RemoveAll(mmapDir)) + require.NoError(t, fileutil.CopyDirs(originalMmapDir, mmapDir)) + } + + t.Run("Restart DB with both WBL and M-map files for ooo data", func(t *testing.T) { + db, err = Open(db.dir, nil, nil, opts, nil) + require.NoError(t, err) + require.Equal(t, oooMint, db.head.MinOOOTime()) + require.Equal(t, oooMaxt, db.head.MaxOOOTime()) + testQuery(expSamples) + require.NoError(t, db.Close()) + }) + + t.Run("Restart DB with only WBL for ooo data", func(t *testing.T) { + require.NoError(t, os.RemoveAll(mmapDir)) + + db, err = Open(db.dir, nil, nil, opts, nil) + require.NoError(t, err) + require.Equal(t, oooMint, db.head.MinOOOTime()) + require.Equal(t, oooMaxt, db.head.MaxOOOTime()) + testQuery(expSamples) + require.NoError(t, db.Close()) + }) + + t.Run("Restart DB with only M-map files for ooo data", func(t *testing.T) { + require.NoError(t, os.RemoveAll(wblDir)) + resetMmapToOriginal() + + db, err = Open(db.dir, nil, nil, opts, nil) + require.NoError(t, err) + require.Equal(t, oooMint, db.head.MinOOOTime()) + require.Equal(t, oooMaxt, db.head.MaxOOOTime()) + inOrderSample := expSamples[s1.String()][len(expSamples[s1.String()])-1] + testQuery(map[string][]tsdbutil.Sample{ + s1.String(): append(s1MmapSamples, inOrderSample), + }) + require.NoError(t, db.Close()) + }) + + t.Run("Restart DB with WBL+Mmap while increasing the OOOCapMax", func(t *testing.T) { + resetWBLToOriginal() + resetMmapToOriginal() + + 
opts.OutOfOrderCapMax = 60
+ db, err = Open(db.dir, nil, nil, opts, nil)
+ require.NoError(t, err)
+ require.Equal(t, oooMint, db.head.MinOOOTime())
+ require.Equal(t, oooMaxt, db.head.MaxOOOTime())
+ testQuery(expSamples)
+ require.NoError(t, db.Close())
+ })
+
+ t.Run("Restart DB with WBL+Mmap while decreasing the OOOCapMax", func(t *testing.T) {
+ resetMmapToOriginal() // We need to reset because new duplicate chunks can be written above.
+
+ opts.OutOfOrderCapMax = 10
+ db, err = Open(db.dir, nil, nil, opts, nil)
+ require.NoError(t, err)
+ require.Equal(t, oooMint, db.head.MinOOOTime())
+ require.Equal(t, oooMaxt, db.head.MaxOOOTime())
+ testQuery(expSamples)
+ require.NoError(t, db.Close())
+ })
+
+ t.Run("Restart DB with WBL+Mmap while having no m-map markers in WBL", func(t *testing.T) {
+ resetMmapToOriginal() // We need to reset because new duplicate chunks can be written above.
+
+ // Removing m-map markers in WBL by rewriting it.
+ newWbl, err := wal.New(log.NewNopLogger(), nil, filepath.Join(t.TempDir(), "new_wbl"), false)
+ require.NoError(t, err)
+ sr, err := wal.NewSegmentsReader(originalWblDir)
+ require.NoError(t, err)
+ var dec record.Decoder
+ r, markers, addedRecs := wal.NewReader(sr), 0, 0
+ for r.Next() {
+ rec := r.Record()
+ if dec.Type(rec) == record.MmapMarkers {
+ markers++
+ continue
+ }
+ addedRecs++
+ require.NoError(t, newWbl.Log(rec))
+ }
+ require.Greater(t, markers, 0)
+ require.Greater(t, addedRecs, 0)
+ require.NoError(t, newWbl.Close())
+ require.NoError(t, sr.Close())
+ require.NoError(t, os.RemoveAll(wblDir))
+ require.NoError(t, os.Rename(newWbl.Dir(), wblDir))
+
+ opts.OutOfOrderCapMax = 30
+ db, err = Open(db.dir, nil, nil, opts, nil)
+ require.NoError(t, err)
+ require.Equal(t, oooMint, db.head.MinOOOTime())
+ require.Equal(t, oooMaxt, db.head.MaxOOOTime())
+ testQuery(expSamples)
+ })
+}
+
+func TestOOOCompactionFailure(t *testing.T) {
+ dir := t.TempDir()
+
+ opts := DefaultOptions()
+ opts.OutOfOrderCapMax = 30
+ 
opts.OutOfOrderTimeWindow = 300 * time.Minute.Milliseconds() + opts.AllowOverlappingQueries = true + opts.AllowOverlappingCompaction = true + + db, err := Open(dir, nil, nil, opts, nil) + require.NoError(t, err) + db.DisableCompactions() // We want to manually call it. + t.Cleanup(func() { + require.NoError(t, db.Close()) + }) + + series1 := labels.FromStrings("foo", "bar1") + + addSample := func(fromMins, toMins int64) { + app := db.Appender(context.Background()) + for min := fromMins; min <= toMins; min++ { + ts := min * time.Minute.Milliseconds() + _, err := app.Append(0, series1, ts, float64(ts)) + require.NoError(t, err) + } + require.NoError(t, app.Commit()) + } + + // Add an in-order samples. + addSample(250, 350) + + // Add ooo samples that creates multiple chunks. + addSample(90, 310) + + // No blocks before compaction. + require.Equal(t, len(db.Blocks()), 0) + + // There is a 0th WBL file. + verifyFirstWBLFileIs0 := func(count int) { + require.NoError(t, db.head.wbl.Sync()) // syncing to make sure wbl is flushed in windows + files, err := os.ReadDir(db.head.wbl.Dir()) + require.NoError(t, err) + require.Len(t, files, count) + require.Equal(t, "00000000", files[0].Name()) + f, err := files[0].Info() + require.NoError(t, err) + require.Greater(t, f.Size(), int64(100)) + } + verifyFirstWBLFileIs0(1) + + verifyMmapFiles := func(exp ...string) { + mmapDir := mmappedChunksDir(db.head.opts.ChunkDirRoot) + files, err := os.ReadDir(mmapDir) + require.NoError(t, err) + require.Len(t, files, len(exp)) + for i, f := range files { + require.Equal(t, exp[i], f.Name()) + } + } + + verifyMmapFiles("000001") + + // OOO compaction fails 5 times. + originalCompactor := db.compactor + db.compactor = &mockCompactorFailing{t: t} + for i := 0; i < 5; i++ { + require.Error(t, db.CompactOOOHead()) + } + require.Equal(t, len(db.Blocks()), 0) + + // M-map files don't change after failed compaction. 
+ verifyMmapFiles("000001") + + // Because of 5 compaction attempts, there are 6 files now. + verifyFirstWBLFileIs0(6) + + db.compactor = originalCompactor + require.NoError(t, db.CompactOOOHead()) + oldBlocks := db.Blocks() + require.Equal(t, len(db.Blocks()), 3) + + // Check that the ooo chunks were removed. + ms, created, err := db.head.getOrCreate(series1.Hash(), series1) + require.NoError(t, err) + require.False(t, created) + require.Nil(t, ms.oooHeadChunk) + require.Len(t, ms.oooMmappedChunks, 0) + + // The failed compaction should not have left the ooo Head corrupted. + // Hence, expect no new blocks with another OOO compaction call. + require.NoError(t, db.CompactOOOHead()) + require.Equal(t, len(db.Blocks()), 3) + require.Equal(t, oldBlocks, db.Blocks()) + + // There should be a single m-map file + verifyMmapFiles("000001") + + // All but last WBL file will be deleted. + // 8 files in total (starting at 0) because of 7 compaction calls. + files, err := os.ReadDir(db.head.wbl.Dir()) + require.NoError(t, err) + require.Len(t, files, 1) + require.Equal(t, "00000007", files[0].Name()) + f, err := files[0].Info() + require.NoError(t, err) + require.Equal(t, int64(0), f.Size()) + + verifySamples := func(block *Block, fromMins, toMins int64) { + series1Samples := make([]tsdbutil.Sample, 0, toMins-fromMins+1) + for min := fromMins; min <= toMins; min++ { + ts := min * time.Minute.Milliseconds() + series1Samples = append(series1Samples, sample{ts, float64(ts)}) + } + expRes := map[string][]tsdbutil.Sample{ + series1.String(): series1Samples, + } + + q, err := NewBlockQuerier(block, math.MinInt64, math.MaxInt64) + require.NoError(t, err) + + actRes := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) + require.Equal(t, expRes, actRes) + } + + // Checking for expected data in the blocks. 
+ verifySamples(db.Blocks()[0], 90, 119) + verifySamples(db.Blocks()[1], 120, 239) + verifySamples(db.Blocks()[2], 240, 310) + + // Compact the in-order head and expect another block. + // Since this is a forced compaction, this block is not aligned with 2h. + err = db.CompactHead(NewRangeHead(db.head, 250*time.Minute.Milliseconds(), 350*time.Minute.Milliseconds())) + require.NoError(t, err) + require.Equal(t, len(db.Blocks()), 4) // [0, 120), [120, 240), [240, 360), [250, 351) + verifySamples(db.Blocks()[3], 250, 350) + + // The compaction also clears out the old m-map files. Including + // the file that has ooo chunks. + verifyMmapFiles("000001") +} + +func TestWBLCorruption(t *testing.T) { + dir := t.TempDir() + + opts := DefaultOptions() + opts.OutOfOrderCapMax = 30 + opts.OutOfOrderTimeWindow = 300 * time.Minute.Milliseconds() + opts.AllowOverlappingQueries = true + opts.AllowOverlappingCompaction = true + + db, err := Open(dir, nil, nil, opts, nil) + require.NoError(t, err) + db.DisableCompactions() + t.Cleanup(func() { + require.NoError(t, db.Close()) + }) + + series1 := labels.FromStrings("foo", "bar1") + var allSamples, expAfterRestart []tsdbutil.Sample + addSamples := func(fromMins, toMins int64, afterRestart bool) { + app := db.Appender(context.Background()) + for min := fromMins; min <= toMins; min++ { + ts := min * time.Minute.Milliseconds() + _, err := app.Append(0, series1, ts, float64(ts)) + require.NoError(t, err) + allSamples = append(allSamples, sample{t: ts, v: float64(ts)}) + if afterRestart { + expAfterRestart = append(expAfterRestart, sample{t: ts, v: float64(ts)}) + } + } + require.NoError(t, app.Commit()) + } + + // Add an in-order samples. + addSamples(340, 350, true) + + // OOO samples. + addSamples(90, 99, true) + addSamples(100, 119, true) + addSamples(120, 130, true) + + // Moving onto the second file. + _, err = db.head.wbl.NextSegment() + require.NoError(t, err) + + // More OOO samples. 
+ addSamples(200, 230, true) + addSamples(240, 255, true) + + // We corrupt WBL after the sample at 255. So everything added later + // should be deleted after replay. + + // Checking where we corrupt it. + require.NoError(t, db.head.wbl.Sync()) // syncing to make sure wbl is flushed in windows + files, err := os.ReadDir(db.head.wbl.Dir()) + require.NoError(t, err) + require.Len(t, files, 2) + f1, err := files[1].Info() + require.NoError(t, err) + corruptIndex := f1.Size() + corruptFilePath := path.Join(db.head.wbl.Dir(), files[1].Name()) + + // Corrupt the WBL by adding a malformed record. + require.NoError(t, db.head.wbl.Log([]byte{byte(record.Samples), 99, 9, 99, 9, 99, 9, 99})) + + // More samples after the corruption point. + addSamples(260, 280, false) + addSamples(290, 300, false) + + // Another file. + _, err = db.head.wbl.NextSegment() + require.NoError(t, err) + + addSamples(310, 320, false) + + // Verifying that we have data after corruption point. + require.NoError(t, db.head.wbl.Sync()) // syncing to make sure wbl is flushed in windows + files, err = os.ReadDir(db.head.wbl.Dir()) + require.NoError(t, err) + require.Len(t, files, 3) + f1, err = files[1].Info() + require.NoError(t, err) + require.Greater(t, f1.Size(), corruptIndex) + f0, err := files[0].Info() + require.NoError(t, err) + require.Greater(t, f0.Size(), int64(100)) + f2, err := files[2].Info() + require.NoError(t, err) + require.Greater(t, f2.Size(), int64(100)) + + verifySamples := func(expSamples []tsdbutil.Sample) { + sort.Slice(expSamples, func(i, j int) bool { + return expSamples[i].T() < expSamples[j].T() + }) + + expRes := map[string][]tsdbutil.Sample{ + series1.String(): expSamples, + } + + q, err := db.Querier(context.Background(), math.MinInt64, math.MaxInt64) + require.NoError(t, err) + + actRes := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) + require.Equal(t, expRes, actRes) + } + + verifySamples(allSamples) + + require.NoError(t, db.Close()) + + // We 
want everything to be replayed from the WBL. So we delete the m-map files. + require.NoError(t, os.RemoveAll(mmappedChunksDir(db.head.opts.ChunkDirRoot))) + + // Restart does the replay and repair. + db, err = Open(db.dir, nil, nil, opts, nil) + require.NoError(t, err) + require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.walCorruptionsTotal)) + require.Less(t, len(expAfterRestart), len(allSamples)) + verifySamples(expAfterRestart) + + // Verify that it did the repair on disk. + files, err = os.ReadDir(db.head.wbl.Dir()) + require.NoError(t, err) + require.Len(t, files, 3) + f0, err = files[0].Info() + require.NoError(t, err) + require.Greater(t, f0.Size(), int64(100)) + f2, err = files[2].Info() + require.NoError(t, err) + require.Equal(t, int64(0), f2.Size()) + require.Equal(t, corruptFilePath, path.Join(db.head.wbl.Dir(), files[1].Name())) + + // Verifying that everything after the corruption point is set to 0. + b, err := os.ReadFile(corruptFilePath) + require.NoError(t, err) + sum := 0 + for _, val := range b[corruptIndex:] { + sum += int(val) + } + require.Equal(t, 0, sum) + + // Another restart, everything normal with no repair. 
+ require.NoError(t, db.Close()) + db, err = Open(db.dir, nil, nil, opts, nil) + require.NoError(t, err) + require.Equal(t, 0.0, prom_testutil.ToFloat64(db.head.metrics.walCorruptionsTotal)) + verifySamples(expAfterRestart) +} + +func TestOOOMmapCorruption(t *testing.T) { + dir := t.TempDir() + + opts := DefaultOptions() + opts.OutOfOrderCapMax = 10 + opts.OutOfOrderTimeWindow = 300 * time.Minute.Milliseconds() + opts.AllowOverlappingQueries = true + opts.AllowOverlappingCompaction = true + + db, err := Open(dir, nil, nil, opts, nil) + require.NoError(t, err) + db.DisableCompactions() + t.Cleanup(func() { + require.NoError(t, db.Close()) + }) + + series1 := labels.FromStrings("foo", "bar1") + var allSamples, expInMmapChunks []tsdbutil.Sample + addSamples := func(fromMins, toMins int64, inMmapAfterCorruption bool) { + app := db.Appender(context.Background()) + for min := fromMins; min <= toMins; min++ { + ts := min * time.Minute.Milliseconds() + _, err := app.Append(0, series1, ts, float64(ts)) + require.NoError(t, err) + allSamples = append(allSamples, sample{t: ts, v: float64(ts)}) + if inMmapAfterCorruption { + expInMmapChunks = append(expInMmapChunks, sample{t: ts, v: float64(ts)}) + } + } + require.NoError(t, app.Commit()) + } + + // Add an in-order samples. + addSamples(340, 350, true) + + // OOO samples. + addSamples(90, 99, true) + addSamples(100, 109, true) + // This sample m-maps a chunk. But 120 goes into a new chunk. + addSamples(120, 120, false) + + // Second m-map file. We will corrupt this file. Sample 120 goes into this new file. + db.head.chunkDiskMapper.CutNewFile() + + // More OOO samples. 
+ addSamples(200, 230, false) + addSamples(240, 255, false) + + db.head.chunkDiskMapper.CutNewFile() + addSamples(260, 290, false) + + verifySamples := func(expSamples []tsdbutil.Sample) { + sort.Slice(expSamples, func(i, j int) bool { + return expSamples[i].T() < expSamples[j].T() + }) + + expRes := map[string][]tsdbutil.Sample{ + series1.String(): expSamples, + } + + q, err := db.Querier(context.Background(), math.MinInt64, math.MaxInt64) + require.NoError(t, err) + + actRes := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) + require.Equal(t, expRes, actRes) + } + + verifySamples(allSamples) + + // Verifying existing files. + mmapDir := mmappedChunksDir(db.head.opts.ChunkDirRoot) + files, err := os.ReadDir(mmapDir) + require.NoError(t, err) + require.Len(t, files, 3) + + // Corrupting the 2nd file. + f, err := os.OpenFile(path.Join(mmapDir, files[1].Name()), os.O_RDWR, 0o666) + require.NoError(t, err) + _, err = f.WriteAt([]byte{99, 9, 99, 9, 99}, 20) + require.NoError(t, err) + require.NoError(t, f.Close()) + firstFileName := files[0].Name() + + require.NoError(t, db.Close()) + + // Moving OOO WBL to use it later. + wblDir := db.head.wbl.Dir() + wblDirTmp := path.Join(t.TempDir(), "wbl_tmp") + require.NoError(t, os.Rename(wblDir, wblDirTmp)) + + // Restart does the replay and repair of m-map files. + db, err = Open(db.dir, nil, nil, opts, nil) + require.NoError(t, err) + require.Equal(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.mmapChunkCorruptionTotal)) + require.Less(t, len(expInMmapChunks), len(allSamples)) + + // Since there is no WBL, only samples from m-map chunks comes in the query. + verifySamples(expInMmapChunks) + + // Verify that it did the repair on disk. All files from the point of corruption + // should be deleted. 
+ files, err = os.ReadDir(mmapDir) + require.NoError(t, err) + require.Len(t, files, 1) + f0, err := files[0].Info() + require.NoError(t, err) + require.Greater(t, f0.Size(), int64(100)) + require.Equal(t, firstFileName, files[0].Name()) + + // Another restart, everything normal with no repair. + require.NoError(t, db.Close()) + db, err = Open(db.dir, nil, nil, opts, nil) + require.NoError(t, err) + require.Equal(t, 0.0, prom_testutil.ToFloat64(db.head.metrics.mmapChunkCorruptionTotal)) + verifySamples(expInMmapChunks) + + // Restart again with the WBL, all samples should be present now. + require.NoError(t, db.Close()) + require.NoError(t, os.RemoveAll(wblDir)) + require.NoError(t, os.Rename(wblDirTmp, wblDir)) + db, err = Open(db.dir, nil, nil, opts, nil) + require.NoError(t, err) + verifySamples(allSamples) +} + +func TestOutOfOrderRuntimeConfig(t *testing.T) { + getDB := func(oooTimeWindow int64) *DB { + dir := t.TempDir() + + opts := DefaultOptions() + opts.OutOfOrderTimeWindow = oooTimeWindow + + db, err := Open(dir, nil, nil, opts, nil) + require.NoError(t, err) + db.DisableCompactions() + t.Cleanup(func() { + require.NoError(t, db.Close()) + }) + + return db + } + + makeConfig := func(oooTimeWindow int) *config.Config { + return &config.Config{ + StorageConfig: config.StorageConfig{ + TSDBConfig: &config.TSDBConfig{ + OutOfOrderTimeWindow: int64(oooTimeWindow) * time.Minute.Milliseconds(), + }, + }, + } + } + + series1 := labels.FromStrings("foo", "bar1") + addSamples := func(t *testing.T, db *DB, fromMins, toMins int64, success bool, allSamples []tsdbutil.Sample) []tsdbutil.Sample { + app := db.Appender(context.Background()) + for min := fromMins; min <= toMins; min++ { + ts := min * time.Minute.Milliseconds() + _, err := app.Append(0, series1, ts, float64(ts)) + if success { + require.NoError(t, err) + allSamples = append(allSamples, sample{t: ts, v: float64(ts)}) + } else { + require.Error(t, err) + } + } + require.NoError(t, app.Commit()) + return 
allSamples + } + + verifySamples := func(t *testing.T, db *DB, expSamples []tsdbutil.Sample) { + sort.Slice(expSamples, func(i, j int) bool { + return expSamples[i].T() < expSamples[j].T() + }) + + expRes := map[string][]tsdbutil.Sample{ + series1.String(): expSamples, + } + + q, err := db.Querier(context.Background(), math.MinInt64, math.MaxInt64) + require.NoError(t, err) + + actRes := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) + require.Equal(t, expRes, actRes) + } + + doOOOCompaction := func(t *testing.T, db *DB) { + // WBL is not empty. + size, err := db.head.wbl.Size() + require.NoError(t, err) + require.Greater(t, size, int64(0)) + + require.Len(t, db.Blocks(), 0) + require.NoError(t, db.compactOOOHead()) + require.Greater(t, len(db.Blocks()), 0) + + // WBL is empty. + size, err = db.head.wbl.Size() + require.NoError(t, err) + require.Equal(t, int64(0), size) + } + + t.Run("increase time window", func(t *testing.T) { + var allSamples []tsdbutil.Sample + db := getDB(30 * time.Minute.Milliseconds()) + + // In-order. + allSamples = addSamples(t, db, 300, 310, true, allSamples) + + // OOO upto 30m old is success. + allSamples = addSamples(t, db, 281, 290, true, allSamples) + + // OOO of 59m old fails. + s := addSamples(t, db, 251, 260, false, nil) + require.Len(t, s, 0) + verifySamples(t, db, allSamples) + + oldWblPtr := fmt.Sprintf("%p", db.head.wbl) + + // Increase time window and try adding again. + err := db.ApplyConfig(makeConfig(60)) + require.NoError(t, err) + allSamples = addSamples(t, db, 251, 260, true, allSamples) + + // WBL does not change. + newWblPtr := fmt.Sprintf("%p", db.head.wbl) + require.Equal(t, oldWblPtr, newWblPtr) + + doOOOCompaction(t, db) + verifySamples(t, db, allSamples) + }) + + t.Run("decrease time window and increase again", func(t *testing.T) { + var allSamples []tsdbutil.Sample + db := getDB(60 * time.Minute.Milliseconds()) + + // In-order. 
+ allSamples = addSamples(t, db, 300, 310, true, allSamples) + + // OOO upto 59m old is success. + allSamples = addSamples(t, db, 251, 260, true, allSamples) + + oldWblPtr := fmt.Sprintf("%p", db.head.wbl) + // Decrease time window. + err := db.ApplyConfig(makeConfig(30)) + require.NoError(t, err) + + // OOO of 49m old fails. + s := addSamples(t, db, 261, 270, false, nil) + require.Len(t, s, 0) + + // WBL does not change. + newWblPtr := fmt.Sprintf("%p", db.head.wbl) + require.Equal(t, oldWblPtr, newWblPtr) + + verifySamples(t, db, allSamples) + + // Increase time window again and check + err = db.ApplyConfig(makeConfig(60)) + require.NoError(t, err) + allSamples = addSamples(t, db, 261, 270, true, allSamples) + verifySamples(t, db, allSamples) + + // WBL does not change. + newWblPtr = fmt.Sprintf("%p", db.head.wbl) + require.Equal(t, oldWblPtr, newWblPtr) + + doOOOCompaction(t, db) + verifySamples(t, db, allSamples) + }) + + t.Run("disabled to enabled", func(t *testing.T) { + var allSamples []tsdbutil.Sample + db := getDB(0) + + // In-order. + allSamples = addSamples(t, db, 300, 310, true, allSamples) + + // OOO fails. + s := addSamples(t, db, 251, 260, false, nil) + require.Len(t, s, 0) + verifySamples(t, db, allSamples) + + require.Nil(t, db.head.wbl) + + // Increase time window and try adding again. + err := db.ApplyConfig(makeConfig(60)) + require.NoError(t, err) + allSamples = addSamples(t, db, 251, 260, true, allSamples) + + // WBL gets created. + require.NotNil(t, db.head.wbl) + + verifySamples(t, db, allSamples) + + // OOO compaction works now. + doOOOCompaction(t, db) + verifySamples(t, db, allSamples) + }) + + t.Run("enabled to disabled", func(t *testing.T) { + var allSamples []tsdbutil.Sample + db := getDB(60 * time.Minute.Milliseconds()) + + // In-order. + allSamples = addSamples(t, db, 300, 310, true, allSamples) + + // OOO upto 59m old is success. 
+ allSamples = addSamples(t, db, 251, 260, true, allSamples) + + oldWblPtr := fmt.Sprintf("%p", db.head.wbl) + // Time Window to 0, hence disabled. + err := db.ApplyConfig(makeConfig(0)) + require.NoError(t, err) + + // OOO within old time window fails. + s := addSamples(t, db, 290, 309, false, nil) + require.Len(t, s, 0) + + // WBL does not change and is not removed. + newWblPtr := fmt.Sprintf("%p", db.head.wbl) + require.Equal(t, oldWblPtr, newWblPtr) + + verifySamples(t, db, allSamples) + + // Compaction still works after disabling with WBL cleanup. + doOOOCompaction(t, db) + verifySamples(t, db, allSamples) + }) + + t.Run("disabled to disabled", func(t *testing.T) { + var allSamples []tsdbutil.Sample + db := getDB(0) + + // In-order. + allSamples = addSamples(t, db, 300, 310, true, allSamples) + + // OOO fails. + s := addSamples(t, db, 290, 309, false, nil) + require.Len(t, s, 0) + verifySamples(t, db, allSamples) + require.Nil(t, db.head.wbl) + + // Time window to 0. + err := db.ApplyConfig(makeConfig(0)) + require.NoError(t, err) + + // OOO still fails. 
+ s = addSamples(t, db, 290, 309, false, nil) + require.Len(t, s, 0) + verifySamples(t, db, allSamples) + require.Nil(t, db.head.wbl) + }) +} + +func TestNoGapAfterRestartWithOOO(t *testing.T) { + series1 := labels.FromStrings("foo", "bar1") + addSamples := func(t *testing.T, db *DB, fromMins, toMins int64, success bool) { + app := db.Appender(context.Background()) + for min := fromMins; min <= toMins; min++ { + ts := min * time.Minute.Milliseconds() + _, err := app.Append(0, series1, ts, float64(ts)) + if success { + require.NoError(t, err) + } else { + require.Error(t, err) + } + } + require.NoError(t, app.Commit()) + } + + verifySamples := func(t *testing.T, db *DB, fromMins, toMins int64) { + var expSamples []tsdbutil.Sample + for min := fromMins; min <= toMins; min++ { + ts := min * time.Minute.Milliseconds() + expSamples = append(expSamples, sample{t: ts, v: float64(ts)}) + } + + expRes := map[string][]tsdbutil.Sample{ + series1.String(): expSamples, + } + + q, err := db.Querier(context.Background(), math.MinInt64, math.MaxInt64) + require.NoError(t, err) + + actRes := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) + require.Equal(t, expRes, actRes) + } + + cases := []struct { + inOrderMint, inOrderMaxt int64 + oooMint, oooMaxt int64 + // After compaction. + blockRanges [][2]int64 + headMint, headMaxt int64 + }{ + { + 300, 490, + 489, 489, + [][2]int64{{300, 360}, {480, 600}}, + 360, 490, + }, + { + 300, 490, + 479, 479, + [][2]int64{{300, 360}, {360, 480}}, + 360, 490, + }, + } + + for i, c := range cases { + t.Run(fmt.Sprintf("case=%d", i), func(t *testing.T) { + dir := t.TempDir() + + opts := DefaultOptions() + opts.OutOfOrderTimeWindow = 30 * time.Minute.Milliseconds() + + db, err := Open(dir, nil, nil, opts, nil) + require.NoError(t, err) + db.DisableCompactions() + t.Cleanup(func() { + require.NoError(t, db.Close()) + }) + + // 3h10m=190m worth in-order data. 
+ addSamples(t, db, c.inOrderMint, c.inOrderMaxt, true) + verifySamples(t, db, c.inOrderMint, c.inOrderMaxt) + + // One ooo samples. + addSamples(t, db, c.oooMint, c.oooMaxt, true) + verifySamples(t, db, c.inOrderMint, c.inOrderMaxt) + + // We get 2 blocks. 1 from OOO, 1 from in-order. + require.NoError(t, db.Compact()) + verifyBlockRanges := func() { + blocks := db.Blocks() + require.Equal(t, len(c.blockRanges), len(blocks)) + for j, br := range c.blockRanges { + require.Equal(t, br[0]*time.Minute.Milliseconds(), blocks[j].MinTime()) + require.Equal(t, br[1]*time.Minute.Milliseconds(), blocks[j].MaxTime()) + } + } + verifyBlockRanges() + require.Equal(t, c.headMint*time.Minute.Milliseconds(), db.head.MinTime()) + require.Equal(t, c.headMaxt*time.Minute.Milliseconds(), db.head.MaxTime()) + + // Restart and expect all samples to be present. + require.NoError(t, db.Close()) + + db, err = Open(dir, nil, nil, opts, nil) + require.NoError(t, err) + db.DisableCompactions() + + verifyBlockRanges() + require.Equal(t, c.headMint*time.Minute.Milliseconds(), db.head.MinTime()) + require.Equal(t, c.headMaxt*time.Minute.Milliseconds(), db.head.MaxTime()) + verifySamples(t, db, c.inOrderMint, c.inOrderMaxt) + }) + } +} + +func TestWblReplayAfterOOODisableAndRestart(t *testing.T) { + dir := t.TempDir() + + opts := DefaultOptions() + opts.OutOfOrderTimeWindow = 60 * time.Minute.Milliseconds() + opts.AllowOverlappingQueries = true + opts.AllowOverlappingCompaction = true + + db, err := Open(dir, nil, nil, opts, nil) + require.NoError(t, err) + db.DisableCompactions() + t.Cleanup(func() { + require.NoError(t, db.Close()) + }) + + series1 := labels.FromStrings("foo", "bar1") + var allSamples []tsdbutil.Sample + addSamples := func(fromMins, toMins int64) { + app := db.Appender(context.Background()) + for min := fromMins; min <= toMins; min++ { + ts := min * time.Minute.Milliseconds() + _, err := app.Append(0, series1, ts, float64(ts)) + require.NoError(t, err) + allSamples = 
append(allSamples, sample{t: ts, v: float64(ts)}) + } + require.NoError(t, app.Commit()) + } + + // In-order samples. + addSamples(290, 300) + // OOO samples. + addSamples(250, 260) + + verifySamples := func(expSamples []tsdbutil.Sample) { + sort.Slice(expSamples, func(i, j int) bool { + return expSamples[i].T() < expSamples[j].T() + }) + + expRes := map[string][]tsdbutil.Sample{ + series1.String(): expSamples, + } + + q, err := db.Querier(context.Background(), math.MinInt64, math.MaxInt64) + require.NoError(t, err) + + actRes := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.*")) + require.Equal(t, expRes, actRes) + } + + verifySamples(allSamples) + + // Restart DB with OOO disabled. + require.NoError(t, db.Close()) + opts.OutOfOrderTimeWindow = 0 + db, err = Open(db.dir, nil, nil, opts, nil) + require.NoError(t, err) + + // We can still query OOO samples when OOO is disabled. + verifySamples(allSamples) +} + +func TestPanicOnApplyConfig(t *testing.T) { + dir := t.TempDir() + + opts := DefaultOptions() + opts.OutOfOrderTimeWindow = 60 * time.Minute.Milliseconds() + opts.AllowOverlappingQueries = true + + db, err := Open(dir, nil, nil, opts, nil) + require.NoError(t, err) + db.DisableCompactions() + t.Cleanup(func() { + require.NoError(t, db.Close()) + }) + + series1 := labels.FromStrings("foo", "bar1") + var allSamples []tsdbutil.Sample + addSamples := func(fromMins, toMins int64) { + app := db.Appender(context.Background()) + for min := fromMins; min <= toMins; min++ { + ts := min * time.Minute.Milliseconds() + _, err := app.Append(0, series1, ts, float64(ts)) + require.NoError(t, err) + allSamples = append(allSamples, sample{t: ts, v: float64(ts)}) + } + require.NoError(t, app.Commit()) + } + + // In-order samples. + addSamples(290, 300) + // OOO samples. + addSamples(250, 260) + + // Restart DB with OOO disabled. 
+ require.NoError(t, db.Close()) + opts.OutOfOrderTimeWindow = 0 + db, err = Open(db.dir, nil, prometheus.NewRegistry(), opts, nil) + require.NoError(t, err) + + // ApplyConfig with OOO enabled and expect no panic. + err = db.ApplyConfig(&config.Config{ + StorageConfig: config.StorageConfig{ + TSDBConfig: &config.TSDBConfig{ + OutOfOrderTimeWindow: 60 * time.Minute.Milliseconds(), + }, + }, + }) + require.NoError(t, err) +} + +func TestDiskFillingUpAfterDisablingOOO(t *testing.T) { + dir := t.TempDir() + + opts := DefaultOptions() + opts.OutOfOrderTimeWindow = 60 * time.Minute.Milliseconds() + opts.AllowOverlappingQueries = true + + db, err := Open(dir, nil, nil, opts, nil) + require.NoError(t, err) + db.DisableCompactions() + t.Cleanup(func() { + require.NoError(t, db.Close()) + }) + + series1 := labels.FromStrings("foo", "bar1") + var allSamples []tsdbutil.Sample + addSamples := func(fromMins, toMins int64) { + app := db.Appender(context.Background()) + for min := fromMins; min <= toMins; min++ { + ts := min * time.Minute.Milliseconds() + _, err := app.Append(0, series1, ts, float64(ts)) + require.NoError(t, err) + allSamples = append(allSamples, sample{t: ts, v: float64(ts)}) + } + require.NoError(t, app.Commit()) + } + + // In-order samples. + addSamples(290, 300) + // OOO samples. + addSamples(250, 299) + + // Restart DB with OOO disabled. 
+ require.NoError(t, db.Close()) + opts.OutOfOrderTimeWindow = 0 + db, err = Open(db.dir, nil, prometheus.NewRegistry(), opts, nil) + require.NoError(t, err) + db.DisableCompactions() + + ms := db.head.series.getByHash(series1.Hash(), series1) + require.Greater(t, len(ms.oooMmappedChunks), 0, "OOO mmap chunk was not replayed") + + checkMmapFileContents := func(contains, notContains []string) { + mmapDir := mmappedChunksDir(db.head.opts.ChunkDirRoot) + files, err := os.ReadDir(mmapDir) + require.NoError(t, err) + + fnames := make([]string, 0, len(files)) + for _, f := range files { + fnames = append(fnames, f.Name()) + } + + for _, f := range contains { + require.Contains(t, fnames, f) + } + for _, f := range notContains { + require.NotContains(t, fnames, f) + } + } + + // Add in-order samples until ready for compaction.. + addSamples(301, 500) + + // Check that m-map files gets deleted properly after compactions. + + checkMmapFileContents([]string{"000001", "000002"}, nil) + require.NoError(t, db.Compact()) + checkMmapFileContents([]string{"000002"}, []string{"000001"}) + require.Equal(t, 0, len(ms.oooMmappedChunks), "OOO mmap chunk was not compacted") + + addSamples(501, 650) + checkMmapFileContents([]string{"000002", "000003"}, []string{"000001"}) + require.NoError(t, db.Compact()) + checkMmapFileContents(nil, []string{"000001", "000002", "000003"}) + + // Verify that WBL is empty. + files, err := os.ReadDir(db.head.wbl.Dir()) + require.NoError(t, err) + require.Len(t, files, 1) // Last empty file after compaction. 
+ finfo, err := files[0].Info() + require.NoError(t, err) + require.Equal(t, int64(0), finfo.Size()) +} diff --git a/tsdb/head.go b/tsdb/head.go index 32e85c5993..8aa5aa2c8b 100644 --- a/tsdb/head.go +++ b/tsdb/head.go @@ -25,9 +25,10 @@ import ( "github.com/go-kit/log/level" "github.com/oklog/ulid" "github.com/pkg/errors" - "github.com/prometheus/client_golang/prometheus" "go.uber.org/atomic" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/prometheus/config" "github.com/prometheus/prometheus/model/exemplar" "github.com/prometheus/prometheus/model/labels" @@ -62,15 +63,19 @@ var ( type Head struct { chunkRange atomic.Int64 numSeries atomic.Uint64 - minTime, maxTime atomic.Int64 // Current min and max of the samples included in the head. + minOOOTime, maxOOOTime atomic.Int64 // TODO(jesusvazquez) These should be updated after garbage collection. + minTime, maxTime atomic.Int64 // Current min and max of the samples included in the head. TODO(jesusvazquez) Ensure these are properly tracked. minValidTime atomic.Int64 // Mint allowed to be added to the head. It shouldn't be lower than the maxt of the last persisted block. lastWALTruncationTime atomic.Int64 lastMemoryTruncationTime atomic.Int64 lastSeriesID atomic.Uint64 + // All the ooo m-map chunks should be after this. This is used to truncate old ooo m-map chunks. + // This should be typecasted to chunks.ChunkDiskMapperRef after loading. + minOOOMmapRef atomic.Uint64 metrics *headMetrics opts *HeadOptions - wal *wal.WAL + wal, wbl *wal.WAL exemplarMetrics *ExemplarMetrics exemplars ExemplarStorage logger log.Logger @@ -87,6 +92,7 @@ type Head struct { deletedMtx sync.Mutex deleted map[chunks.HeadSeriesRef]int // Deleted series, and what WAL segment they must be kept until. + // TODO(codesome): Extend MemPostings to return only OOOPostings, Set OOOStatus, ... Like an additional map of ooo postings. postings *index.MemPostings // Postings lists for terms. 
tombstones *tombstones.MemTombstones @@ -130,6 +136,8 @@ type HeadOptions struct { ChunkPool chunkenc.Pool ChunkWriteBufferSize int ChunkWriteQueueSize int + OutOfOrderTimeWindow atomic.Int64 + OutOfOrderCapMax atomic.Int64 // StripeSize sets the number of entries in the hash map, it must be a power of 2. // A larger StripeSize will allocate more memory up-front, but will increase performance when handling a large number of series. @@ -142,8 +150,13 @@ type HeadOptions struct { IsolationDisabled bool } +const ( + // DefaultOutOfOrderCapMax is the default maximum size of an in-memory out-of-order chunk. + DefaultOutOfOrderCapMax int64 = 32 +) + func DefaultHeadOptions() *HeadOptions { - return &HeadOptions{ + ho := &HeadOptions{ ChunkRange: DefaultBlockDuration, ChunkDirRoot: "", ChunkPool: chunkenc.NewPool(), @@ -153,6 +166,8 @@ func DefaultHeadOptions() *HeadOptions { SeriesCallback: &noopSeriesLifecycleCallback{}, IsolationDisabled: defaultIsolationDisabled, } + ho.OutOfOrderCapMax.Store(DefaultOutOfOrderCapMax) + return ho } // SeriesLifecycleCallback specifies a list of callbacks that will be called during a lifecycle of a series. @@ -171,11 +186,23 @@ type SeriesLifecycleCallback interface { } // NewHead opens the head block in dir. -func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, opts *HeadOptions, stats *HeadStats) (*Head, error) { +func NewHead(r prometheus.Registerer, l log.Logger, wal, wbl *wal.WAL, opts *HeadOptions, stats *HeadStats) (*Head, error) { var err error if l == nil { l = log.NewNopLogger() } + + if opts.OutOfOrderTimeWindow.Load() < 0 { + opts.OutOfOrderTimeWindow.Store(0) + } + + // Time window can be set on runtime. So the capMin and capMax should be valid + // even if ooo is not enabled yet. + capMax := opts.OutOfOrderCapMax.Load() + if capMax <= 0 || capMax > 255 { + return nil, errors.Errorf("OOOCapMax of %d is invalid. 
must be > 0 and <= 255", capMax) + } + if opts.ChunkRange < 1 { return nil, errors.Errorf("invalid chunk range %d", opts.ChunkRange) } @@ -193,6 +220,7 @@ func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, opts *HeadOpti h := &Head{ wal: wal, + wbl: wbl, logger: l, opts: opts, memChunkPool: sync.Pool{ @@ -254,35 +282,40 @@ func (h *Head) resetInMemoryState() error { h.chunkRange.Store(h.opts.ChunkRange) h.minTime.Store(math.MaxInt64) h.maxTime.Store(math.MinInt64) + h.minOOOTime.Store(math.MaxInt64) + h.maxOOOTime.Store(math.MinInt64) h.lastWALTruncationTime.Store(math.MinInt64) h.lastMemoryTruncationTime.Store(math.MinInt64) return nil } type headMetrics struct { - activeAppenders prometheus.Gauge - series prometheus.GaugeFunc - seriesCreated prometheus.Counter - seriesRemoved prometheus.Counter - seriesNotFound prometheus.Counter - chunks prometheus.Gauge - chunksCreated prometheus.Counter - chunksRemoved prometheus.Counter - gcDuration prometheus.Summary - samplesAppended prometheus.Counter - outOfBoundSamples prometheus.Counter - outOfOrderSamples prometheus.Counter - walTruncateDuration prometheus.Summary - walCorruptionsTotal prometheus.Counter - walTotalReplayDuration prometheus.Gauge - headTruncateFail prometheus.Counter - headTruncateTotal prometheus.Counter - checkpointDeleteFail prometheus.Counter - checkpointDeleteTotal prometheus.Counter - checkpointCreationFail prometheus.Counter - checkpointCreationTotal prometheus.Counter - mmapChunkCorruptionTotal prometheus.Counter - snapshotReplayErrorTotal prometheus.Counter // Will be either 0 or 1. 
+ activeAppenders prometheus.Gauge + series prometheus.GaugeFunc + seriesCreated prometheus.Counter + seriesRemoved prometheus.Counter + seriesNotFound prometheus.Counter + chunks prometheus.Gauge + chunksCreated prometheus.Counter + chunksRemoved prometheus.Counter + gcDuration prometheus.Summary + samplesAppended prometheus.Counter + outOfOrderSamplesAppended prometheus.Counter + outOfBoundSamples prometheus.Counter + outOfOrderSamples prometheus.Counter + tooOldSamples prometheus.Counter + walTruncateDuration prometheus.Summary + walCorruptionsTotal prometheus.Counter + dataTotalReplayDuration prometheus.Gauge + headTruncateFail prometheus.Counter + headTruncateTotal prometheus.Counter + checkpointDeleteFail prometheus.Counter + checkpointDeleteTotal prometheus.Counter + checkpointCreationFail prometheus.Counter + checkpointCreationTotal prometheus.Counter + mmapChunkCorruptionTotal prometheus.Counter + snapshotReplayErrorTotal prometheus.Counter // Will be either 0 or 1. + oooHistogram prometheus.Histogram } func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { @@ -333,7 +366,7 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { Name: "prometheus_tsdb_wal_corruptions_total", Help: "Total number of WAL corruptions.", }), - walTotalReplayDuration: prometheus.NewGauge(prometheus.GaugeOpts{ + dataTotalReplayDuration: prometheus.NewGauge(prometheus.GaugeOpts{ Name: "prometheus_tsdb_data_replay_duration_seconds", Help: "Time taken to replay the data on disk.", }), @@ -341,13 +374,21 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { Name: "prometheus_tsdb_head_samples_appended_total", Help: "Total number of appended samples.", }), + outOfOrderSamplesAppended: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_head_out_of_order_samples_appended_total", + Help: "Total number of appended out of order samples.", + }), outOfBoundSamples: prometheus.NewCounter(prometheus.CounterOpts{ Name: 
"prometheus_tsdb_out_of_bound_samples_total", - Help: "Total number of out of bound samples ingestion failed attempts.", + Help: "Total number of out of bound samples ingestion failed attempts with out of order support disabled.", }), outOfOrderSamples: prometheus.NewCounter(prometheus.CounterOpts{ Name: "prometheus_tsdb_out_of_order_samples_total", - Help: "Total number of out of order samples ingestion failed attempts.", + Help: "Total number of out of order samples ingestion failed attempts due to out of order being disabled.", + }), + tooOldSamples: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_too_old_samples_total", + Help: "Total number of out of order samples ingestion failed attempts with out of order support enabled, but sample outside of time window.", }), headTruncateFail: prometheus.NewCounter(prometheus.CounterOpts{ Name: "prometheus_tsdb_head_truncations_failed_total", @@ -381,6 +422,19 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { Name: "prometheus_tsdb_snapshot_replay_error_total", Help: "Total number snapshot replays that failed.", }), + oooHistogram: prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "prometheus_tsdb_sample_ooo_delta", + Help: "Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).", + Buckets: []float64{ + 60 * 10, // 10 min + 60 * 30, // 30 min + 60 * 60, // 60 min + 60 * 60 * 2, // 2h + 60 * 60 * 3, // 3h + 60 * 60 * 6, // 6h + 60 * 60 * 12, // 12h + }, + }), } if r != nil { @@ -396,10 +450,12 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { m.gcDuration, m.walTruncateDuration, m.walCorruptionsTotal, - m.walTotalReplayDuration, + m.dataTotalReplayDuration, m.samplesAppended, + m.outOfOrderSamplesAppended, m.outOfBoundSamples, m.outOfOrderSamples, + m.tooOldSamples, m.headTruncateFail, m.headTruncateTotal, m.checkpointDeleteFail, @@ -517,8 +573,9 @@ func (h *Head) 
Init(minValidTime int64) error { } mmapChunkReplayStart := time.Now() - mmappedChunks, err := h.loadMmappedChunks(refSeries) + mmappedChunks, oooMmappedChunks, lastMmapRef, err := h.loadMmappedChunks(refSeries) if err != nil { + // TODO(codesome): clear out all m-map chunks here for refSeries. level.Error(h.logger).Log("msg", "Loading on-disk chunks failed", "err", err) if _, ok := errors.Cause(err).(*chunks.CorruptionErr); ok { h.metrics.mmapChunkCorruptionTotal.Inc() @@ -529,7 +586,7 @@ func (h *Head) Init(minValidTime int64) error { // If this fails, data will be recovered from WAL. // Hence we wont lose any data (given WAL is not corrupt). - mmappedChunks, err = h.removeCorruptedMmappedChunks(err) + mmappedChunks, oooMmappedChunks, lastMmapRef, err = h.removeCorruptedMmappedChunks(err) if err != nil { return err } @@ -572,7 +629,7 @@ func (h *Head) Init(minValidTime int64) error { // A corrupted checkpoint is a hard error for now and requires user // intervention. There's likely little data that can be recovered anyway. 
- if err := h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks); err != nil { + if err := h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks, oooMmappedChunks); err != nil { return errors.Wrap(err, "backfill checkpoint") } h.updateWALReplayStatusRead(startFrom) @@ -605,7 +662,7 @@ func (h *Head) Init(minValidTime int64) error { if err != nil { return errors.Wrapf(err, "segment reader (offset=%d)", offset) } - err = h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks) + err = h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks, oooMmappedChunks) if err := sr.Close(); err != nil { level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err) } @@ -615,26 +672,94 @@ func (h *Head) Init(minValidTime int64) error { level.Info(h.logger).Log("msg", "WAL segment loaded", "segment", i, "maxSegment", endAt) h.updateWALReplayStatusRead(i) } + walReplayDuration := time.Since(walReplayStart) - walReplayDuration := time.Since(start) - h.metrics.walTotalReplayDuration.Set(walReplayDuration.Seconds()) + wblReplayStart := time.Now() + if h.wbl != nil { + // Replay OOO WAL. 
+ startFrom, endAt, e = wal.Segments(h.wbl.Dir()) + if e != nil { + return errors.Wrap(e, "finding OOO WAL segments") + } + h.startWALReplayStatus(startFrom, endAt) + + for i := startFrom; i <= endAt; i++ { + s, err := wal.OpenReadSegment(wal.SegmentName(h.wbl.Dir(), i)) + if err != nil { + return errors.Wrap(err, fmt.Sprintf("open WBL segment: %d", i)) + } + + sr := wal.NewSegmentBufReader(s) + err = h.loadWBL(wal.NewReader(sr), multiRef, lastMmapRef) + if err := sr.Close(); err != nil { + level.Warn(h.logger).Log("msg", "Error while closing the wbl segments reader", "err", err) + } + if err != nil { + return err + } + level.Info(h.logger).Log("msg", "WBL segment loaded", "segment", i, "maxSegment", endAt) + h.updateWALReplayStatusRead(i) + } + } + + wblReplayDuration := time.Since(wblReplayStart) + + totalReplayDuration := time.Since(start) + h.metrics.dataTotalReplayDuration.Set(totalReplayDuration.Seconds()) level.Info(h.logger).Log( "msg", "WAL replay completed", "checkpoint_replay_duration", checkpointReplayDuration.String(), - "wal_replay_duration", time.Since(walReplayStart).String(), - "total_replay_duration", walReplayDuration.String(), + "wal_replay_duration", walReplayDuration.String(), + "wbl_replay_duration", wblReplayDuration.String(), + "total_replay_duration", totalReplayDuration.String(), ) return nil } -func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries) (map[chunks.HeadSeriesRef][]*mmappedChunk, error) { +func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries) (map[chunks.HeadSeriesRef][]*mmappedChunk, map[chunks.HeadSeriesRef][]*mmappedChunk, chunks.ChunkDiskMapperRef, error) { mmappedChunks := map[chunks.HeadSeriesRef][]*mmappedChunk{} - if err := h.chunkDiskMapper.IterateAllChunks(func(seriesRef chunks.HeadSeriesRef, chunkRef chunks.ChunkDiskMapperRef, mint, maxt int64, numSamples uint16) error { - if maxt < h.minValidTime.Load() { + oooMmappedChunks := 
map[chunks.HeadSeriesRef][]*mmappedChunk{} + var lastRef, secondLastRef chunks.ChunkDiskMapperRef + if err := h.chunkDiskMapper.IterateAllChunks(func(seriesRef chunks.HeadSeriesRef, chunkRef chunks.ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding) error { + secondLastRef = lastRef + lastRef = chunkRef + isOOO := chunkenc.IsOutOfOrderChunk(encoding) + if !isOOO && maxt < h.minValidTime.Load() { return nil } + + // We ignore any chunk that doesn't have a valid encoding + if !chunkenc.IsValidEncoding(encoding) { + return nil + } + ms, ok := refSeries[seriesRef] + + if isOOO { + if !ok { + oooMmappedChunks[seriesRef] = append(oooMmappedChunks[seriesRef], &mmappedChunk{ + ref: chunkRef, + minTime: mint, + maxTime: maxt, + numSamples: numSamples, + }) + return nil + } + + h.metrics.chunks.Inc() + h.metrics.chunksCreated.Inc() + + ms.oooMmappedChunks = append(ms.oooMmappedChunks, &mmappedChunk{ + ref: chunkRef, + minTime: mint, + maxTime: maxt, + numSamples: numSamples, + }) + + return nil + } + if !ok { slice := mmappedChunks[seriesRef] if len(slice) > 0 && slice[len(slice)-1].maxTime >= mint { @@ -677,45 +802,57 @@ func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries) } return nil }); err != nil { - return nil, errors.Wrap(err, "iterate on on-disk chunks") + // secondLastRef because the lastRef caused an error. + return nil, nil, secondLastRef, errors.Wrap(err, "iterate on on-disk chunks") } - return mmappedChunks, nil + return mmappedChunks, oooMmappedChunks, lastRef, nil } // removeCorruptedMmappedChunks attempts to delete the corrupted mmapped chunks and if it fails, it clears all the previously // loaded mmapped chunks. 
-func (h *Head) removeCorruptedMmappedChunks(err error) (map[chunks.HeadSeriesRef][]*mmappedChunk, error) { +func (h *Head) removeCorruptedMmappedChunks(err error) (map[chunks.HeadSeriesRef][]*mmappedChunk, map[chunks.HeadSeriesRef][]*mmappedChunk, chunks.ChunkDiskMapperRef, error) { + level.Info(h.logger).Log("msg", "Deleting mmapped chunk files") // We never want to preserve the in-memory series from snapshots if we are repairing m-map chunks. if err := h.resetInMemoryState(); err != nil { - return nil, err + return map[chunks.HeadSeriesRef][]*mmappedChunk{}, map[chunks.HeadSeriesRef][]*mmappedChunk{}, 0, err } level.Info(h.logger).Log("msg", "Deleting mmapped chunk files") if err := h.chunkDiskMapper.DeleteCorrupted(err); err != nil { level.Info(h.logger).Log("msg", "Deletion of corrupted mmap chunk files failed, discarding chunk files completely", "err", err) - if err := h.chunkDiskMapper.Truncate(math.MaxInt64); err != nil { + if err := h.chunkDiskMapper.Truncate(math.MaxUint32); err != nil { level.Error(h.logger).Log("msg", "Deletion of all mmap chunk files failed", "err", err) } - return map[chunks.HeadSeriesRef][]*mmappedChunk{}, nil + return map[chunks.HeadSeriesRef][]*mmappedChunk{}, map[chunks.HeadSeriesRef][]*mmappedChunk{}, 0, nil } level.Info(h.logger).Log("msg", "Deletion of mmap chunk files successful, reattempting m-mapping the on-disk chunks") - mmappedChunks, err := h.loadMmappedChunks(make(map[chunks.HeadSeriesRef]*memSeries)) + mmappedChunks, oooMmappedChunks, lastRef, err := h.loadMmappedChunks(make(map[chunks.HeadSeriesRef]*memSeries)) if err != nil { level.Error(h.logger).Log("msg", "Loading on-disk chunks failed, discarding chunk files completely", "err", err) - if err := h.chunkDiskMapper.Truncate(math.MaxInt64); err != nil { + if err := h.chunkDiskMapper.Truncate(math.MaxUint32); err != nil { level.Error(h.logger).Log("msg", "Deletion of all mmap chunk files failed after failed loading", "err", err) } mmappedChunks = 
map[chunks.HeadSeriesRef][]*mmappedChunk{} } - return mmappedChunks, nil + return mmappedChunks, oooMmappedChunks, lastRef, nil } -func (h *Head) ApplyConfig(cfg *config.Config) error { +func (h *Head) ApplyConfig(cfg *config.Config, wbl *wal.WAL) { + oooTimeWindow := int64(0) + if cfg.StorageConfig.TSDBConfig != nil { + oooTimeWindow = cfg.StorageConfig.TSDBConfig.OutOfOrderTimeWindow + } + if oooTimeWindow < 0 { + oooTimeWindow = 0 + } + + h.SetOutOfOrderTimeWindow(oooTimeWindow, wbl) + if !h.opts.EnableExemplarStorage { - return nil + return } // Head uses opts.MaxExemplars in combination with opts.EnableExemplarStorage @@ -726,12 +863,21 @@ func (h *Head) ApplyConfig(cfg *config.Config) error { newSize := h.opts.MaxExemplars.Load() if prevSize == newSize { - return nil + return } migrated := h.exemplars.(*CircularExemplarStorage).Resize(newSize) level.Info(h.logger).Log("msg", "Exemplar storage resized", "from", prevSize, "to", newSize, "migrated", migrated) - return nil +} + +// SetOutOfOrderTimeWindow updates the out of order related parameters. +// If the Head already has a WBL set, then the wbl will be ignored. +func (h *Head) SetOutOfOrderTimeWindow(oooTimeWindow int64, wbl *wal.WAL) { + if oooTimeWindow > 0 && h.wbl == nil { + h.wbl = wbl + } + + h.opts.OutOfOrderTimeWindow.Store(oooTimeWindow) } // PostingsCardinalityStats returns top 10 highest cardinality stats By label and value names. @@ -773,6 +919,27 @@ func (h *Head) updateMinMaxTime(mint, maxt int64) { } } +func (h *Head) updateMinOOOMaxOOOTime(mint, maxt int64) { + for { + lt := h.MinOOOTime() + if mint >= lt { + break + } + if h.minOOOTime.CompareAndSwap(lt, mint) { + break + } + } + for { + ht := h.MaxOOOTime() + if maxt <= ht { + break + } + if h.maxOOOTime.CompareAndSwap(ht, maxt) { + break + } + } +} + // SetMinValidTime sets the minimum timestamp the head can ingest. 
func (h *Head) SetMinValidTime(minValidTime int64) { h.minValidTime.Store(minValidTime) @@ -838,30 +1005,7 @@ func (h *Head) truncateMemory(mint int64) (err error) { } h.metrics.headTruncateTotal.Inc() - start := time.Now() - - actualMint := h.gc() - level.Info(h.logger).Log("msg", "Head GC completed", "duration", time.Since(start)) - h.metrics.gcDuration.Observe(time.Since(start).Seconds()) - if actualMint > h.minTime.Load() { - // The actual mint of the Head is higher than the one asked to truncate. - appendableMinValidTime := h.appendableMinValidTime() - if actualMint < appendableMinValidTime { - h.minTime.Store(actualMint) - h.minValidTime.Store(actualMint) - } else { - // The actual min time is in the appendable window. - // So we set the mint to the appendableMinValidTime. - h.minTime.Store(appendableMinValidTime) - h.minValidTime.Store(appendableMinValidTime) - } - } - - // Truncate the chunk m-mapper. - if err := h.chunkDiskMapper.Truncate(mint); err != nil { - return errors.Wrap(err, "truncate chunks.HeadReadWriter") - } - return nil + return h.truncateSeriesAndChunkDiskMapper("truncateMemory") } // WaitForPendingReadersInTimeRange waits for queries overlapping with given range to finish querying. @@ -950,7 +1094,7 @@ func (h *Head) truncateWAL(mint int64) error { } // Start a new segment, so low ingestion volume TSDB don't have more WAL than // needed. - if err := h.wal.NextSegment(); err != nil { + if _, err := h.wal.NextSegment(); err != nil { return errors.Wrap(err, "next segment") } last-- // Never consider last segment for checkpoint. @@ -1016,6 +1160,59 @@ func (h *Head) truncateWAL(mint int64) error { return nil } +// truncateOOO +// - truncates the OOO WBL files whose index is strictly less than lastWBLFile. +// - garbage collects all the m-map chunks from the memory that are less than or equal to minOOOMmapRef +// and then deletes the series that do not have any data anymore. 
+func (h *Head) truncateOOO(lastWBLFile int, minOOOMmapRef chunks.ChunkDiskMapperRef) error { + curMinOOOMmapRef := chunks.ChunkDiskMapperRef(h.minOOOMmapRef.Load()) + if minOOOMmapRef.GreaterThan(curMinOOOMmapRef) { + h.minOOOMmapRef.Store(uint64(minOOOMmapRef)) + if err := h.truncateSeriesAndChunkDiskMapper("truncateOOO"); err != nil { + return err + } + } + + return h.wbl.Truncate(lastWBLFile) +} + +// truncateSeriesAndChunkDiskMapper is a helper function for truncateMemory and truncateOOO. +// It runs GC on the Head and truncates the ChunkDiskMapper accordingly. +func (h *Head) truncateSeriesAndChunkDiskMapper(caller string) error { + start := time.Now() + headMaxt := h.MaxTime() + actualMint, minOOOTime, minMmapFile := h.gc() + level.Info(h.logger).Log("msg", "Head GC completed", "caller", caller, "duration", time.Since(start)) + h.metrics.gcDuration.Observe(time.Since(start).Seconds()) + + if actualMint > h.minTime.Load() { + // The actual mint of the head is higher than the one asked to truncate. + appendableMinValidTime := h.appendableMinValidTime() + if actualMint < appendableMinValidTime { + h.minTime.Store(actualMint) + h.minValidTime.Store(actualMint) + } else { + // The actual min time is in the appendable window. + // So we set the mint to the appendableMinValidTime. + h.minTime.Store(appendableMinValidTime) + h.minValidTime.Store(appendableMinValidTime) + } + } + if headMaxt-h.opts.OutOfOrderTimeWindow.Load() < minOOOTime { + // The allowed OOO window is lower than the min OOO time seen during GC. + // So it is possible that some OOO sample was inserted that was less that minOOOTime. + // So we play safe and set it to the min that was possible. + minOOOTime = headMaxt - h.opts.OutOfOrderTimeWindow.Load() + } + h.minOOOTime.Store(minOOOTime) + + // Truncate the chunk m-mapper. 
+ if err := h.chunkDiskMapper.Truncate(uint32(minMmapFile)); err != nil { + return errors.Wrap(err, "truncate chunks.HeadReadWriter by file number") + } + return nil +} + type Stats struct { NumSeries uint64 MinTime, MaxTime int64 @@ -1149,14 +1346,20 @@ func (h *Head) Delete(mint, maxt int64, ms ...*labels.Matcher) error { } // gc removes data before the minimum timestamp from the head. -// It returns the actual min times of the chunks present in the Head. -func (h *Head) gc() int64 { +// It returns +// * The actual min times of the chunks present in the Head. +// * The min OOO time seen during the GC. +// * Min mmap file number seen in the series (in-order and out-of-order) after gc'ing the series. +func (h *Head) gc() (actualInOrderMint, minOOOTime int64, minMmapFile int) { // Only data strictly lower than this timestamp must be deleted. mint := h.MinTime() + // Only ooo m-map chunks strictly lower than or equal to this ref + // must be deleted. + minOOOMmapRef := chunks.ChunkDiskMapperRef(h.minOOOMmapRef.Load()) // Drop old chunks and remember series IDs and hashes if they can be // deleted entirely. - deleted, chunksRemoved, actualMint := h.series.gc(mint) + deleted, chunksRemoved, actualInOrderMint, minOOOTime, minMmapFile := h.series.gc(mint, minOOOMmapRef) seriesRemoved := len(deleted) h.metrics.seriesRemoved.Add(float64(seriesRemoved)) @@ -1186,7 +1389,7 @@ func (h *Head) gc() int64 { h.deletedMtx.Unlock() } - return actualMint + return actualInOrderMint, minOOOTime, minMmapFile } // Tombstones returns a new reader over the head's tombstones @@ -1224,6 +1427,18 @@ func (h *Head) MaxTime() int64 { return h.maxTime.Load() } +// MinOOOTime returns the lowest time bound on visible data in the out of order +// head. +func (h *Head) MinOOOTime() int64 { + return h.minOOOTime.Load() +} + +// MaxOOOTime returns the highest timestamp on visible data in the out of order +// head. 
+func (h *Head) MaxOOOTime() int64 { + return h.maxOOOTime.Load() +} + // compactable returns whether the head has a compactable range. // The head has a compactable range when the head time range is 1.5 times the chunk range. // The 0.5 acts as a buffer of the appendable window. @@ -1241,6 +1456,9 @@ func (h *Head) Close() error { if h.wal != nil { errs.Add(h.wal.Close()) } + if h.wbl != nil { + errs.Add(h.wbl.Close()) + } if errs.Err() == nil && h.opts.EnableMemorySnapshotOnShutdown { errs.Add(h.performChunkSnapshot()) } @@ -1271,7 +1489,7 @@ func (h *Head) getOrCreate(hash uint64, lset labels.Labels) (*memSeries, bool, e func (h *Head) getOrCreateWithID(id chunks.HeadSeriesRef, hash uint64, lset labels.Labels) (*memSeries, bool, error) { s, created, err := h.series.getOrSet(hash, lset, func() *memSeries { - return newMemSeries(lset, id, h.chunkRange.Load(), h.opts.IsolationDisabled) + return newMemSeries(lset, id, h.chunkRange.Load(), h.opts.OutOfOrderCapMax.Load(), h.opts.IsolationDisabled) }) if err != nil { return nil, false, err @@ -1333,7 +1551,7 @@ const ( ) // stripeSeries holds series by HeadSeriesRef ("ID") and also by hash of their labels. -// ID-based lookups via (getByID()) are preferred over getByHash() for performance reasons. +// ID-based lookups via getByID() are preferred over getByHash() for performance reasons. // It locks modulo ranges of IDs and hashes to reduce lock contention. // The locks are padded to not be on the same cache line. Filling the padded space // with the maps was profiled to be slower – likely due to the additional pointer @@ -1375,13 +1593,16 @@ func newStripeSeries(stripeSize int, seriesCallback SeriesLifecycleCallback) *st // note: returning map[chunks.HeadSeriesRef]struct{} would be more accurate, // but the returned map goes into postings.Delete() which expects a map[storage.SeriesRef]struct // and there's no easy way to cast maps. 
-func (s *stripeSeries) gc(mint int64) (map[storage.SeriesRef]struct{}, int, int64) { +// minMmapFile is the min mmap file number seen in the series (in-order and out-of-order) after gc'ing the series. +func (s *stripeSeries) gc(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) (_ map[storage.SeriesRef]struct{}, _ int, _, _ int64, minMmapFile int) { var ( deleted = map[storage.SeriesRef]struct{}{} deletedForCallback = []labels.Labels{} rmChunks = 0 actualMint int64 = math.MaxInt64 + minOOOTime int64 = math.MaxInt64 ) + minMmapFile = math.MaxInt32 // Run through all series and truncate old chunks. Mark those with no // chunks left as deleted and store their ID. for i := 0; i < s.size; i++ { @@ -1390,9 +1611,32 @@ func (s *stripeSeries) gc(mint int64) (map[storage.SeriesRef]struct{}, int, int6 for hash, all := range s.hashes[i] { for _, series := range all { series.Lock() - rmChunks += series.truncateChunksBefore(mint) + rmChunks += series.truncateChunksBefore(mint, minOOOMmapRef) - if len(series.mmappedChunks) > 0 || series.headChunk != nil || series.pendingCommit { + if len(series.mmappedChunks) > 0 { + seq, _ := series.mmappedChunks[0].ref.Unpack() + if seq < minMmapFile { + minMmapFile = seq + } + } + if len(series.oooMmappedChunks) > 0 { + seq, _ := series.oooMmappedChunks[0].ref.Unpack() + if seq < minMmapFile { + minMmapFile = seq + } + for _, ch := range series.oooMmappedChunks { + if ch.minTime < minOOOTime { + minOOOTime = ch.minTime + } + } + } + if series.oooHeadChunk != nil { + if series.oooHeadChunk.minTime < minOOOTime { + minOOOTime = series.oooHeadChunk.minTime + } + } + if len(series.mmappedChunks) > 0 || len(series.oooMmappedChunks) > 0 || + series.headChunk != nil || series.oooHeadChunk != nil || series.pendingCommit { seriesMint := series.minTime() if seriesMint < actualMint { actualMint = seriesMint @@ -1435,7 +1679,7 @@ func (s *stripeSeries) gc(mint int64) (map[storage.SeriesRef]struct{}, int, int6 actualMint = mint } - return deleted, 
rmChunks, actualMint + return deleted, rmChunks, actualMint, minOOOTime, minMmapFile } func (s *stripeSeries) getByID(id chunks.HeadSeriesRef) *memSeries { @@ -1528,11 +1772,16 @@ type memSeries struct { // // pN is the pointer to the mmappedChunk referered to by HeadChunkID=N mmappedChunks []*mmappedChunk + headChunk *memChunk // Most recent chunk in memory that's still being built. + firstChunkID chunks.HeadChunkID // HeadChunkID for mmappedChunks[0] - mmMaxTime int64 // Max time of any mmapped chunk, only used during WAL replay. - headChunk *memChunk // Most recent chunk in memory that's still being built. - chunkRange int64 - firstChunkID chunks.HeadChunkID // HeadChunkID for mmappedChunks[0] + oooMmappedChunks []*mmappedChunk // Immutable chunks on disk containing OOO samples. + oooHeadChunk *oooHeadChunk // Most recent chunk for ooo samples in memory that's still being built. + firstOOOChunkID chunks.HeadChunkID // HeadOOOChunkID for oooMmappedChunks[0] + + mmMaxTime int64 // Max time of any mmapped chunk, only used during WAL replay. + chunkRange int64 + oooCapMax uint8 nextAt int64 // Timestamp at which to cut the next chunk. @@ -1551,12 +1800,13 @@ type memSeries struct { pendingCommit bool // Whether there are samples waiting to be committed to this series. } -func newMemSeries(lset labels.Labels, id chunks.HeadSeriesRef, chunkRange int64, isolationDisabled bool) *memSeries { +func newMemSeries(lset labels.Labels, id chunks.HeadSeriesRef, chunkRange, oooCapMax int64, isolationDisabled bool) *memSeries { s := &memSeries{ lset: lset, ref: id, chunkRange: chunkRange, nextAt: math.MinInt64, + oooCapMax: uint8(oooCapMax), } if !isolationDisabled { s.txs = newTxRing(4) @@ -1575,6 +1825,7 @@ func (s *memSeries) minTime() int64 { } func (s *memSeries) maxTime() int64 { + // The highest timestamps will always be in the regular (non-OOO) chunks, even if OOO is enabled. 
c := s.head() if c != nil { return c.maxTime @@ -1588,26 +1839,39 @@ func (s *memSeries) maxTime() int64 { // truncateChunksBefore removes all chunks from the series that // have no timestamp at or after mint. // Chunk IDs remain unchanged. -func (s *memSeries) truncateChunksBefore(mint int64) (removed int) { +func (s *memSeries) truncateChunksBefore(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) int { + var removedInOrder int if s.headChunk != nil && s.headChunk.maxTime < mint { // If head chunk is truncated, we can truncate all mmapped chunks. - removed = 1 + len(s.mmappedChunks) - s.firstChunkID += chunks.HeadChunkID(removed) + removedInOrder = 1 + len(s.mmappedChunks) + s.firstChunkID += chunks.HeadChunkID(removedInOrder) s.headChunk = nil s.mmappedChunks = nil - return removed } if len(s.mmappedChunks) > 0 { for i, c := range s.mmappedChunks { if c.maxTime >= mint { break } - removed = i + 1 + removedInOrder = i + 1 } - s.mmappedChunks = append(s.mmappedChunks[:0], s.mmappedChunks[removed:]...) - s.firstChunkID += chunks.HeadChunkID(removed) + s.mmappedChunks = append(s.mmappedChunks[:0], s.mmappedChunks[removedInOrder:]...) + s.firstChunkID += chunks.HeadChunkID(removedInOrder) } - return removed + + var removedOOO int + if len(s.oooMmappedChunks) > 0 { + for i, c := range s.oooMmappedChunks { + if c.ref.GreaterThan(minOOOMmapRef) { + break + } + removedOOO = i + 1 + } + s.oooMmappedChunks = append(s.oooMmappedChunks[:0], s.oooMmappedChunks[removedOOO:]...) + s.firstOOOChunkID += chunks.HeadChunkID(removedOOO) + } + + return removedInOrder + removedOOO } // cleanupAppendIDsBelow cleans up older appendIDs. Has to be called after @@ -1627,6 +1891,16 @@ type memChunk struct { minTime, maxTime int64 } +type oooHeadChunk struct { + chunk *OOOChunk + minTime, maxTime int64 // can probably be removed and pulled out of the chunk instead +} + +// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt]. 
+func (mc *oooHeadChunk) OverlapsClosedInterval(mint, maxt int64) bool { + return overlapsClosedInterval(mc.minTime, mc.maxTime, mint, maxt) +} + // OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt]. func (mc *memChunk) OverlapsClosedInterval(mint, maxt int64) bool { return overlapsClosedInterval(mc.minTime, mc.maxTime, mint, maxt) @@ -1655,12 +1929,15 @@ func (noopSeriesLifecycleCallback) PostCreation(labels.Labels) {} func (noopSeriesLifecycleCallback) PostDeletion(...labels.Labels) {} func (h *Head) Size() int64 { - var walSize int64 + var walSize, wblSize int64 if h.wal != nil { walSize, _ = h.wal.Size() } + if h.wbl != nil { + wblSize, _ = h.wbl.Size() + } cdmSize, _ := h.chunkDiskMapper.Size() - return walSize + cdmSize + return walSize + wblSize + cdmSize } func (h *RangeHead) Size() int64 { diff --git a/tsdb/head_append.go b/tsdb/head_append.go index 1331fbe252..cbd6ad8e2b 100644 --- a/tsdb/head_append.go +++ b/tsdb/head_append.go @@ -137,6 +137,8 @@ func (h *Head) appender() *headAppender { minValidTime: h.appendableMinValidTime(), mint: math.MaxInt64, maxt: math.MinInt64, + headMaxt: h.MaxTime(), + oooTimeWindow: h.opts.OutOfOrderTimeWindow.Load(), samples: h.getAppendBuffer(), sampleSeries: h.getSeriesBuffer(), exemplars: exemplarsBuf, @@ -252,9 +254,11 @@ type exemplarWithSeriesRef struct { } type headAppender struct { - head *Head - minValidTime int64 // No samples below this timestamp are allowed. - mint, maxt int64 + head *Head + minValidTime int64 // No samples below this timestamp are allowed. + mint, maxt int64 + headMaxt int64 // We track it here to not take the lock for every sample appended. + oooTimeWindow int64 // Use the same for the entire append, and don't load the atomic for each sample. series []record.RefSeries // New series held by this appender. metadata []record.RefMetadata // New metadata held by this appender. 
@@ -268,7 +272,9 @@ type headAppender struct {
 }
 
 func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64, v float64) (storage.SeriesRef, error) {
-	if t < a.minValidTime {
+	// For OOO inserts, this restriction is irrelevant and will be checked later once we confirm the sample is an in-order append.
+	// If OOO inserts are disabled, we may as well check this as early as we can and avoid more work.
+	if a.oooTimeWindow == 0 && t < a.minValidTime {
 		a.head.metrics.outOfBoundSamples.Inc()
 		return 0, storage.ErrOutOfBounds
 	}
@@ -300,15 +306,25 @@ func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64
 	}
 
 	s.Lock()
-	if err := s.appendable(t, v); err != nil {
-		s.Unlock()
-		if err == storage.ErrOutOfOrderSample {
+	// TODO(codesome): If we definitely know at this point that the sample is ooo, then optimise
+	// to skip that sample from the WAL and write only in the WBL.
+	_, delta, err := s.appendable(t, v, a.headMaxt, a.minValidTime, a.oooTimeWindow)
+	if err == nil {
+		s.pendingCommit = true
+	}
+	s.Unlock()
+	if delta > 0 {
+		a.head.metrics.oooHistogram.Observe(float64(delta))
+	}
+	if err != nil {
+		switch err {
+		case storage.ErrOutOfOrderSample:
 			a.head.metrics.outOfOrderSamples.Inc()
+		case storage.ErrTooOldSample:
+			a.head.metrics.tooOldSamples.Inc()
 		}
 		return 0, err
 	}
-	s.pendingCommit = true
-	s.Unlock()
 
 	if t < a.mint {
 		a.mint = t
@@ -326,25 +342,46 @@ func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64
 	return storage.SeriesRef(s.ref), nil
 }
 
-// appendable checks whether the given sample is valid for appending to the series.
-func (s *memSeries) appendable(t int64, v float64) error {
-	c := s.head()
-	if c == nil {
-		return nil
+// appendable checks whether the given sample is valid for appending to the series (if we return false and no error).
+// The sample belongs to the out of order chunk if we return true and no error.
+// An error signifies the sample cannot be handled.
+func (s *memSeries) appendable(t int64, v float64, headMaxt, minValidTime, oooTimeWindow int64) (isOOO bool, oooDelta int64, err error) { + // Check if we can append in the in-order chunk. + if t >= minValidTime { + if s.head() == nil { + // The series has no sample and was freshly created. + return false, 0, nil + } + msMaxt := s.maxTime() + if t > msMaxt { + return false, 0, nil + } + if t == msMaxt { + // We are allowing exact duplicates as we can encounter them in valid cases + // like federation and erroring out at that time would be extremely noisy. + // This only checks against the latest in-order sample. + // The OOO headchunk has its own method to detect these duplicates. + if math.Float64bits(s.sampleBuf[3].v) != math.Float64bits(v) { + return false, 0, storage.ErrDuplicateSampleForTimestamp + } + // Sample is identical (ts + value) with most current (highest ts) sample in sampleBuf. + return false, 0, nil + } } - if t > c.maxTime { - return nil + // The sample cannot go in the in-order chunk. Check if it can go in the out-of-order chunk. + if oooTimeWindow > 0 && t >= headMaxt-oooTimeWindow { + return true, headMaxt - t, nil } - if t < c.maxTime { - return storage.ErrOutOfOrderSample + + // The sample cannot go in both in-order and out-of-order chunk. + if oooTimeWindow > 0 { + return true, headMaxt - t, storage.ErrTooOldSample } - // We are allowing exact duplicates as we can encounter them in valid cases - // like federation and erroring out at that time would be extremely noisy. 
- if math.Float64bits(s.sampleBuf[3].v) != math.Float64bits(v) { - return storage.ErrDuplicateSampleForTimestamp + if t < minValidTime { + return false, headMaxt - t, storage.ErrOutOfBounds } - return nil + return false, headMaxt - t, storage.ErrOutOfOrderSample } // AppendExemplar for headAppender assumes the series ref already exists, and so it doesn't @@ -487,6 +524,7 @@ func exemplarsForEncoding(es []exemplarWithSeriesRef) []record.RefExemplar { } // Commit writes to the WAL and adds the data to the Head. +// TODO(codesome): Refactor this method to reduce indentation and make it more readable. func (a *headAppender) Commit() (err error) { if a.closed { return ErrAppenderClosed @@ -517,24 +555,143 @@ func (a *headAppender) Commit() (err error) { defer a.head.putMetadataBuffer(a.metadata) defer a.head.iso.closeAppend(a.appendID) - total := len(a.samples) - var series *memSeries + var ( + samplesAppended = len(a.samples) + oooAccepted int // number of samples out of order but accepted: with ooo enabled and within time window + oooRejected int // number of samples rejected due to: out of order but OOO support disabled. + tooOldRejected int // number of samples rejected due to: that are out of order but too old (OOO support enabled, but outside time window) + oobRejected int // number of samples rejected due to: out of bounds: with t < minValidTime (OOO support disabled) + inOrderMint int64 = math.MaxInt64 + inOrderMaxt int64 = math.MinInt64 + ooomint int64 = math.MaxInt64 + ooomaxt int64 = math.MinInt64 + wblSamples []record.RefSample + oooMmapMarkers map[chunks.HeadSeriesRef]chunks.ChunkDiskMapperRef + oooRecords [][]byte + series *memSeries + enc record.Encoder + ) + defer func() { + for i := range oooRecords { + a.head.putBytesBuffer(oooRecords[i][:0]) + } + }() + collectOOORecords := func() { + if a.head.wbl == nil { + // WBL is not enabled. So no need to collect. 
+ wblSamples = nil + oooMmapMarkers = nil + return + } + // The m-map happens before adding a new sample. So we collect + // the m-map markers first, and then samples. + // WBL Graphically: + // WBL Before this Commit(): [old samples before this commit for chunk 1] + // WBL After this Commit(): [old samples before this commit for chunk 1][new samples in this commit for chunk 1]mmapmarker1[samples for chunk 2]mmapmarker2[samples for chunk 3] + if oooMmapMarkers != nil { + markers := make([]record.RefMmapMarker, 0, len(oooMmapMarkers)) + for ref, mmapRef := range oooMmapMarkers { + markers = append(markers, record.RefMmapMarker{ + Ref: ref, + MmapRef: mmapRef, + }) + } + r := enc.MmapMarkers(markers, a.head.getBytesBuffer()) + oooRecords = append(oooRecords, r) + } + + if len(wblSamples) > 0 { + r := enc.Samples(wblSamples, a.head.getBytesBuffer()) + oooRecords = append(oooRecords, r) + } + + wblSamples = nil + oooMmapMarkers = nil + } for i, s := range a.samples { series = a.sampleSeries[i] series.Lock() - ok, chunkCreated := series.append(s.T, s.V, a.appendID, a.head.chunkDiskMapper) - series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow) - series.pendingCommit = false - series.Unlock() - if !ok { - total-- - a.head.metrics.outOfOrderSamples.Inc() + oooSample, _, err := series.appendable(s.T, s.V, a.headMaxt, a.minValidTime, a.oooTimeWindow) + switch err { + case storage.ErrOutOfOrderSample: + samplesAppended-- + oooRejected++ + case storage.ErrOutOfBounds: + samplesAppended-- + oobRejected++ + case storage.ErrTooOldSample: + samplesAppended-- + tooOldRejected++ + case nil: + // Do nothing. + default: + samplesAppended-- } + + var ok, chunkCreated bool + + if err == nil && oooSample { + // Sample is OOO and OOO handling is enabled + // and the delta is within the OOO tolerance. 
+ var mmapRef chunks.ChunkDiskMapperRef + ok, chunkCreated, mmapRef = series.insert(s.T, s.V, a.head.chunkDiskMapper) + if chunkCreated { + r, ok := oooMmapMarkers[series.ref] + if !ok || r != 0 { + // !ok means there are no markers collected for these samples yet. So we first flush the samples + // before setting this m-map marker. + + // r != 0 means we have already m-mapped a chunk for this series in the same Commit(). + // Hence, before we m-map again, we should add the samples and m-map markers + // seen till now to the WBL records. + collectOOORecords() + } + + if oooMmapMarkers == nil { + oooMmapMarkers = make(map[chunks.HeadSeriesRef]chunks.ChunkDiskMapperRef) + } + oooMmapMarkers[series.ref] = mmapRef + } + if ok { + wblSamples = append(wblSamples, s) + if s.T < ooomint { + ooomint = s.T + } + if s.T > ooomaxt { + ooomaxt = s.T + } + oooAccepted++ + } else { + // Sample is an exact duplicate of the last sample. + // NOTE: We can only detect updates if they clash with a sample in the OOOHeadChunk, + // not with samples in already flushed OOO chunks. + // TODO(codesome): Add error reporting? It depends on addressing https://github.com/prometheus/prometheus/discussions/10305. + samplesAppended-- + } + } else if err == nil { + ok, chunkCreated = series.append(s.T, s.V, a.appendID, a.head.chunkDiskMapper) + if ok { + if s.T < inOrderMint { + inOrderMint = s.T + } + if s.T > inOrderMaxt { + inOrderMaxt = s.T + } + } else { + // The sample is an exact duplicate, and should be silently dropped. 
+ samplesAppended-- + } + } + if chunkCreated { a.head.metrics.chunks.Inc() a.head.metrics.chunksCreated.Inc() } + + series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow) + series.pendingCommit = false + series.Unlock() } for i, m := range a.metadata { @@ -544,12 +701,48 @@ func (a *headAppender) Commit() (err error) { series.Unlock() } - a.head.metrics.samplesAppended.Add(float64(total)) - a.head.updateMinMaxTime(a.mint, a.maxt) + a.head.metrics.outOfOrderSamples.Add(float64(oooRejected)) + a.head.metrics.outOfBoundSamples.Add(float64(oobRejected)) + a.head.metrics.tooOldSamples.Add(float64(tooOldRejected)) + a.head.metrics.samplesAppended.Add(float64(samplesAppended)) + a.head.metrics.outOfOrderSamplesAppended.Add(float64(oooAccepted)) + a.head.updateMinMaxTime(inOrderMint, inOrderMaxt) + a.head.updateMinOOOMaxOOOTime(ooomint, ooomaxt) + collectOOORecords() + if a.head.wbl != nil { + if err := a.head.wbl.Log(oooRecords...); err != nil { + // TODO(codesome): Currently WBL logging of ooo samples is best effort here since we cannot try logging + // until we have found what samples become OOO. We can try having a metric for this failure. + // Returning the error here is not correct because we have already put the samples into the memory, + // hence the append/insert was a success. + level.Error(a.head.logger).Log("msg", "Failed to log out of order samples into the WAL", "err", err) + } + } return nil } +// insert is like append, except it inserts. Used for OOO samples. +func (s *memSeries) insert(t int64, v float64, chunkDiskMapper *chunks.ChunkDiskMapper) (inserted, chunkCreated bool, mmapRef chunks.ChunkDiskMapperRef) { + c := s.oooHeadChunk + if c == nil || c.chunk.NumSamples() == int(s.oooCapMax) { + // Note: If no new samples come in then we rely on compaction to clean up stale in-memory OOO chunks. 
+ c, mmapRef = s.cutNewOOOHeadChunk(t, chunkDiskMapper) + chunkCreated = true + } + + ok := c.chunk.Insert(t, v) + if ok { + if chunkCreated || t < c.minTime { + c.minTime = t + } + if chunkCreated || t > c.maxTime { + c.maxTime = t + } + } + return ok, chunkCreated, mmapRef +} + // append adds the sample (t, v) to the series. The caller also has to provide // the appendID for isolation. (The appendID can be zero, which results in no // isolation for this append.) @@ -567,7 +760,7 @@ func (s *memSeries) append(t int64, v float64, appendID uint64, chunkDiskMapper // Out of order sample. Sample timestamp is already in the mmapped chunks, so ignore it. return false, false } - // There is no chunk in this series yet, create the first chunk for the sample. + // There is no head chunk in this series yet, create the first chunk for the sample. c = s.cutNewHeadChunk(t, chunkDiskMapper) chunkCreated = true } @@ -651,6 +844,36 @@ func (s *memSeries) cutNewHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDis return s.headChunk } +func (s *memSeries) cutNewOOOHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDiskMapper) (*oooHeadChunk, chunks.ChunkDiskMapperRef) { + ref := s.mmapCurrentOOOHeadChunk(chunkDiskMapper) + + s.oooHeadChunk = &oooHeadChunk{ + chunk: NewOOOChunk(), + minTime: mint, + maxTime: math.MinInt64, + } + + return s.oooHeadChunk, ref +} + +func (s *memSeries) mmapCurrentOOOHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper) chunks.ChunkDiskMapperRef { + if s.oooHeadChunk == nil { + // There is no head chunk, so nothing to m-map here. + return 0 + } + xor, _ := s.oooHeadChunk.chunk.ToXOR() // Encode to XorChunk which is more compact and implements all of the needed functionality. 
+ oooXor := &chunkenc.OOOXORChunk{XORChunk: xor} + chunkRef := chunkDiskMapper.WriteChunk(s.ref, s.oooHeadChunk.minTime, s.oooHeadChunk.maxTime, oooXor, handleChunkWriteError) + s.oooMmappedChunks = append(s.oooMmappedChunks, &mmappedChunk{ + ref: chunkRef, + numSamples: uint16(xor.NumSamples()), + minTime: s.oooHeadChunk.minTime, + maxTime: s.oooHeadChunk.maxTime, + }) + s.oooHeadChunk = nil + return chunkRef +} + func (s *memSeries) mmapCurrentHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper) { if s.headChunk == nil { // There is no head chunk, so nothing to m-map here. diff --git a/tsdb/head_bench_test.go b/tsdb/head_bench_test.go index c0f07a00f2..2f8e0ba374 100644 --- a/tsdb/head_bench_test.go +++ b/tsdb/head_bench_test.go @@ -30,7 +30,7 @@ func BenchmarkHeadStripeSeriesCreate(b *testing.B) { opts := DefaultHeadOptions() opts.ChunkRange = 1000 opts.ChunkDirRoot = chunkDir - h, err := NewHead(nil, nil, nil, opts, nil) + h, err := NewHead(nil, nil, nil, nil, opts, nil) require.NoError(b, err) defer h.Close() @@ -45,7 +45,7 @@ func BenchmarkHeadStripeSeriesCreateParallel(b *testing.B) { opts := DefaultHeadOptions() opts.ChunkRange = 1000 opts.ChunkDirRoot = chunkDir - h, err := NewHead(nil, nil, nil, opts, nil) + h, err := NewHead(nil, nil, nil, nil, opts, nil) require.NoError(b, err) defer h.Close() @@ -69,7 +69,7 @@ func BenchmarkHeadStripeSeriesCreate_PreCreationFailure(b *testing.B) { // Mock the PreCreation() callback to fail on each series. 
opts.SeriesCallback = failingSeriesLifecycleCallback{} - h, err := NewHead(nil, nil, nil, opts, nil) + h, err := NewHead(nil, nil, nil, nil, opts, nil) require.NoError(b, err) defer h.Close() diff --git a/tsdb/head_read.go b/tsdb/head_read.go index ca34b9bbdd..5b2a70c03d 100644 --- a/tsdb/head_read.go +++ b/tsdb/head_read.go @@ -183,11 +183,20 @@ func (h *headIndexReader) Series(ref storage.SeriesRef, lbls *labels.Labels, chk return nil } -// headChunkID returns the HeadChunkID corresponding to .mmappedChunks[pos] +// headChunkID returns the HeadChunkID referred to by the given position. +// * 0 <= pos < len(s.mmappedChunks) refer to s.mmappedChunks[pos] +// * pos == len(s.mmappedChunks) refers to s.headChunk func (s *memSeries) headChunkID(pos int) chunks.HeadChunkID { return chunks.HeadChunkID(pos) + s.firstChunkID } +// oooHeadChunkID returns the HeadChunkID referred to by the given position. +// * 0 <= pos < len(s.oooMmappedChunks) refer to s.oooMmappedChunks[pos] +// * pos == len(s.oooMmappedChunks) refers to s.oooHeadChunk +func (s *memSeries) oooHeadChunkID(pos int) chunks.HeadChunkID { + return chunks.HeadChunkID(pos) + s.firstOOOChunkID +} + // LabelValueFor returns label value for the given label name in the series referred to by ID. func (h *headIndexReader) LabelValueFor(id storage.SeriesRef, label string) (string, error) { memSeries := h.head.series.getByID(chunks.HeadSeriesRef(id)) @@ -258,8 +267,8 @@ func (h *headChunkReader) Close() error { } // Chunk returns the chunk for the reference number. -func (h *headChunkReader) Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error) { - sid, cid := chunks.HeadChunkRef(ref).Unpack() +func (h *headChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) { + sid, cid := chunks.HeadChunkRef(meta.Ref).Unpack() s := h.head.series.getByID(sid) // This means that the series has been garbage collected. 
@@ -330,6 +339,260 @@ func (s *memSeries) chunk(id chunks.HeadChunkID, chunkDiskMapper *chunks.ChunkDi return mc, true, nil } +// oooMergedChunk returns the requested chunk based on the given chunks.Meta +// reference from memory or by m-mapping it from the disk. The returned chunk +// might be a merge of all the overlapping chunks, if any, amongst all the +// chunks in the OOOHead. +// This function is not thread safe unless the caller holds a lock. +func (s *memSeries) oooMergedChunk(meta chunks.Meta, cdm *chunks.ChunkDiskMapper, mint, maxt int64) (chunk *mergedOOOChunks, err error) { + _, cid := chunks.HeadChunkRef(meta.Ref).Unpack() + + // ix represents the index of chunk in the s.mmappedChunks slice. The chunk meta's are + // incremented by 1 when new chunk is created, hence (meta - firstChunkID) gives the slice index. + // The max index for the s.mmappedChunks slice can be len(s.mmappedChunks)-1, hence if the ix + // is len(s.mmappedChunks), it represents the next chunk, which is the head chunk. + ix := int(cid) - int(s.firstOOOChunkID) + if ix < 0 || ix > len(s.oooMmappedChunks) { + return nil, storage.ErrNotFound + } + + if ix == len(s.oooMmappedChunks) { + if s.oooHeadChunk == nil { + return nil, errors.New("invalid ooo head chunk") + } + } + + // We create a temporary slice of chunk metas to hold the information of all + // possible chunks that may overlap with the requested chunk. + tmpChks := make([]chunkMetaAndChunkDiskMapperRef, 0, len(s.oooMmappedChunks)) + + oooHeadRef := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.oooMmappedChunks)))) + if s.oooHeadChunk != nil && s.oooHeadChunk.OverlapsClosedInterval(mint, maxt) { + // We only want to append the head chunk if this chunk existed when + // Series() was called. This brings consistency in case new data + // is added in between Series() and Chunk() calls. 
+ if oooHeadRef == meta.OOOLastRef { + tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{ + meta: chunks.Meta{ + // Ignoring samples added before and after the last known min and max time for this chunk. + MinTime: meta.OOOLastMinTime, + MaxTime: meta.OOOLastMaxTime, + Ref: oooHeadRef, + }, + }) + } + } + + for i, c := range s.oooMmappedChunks { + chunkRef := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(i))) + // We can skip chunks that came in later than the last known OOOLastRef. + if chunkRef > meta.OOOLastRef { + break + } + + if chunkRef == meta.OOOLastRef { + tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{ + meta: chunks.Meta{ + MinTime: meta.OOOLastMinTime, + MaxTime: meta.OOOLastMaxTime, + Ref: chunkRef, + }, + ref: c.ref, + origMinT: c.minTime, + origMaxT: c.maxTime, + }) + } else if c.OverlapsClosedInterval(mint, maxt) { + tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{ + meta: chunks.Meta{ + MinTime: c.minTime, + MaxTime: c.maxTime, + Ref: chunkRef, + }, + ref: c.ref, + }) + } + } + + // Next we want to sort all the collected chunks by min time so we can find + // those that overlap and stop when we know the rest don't. + sort.Sort(byMinTimeAndMinRef(tmpChks)) + + mc := &mergedOOOChunks{} + absoluteMax := int64(math.MinInt64) + for _, c := range tmpChks { + if c.meta.Ref != meta.Ref && (len(mc.chunks) == 0 || c.meta.MinTime > absoluteMax) { + continue + } + if c.meta.Ref == oooHeadRef { + var xor *chunkenc.XORChunk + // If head chunk min and max time match the meta OOO markers + // that means that the chunk has not expanded so we can append + // it as it is. + if s.oooHeadChunk.minTime == meta.OOOLastMinTime && s.oooHeadChunk.maxTime == meta.OOOLastMaxTime { + xor, err = s.oooHeadChunk.chunk.ToXOR() // TODO(jesus.vazquez) (This is an optimization idea that has no priority and might not be that useful) See if we could use a copy of the underlying slice. 
That would leave the more expensive ToXOR() function only for the use case where Bytes() is called.
+			} else {
+				// We need to remove samples that are outside of the markers
+				xor, err = s.oooHeadChunk.chunk.ToXORBetweenTimestamps(meta.OOOLastMinTime, meta.OOOLastMaxTime)
+			}
+			if err != nil {
+				return nil, errors.Wrap(err, "failed to convert ooo head chunk to xor chunk")
+			}
+			c.meta.Chunk = xor
+		} else {
+			chk, err := cdm.Chunk(c.ref)
+			if err != nil {
+				if _, ok := err.(*chunks.CorruptionErr); ok {
+					return nil, errors.Wrap(err, "invalid ooo mmapped chunk")
+				}
+				return nil, err
+			}
+			if c.meta.Ref == meta.OOOLastRef &&
+				(c.origMinT != meta.OOOLastMinTime || c.origMaxT != meta.OOOLastMaxTime) {
+				// The head expanded and was memory mapped so now we need to
+				// wrap the chunk within a chunk that doesn't allow us to iterate
+				// through samples out of the OOOLastMinT and OOOLastMaxT
+				// markers.
+				c.meta.Chunk = boundedChunk{chk, meta.OOOLastMinTime, meta.OOOLastMaxTime}
+			} else {
+				c.meta.Chunk = chk
+			}
+		}
+		mc.chunks = append(mc.chunks, c.meta)
+		if c.meta.MaxTime > absoluteMax {
+			absoluteMax = c.meta.MaxTime
+		}
+	}
+
+	return mc, nil
+}
+
+var _ chunkenc.Chunk = &mergedOOOChunks{}
+
+// mergedOOOChunks holds the list of overlapping chunks. This struct satisfies
+// chunkenc.Chunk.
+type mergedOOOChunks struct {
+	chunks []chunks.Meta
+}
+
+// Bytes is a very expensive method because it's calling the iterator of all the
+// chunks in the mergedOOOChunk and building a new chunk with the samples.
+func (o mergedOOOChunks) Bytes() []byte { + xc := chunkenc.NewXORChunk() + app, err := xc.Appender() + if err != nil { + panic(err) + } + it := o.Iterator(nil) + for it.Next() { + t, v := it.At() + app.Append(t, v) + } + + return xc.Bytes() +} + +func (o mergedOOOChunks) Encoding() chunkenc.Encoding { + return chunkenc.EncXOR +} + +func (o mergedOOOChunks) Appender() (chunkenc.Appender, error) { + return nil, errors.New("can't append to mergedOOOChunks") +} + +func (o mergedOOOChunks) Iterator(iterator chunkenc.Iterator) chunkenc.Iterator { + iterators := make([]chunkenc.Iterator, 0, len(o.chunks)) + for _, c := range o.chunks { + iterators = append(iterators, c.Chunk.Iterator(nil)) + } + return storage.NewChainSampleIterator(iterators) +} + +func (o mergedOOOChunks) NumSamples() int { + samples := 0 + for _, c := range o.chunks { + samples += c.Chunk.NumSamples() + } + return samples +} + +func (o mergedOOOChunks) Compact() {} + +var _ chunkenc.Chunk = &boundedChunk{} + +// boundedChunk is an implementation of chunkenc.Chunk that uses a +// boundedIterator that only iterates through samples which timestamps are +// >= minT and <= maxT +type boundedChunk struct { + chunkenc.Chunk + minT int64 + maxT int64 +} + +func (b boundedChunk) Bytes() []byte { + xor := chunkenc.NewXORChunk() + a, _ := xor.Appender() + it := b.Iterator(nil) + for it.Next() { + t, v := it.At() + a.Append(t, v) + } + return xor.Bytes() +} + +func (b boundedChunk) Iterator(iterator chunkenc.Iterator) chunkenc.Iterator { + it := b.Chunk.Iterator(iterator) + if it == nil { + panic("iterator shouldn't be nil") + } + return boundedIterator{it, b.minT, b.maxT} +} + +var _ chunkenc.Iterator = &boundedIterator{} + +// boundedIterator is an implementation of Iterator that only iterates through +// samples which timestamps are >= minT and <= maxT +type boundedIterator struct { + chunkenc.Iterator + minT int64 + maxT int64 +} + +// Next the first time its called it will advance as many positions as 
necessary +// until its able to find a sample within the bounds minT and maxT. +// If there are samples within bounds it will advance one by one amongst them. +// If there are no samples within bounds it will return false. +func (b boundedIterator) Next() bool { + for b.Iterator.Next() { + t, _ := b.Iterator.At() + if t < b.minT { + continue + } else if t > b.maxT { + return false + } + return true + } + return false +} + +func (b boundedIterator) Seek(t int64) bool { + if t < b.minT { + // We must seek at least up to b.minT if it is asked for something before that. + ok := b.Iterator.Seek(b.minT) + if !ok { + return false + } + t, _ := b.Iterator.At() + return t <= b.maxT + } + if t > b.maxT { + // We seek anyway so that the subsequent Next() calls will also return false. + b.Iterator.Seek(t) + return false + } + return b.Iterator.Seek(t) +} + +// safeChunk makes sure that the chunk can be accessed without a race condition type safeChunk struct { chunkenc.Chunk s *memSeries diff --git a/tsdb/head_read_test.go b/tsdb/head_read_test.go new file mode 100644 index 0000000000..4c3ba885bb --- /dev/null +++ b/tsdb/head_read_test.go @@ -0,0 +1,178 @@ +// Copyright 2021 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tsdb + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/prometheus/prometheus/tsdb/chunkenc" +) + +func TestBoundedChunk(t *testing.T) { + tests := []struct { + name string + inputChunk chunkenc.Chunk + inputMinT int64 + inputMaxT int64 + initialSeek int64 + seekIsASuccess bool + expSamples []sample + }{ + { + name: "if there are no samples it returns nothing", + inputChunk: newTestChunk(0), + expSamples: nil, + }, + { + name: "bounds represent a single sample", + inputChunk: newTestChunk(10), + expSamples: []sample{ + {0, 0}, + }, + }, + { + name: "if there are bounds set only samples within them are returned", + inputChunk: newTestChunk(10), + inputMinT: 1, + inputMaxT: 8, + expSamples: []sample{ + {1, 1}, + {2, 2}, + {3, 3}, + {4, 4}, + {5, 5}, + {6, 6}, + {7, 7}, + {8, 8}, + }, + }, + { + name: "if bounds set and only maxt is less than actual maxt", + inputChunk: newTestChunk(10), + inputMinT: 0, + inputMaxT: 5, + expSamples: []sample{ + {0, 0}, + {1, 1}, + {2, 2}, + {3, 3}, + {4, 4}, + {5, 5}, + }, + }, + { + name: "if bounds set and only mint is more than actual mint", + inputChunk: newTestChunk(10), + inputMinT: 5, + inputMaxT: 9, + expSamples: []sample{ + {5, 5}, + {6, 6}, + {7, 7}, + {8, 8}, + {9, 9}, + }, + }, + { + name: "if there are bounds set with seek before mint", + inputChunk: newTestChunk(10), + inputMinT: 3, + inputMaxT: 7, + initialSeek: 1, + seekIsASuccess: true, + expSamples: []sample{ + {3, 3}, + {4, 4}, + {5, 5}, + {6, 6}, + {7, 7}, + }, + }, + { + name: "if there are bounds set with seek between mint and maxt", + inputChunk: newTestChunk(10), + inputMinT: 3, + inputMaxT: 7, + initialSeek: 5, + seekIsASuccess: true, + expSamples: []sample{ + {5, 5}, + {6, 6}, + {7, 7}, + }, + }, + { + name: "if there are bounds set with seek after maxt", + inputChunk: newTestChunk(10), + inputMinT: 3, + inputMaxT: 7, + initialSeek: 8, + seekIsASuccess: false, + }, + } + for _, tc := range tests { + 
t.Run(fmt.Sprintf("name=%s", tc.name), func(t *testing.T) { + chunk := boundedChunk{tc.inputChunk, tc.inputMinT, tc.inputMaxT} + + // Testing Bytes() + expChunk := chunkenc.NewXORChunk() + if tc.inputChunk.NumSamples() > 0 { + app, err := expChunk.Appender() + require.NoError(t, err) + for ts := tc.inputMinT; ts <= tc.inputMaxT; ts++ { + app.Append(ts, float64(ts)) + } + } + require.Equal(t, expChunk.Bytes(), chunk.Bytes()) + + var samples []sample + it := chunk.Iterator(nil) + + if tc.initialSeek != 0 { + // Testing Seek() + ok := it.Seek(tc.initialSeek) + require.Equal(t, tc.seekIsASuccess, ok) + if ok { + t, v := it.At() + samples = append(samples, sample{t, v}) + } + } + + // Testing Next() + for it.Next() { + t, v := it.At() + samples = append(samples, sample{t, v}) + } + + // it.Next() should keep returning false. + for i := 0; i < 10; i++ { + require.False(t, it.Next()) + } + + require.Equal(t, tc.expSamples, samples) + }) + } +} + +func newTestChunk(numSamples int) chunkenc.Chunk { + xor := chunkenc.NewXORChunk() + a, _ := xor.Appender() + for i := 0; i < numSamples; i++ { + a.Append(int64(i), float64(i)) + } + return xor +} diff --git a/tsdb/head_test.go b/tsdb/head_test.go index 7c580406ce..489dad65c9 100644 --- a/tsdb/head_test.go +++ b/tsdb/head_test.go @@ -49,7 +49,7 @@ import ( "github.com/prometheus/prometheus/tsdb/wal" ) -func newTestHead(t testing.TB, chunkRange int64, compressWAL bool) (*Head, *wal.WAL) { +func newTestHead(t testing.TB, chunkRange int64, compressWAL, oooEnabled bool) (*Head, *wal.WAL) { dir := t.TempDir() wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compressWAL) require.NoError(t, err) @@ -59,18 +59,23 @@ func newTestHead(t testing.TB, chunkRange int64, compressWAL bool) (*Head, *wal. 
opts.ChunkDirRoot = dir opts.EnableExemplarStorage = true opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars) + if oooEnabled { + opts.OutOfOrderTimeWindow.Store(10 * time.Minute.Milliseconds()) + } - h, err := NewHead(nil, nil, wlog, opts, nil) + h, err := NewHead(nil, nil, wlog, nil, opts, nil) require.NoError(t, err) - require.NoError(t, h.chunkDiskMapper.IterateAllChunks(func(_ chunks.HeadSeriesRef, _ chunks.ChunkDiskMapperRef, _, _ int64, _ uint16) error { return nil })) + require.NoError(t, h.chunkDiskMapper.IterateAllChunks(func(_ chunks.HeadSeriesRef, _ chunks.ChunkDiskMapperRef, _, _ int64, _ uint16, _ chunkenc.Encoding) error { + return nil + })) return h, wlog } func BenchmarkCreateSeries(b *testing.B) { series := genSeries(b.N, 10, 0, 0) - h, _ := newTestHead(b, 10000, false) + h, _ := newTestHead(b, 10000, false, false) defer func() { require.NoError(b, h.Close()) }() @@ -224,7 +229,7 @@ func BenchmarkLoadWAL(b *testing.B) { require.NoError(b, err) for k := 0; k < c.batches*c.seriesPerBatch; k++ { // Create one mmapped chunk per series, with one sample at the given time. - s := newMemSeries(labels.Labels{}, chunks.HeadSeriesRef(k)*101, c.mmappedChunkT, defaultIsolationDisabled) + s := newMemSeries(labels.Labels{}, chunks.HeadSeriesRef(k)*101, c.mmappedChunkT, 1, defaultIsolationDisabled) s.append(c.mmappedChunkT, 42, 0, chunkDiskMapper) s.mmapCurrentHeadChunk(chunkDiskMapper) } @@ -255,7 +260,7 @@ func BenchmarkLoadWAL(b *testing.B) { opts := DefaultHeadOptions() opts.ChunkRange = 1000 opts.ChunkDirRoot = w.Dir() - h, err := NewHead(nil, nil, w, opts, nil) + h, err := NewHead(nil, nil, w, nil, opts, nil) require.NoError(b, err) h.Init(0) } @@ -271,7 +276,7 @@ func BenchmarkLoadWAL(b *testing.B) { // While appending the samples to the head it concurrently queries them from multiple go routines and verifies that the // returned results are correct. 
func TestHead_HighConcurrencyReadAndWrite(t *testing.T) { - head, _ := newTestHead(t, DefaultBlockDuration, false) + head, _ := newTestHead(t, DefaultBlockDuration, false, false) defer func() { require.NoError(t, head.Close()) }() @@ -487,7 +492,7 @@ func TestHead_ReadWAL(t *testing.T) { }, } - head, w := newTestHead(t, 1000, compress) + head, w := newTestHead(t, 1000, compress, false) defer func() { require.NoError(t, head.Close()) }() @@ -531,7 +536,7 @@ func TestHead_ReadWAL(t *testing.T) { } func TestHead_WALMultiRef(t *testing.T) { - head, w := newTestHead(t, 1000, false) + head, w := newTestHead(t, 1000, false, false) require.NoError(t, head.Init(0)) @@ -572,7 +577,7 @@ func TestHead_WALMultiRef(t *testing.T) { opts := DefaultHeadOptions() opts.ChunkRange = 1000 opts.ChunkDirRoot = w.Dir() - head, err = NewHead(nil, nil, w, opts, nil) + head, err = NewHead(nil, nil, w, nil, opts, nil) require.NoError(t, err) require.NoError(t, head.Init(0)) defer func() { @@ -591,7 +596,7 @@ func TestHead_WALMultiRef(t *testing.T) { } func TestHead_ActiveAppenders(t *testing.T) { - head, _ := newTestHead(t, 1000, false) + head, _ := newTestHead(t, 1000, false, false) defer head.Close() require.NoError(t, head.Init(0)) @@ -624,14 +629,14 @@ func TestHead_ActiveAppenders(t *testing.T) { } func TestHead_UnknownWALRecord(t *testing.T) { - head, w := newTestHead(t, 1000, false) + head, w := newTestHead(t, 1000, false, false) w.Log([]byte{255, 42}) require.NoError(t, head.Init(0)) require.NoError(t, head.Close()) } func TestHead_Truncate(t *testing.T) { - h, _ := newTestHead(t, 1000, false) + h, _ := newTestHead(t, 1000, false, false) defer func() { require.NoError(t, h.Close()) }() @@ -733,7 +738,7 @@ func TestMemSeries_truncateChunks(t *testing.T) { }, } - s := newMemSeries(labels.FromStrings("a", "b"), 1, 2000, defaultIsolationDisabled) + s := newMemSeries(labels.FromStrings("a", "b"), 1, 2000, 1, defaultIsolationDisabled) for i := 0; i < 4000; i += 5 { ok, _ := 
s.append(int64(i), float64(i), 0, chunkDiskMapper) @@ -752,7 +757,7 @@ func TestMemSeries_truncateChunks(t *testing.T) { require.NotNil(t, chk) require.NoError(t, err) - s.truncateChunksBefore(2000) + s.truncateChunksBefore(2000, 0) require.Equal(t, int64(2000), s.mmappedChunks[0].minTime) _, _, err = s.chunk(0, chunkDiskMapper, &memChunkPool) @@ -789,7 +794,7 @@ func TestHeadDeleteSeriesWithoutSamples(t *testing.T) { {Ref: 50, T: 90, V: 1}, }, } - head, w := newTestHead(t, 1000, compress) + head, w := newTestHead(t, 1000, compress, false) defer func() { require.NoError(t, head.Close()) }() @@ -857,7 +862,8 @@ func TestHeadDeleteSimple(t *testing.T) { for _, compress := range []bool{false, true} { t.Run(fmt.Sprintf("compress=%t", compress), func(t *testing.T) { for _, c := range cases { - head, w := newTestHead(t, 1000, compress) + head, w := newTestHead(t, 1000, compress, false) + require.NoError(t, head.Init(0)) app := head.Appender(context.Background()) for _, smpl := range smplsAll { @@ -887,7 +893,7 @@ func TestHeadDeleteSimple(t *testing.T) { opts := DefaultHeadOptions() opts.ChunkRange = 1000 opts.ChunkDirRoot = reloadedW.Dir() - reloadedHead, err := NewHead(nil, nil, reloadedW, opts, nil) + reloadedHead, err := NewHead(nil, nil, reloadedW, nil, opts, nil) require.NoError(t, err) require.NoError(t, reloadedHead.Init(0)) @@ -937,7 +943,7 @@ func TestHeadDeleteSimple(t *testing.T) { } func TestDeleteUntilCurMax(t *testing.T) { - hb, _ := newTestHead(t, 1000000, false) + hb, _ := newTestHead(t, 1000000, false, false) defer func() { require.NoError(t, hb.Close()) }() @@ -990,7 +996,7 @@ func TestDeletedSamplesAndSeriesStillInWALAfterCheckpoint(t *testing.T) { numSamples := 10000 // Enough samples to cause a checkpoint. 
- hb, w := newTestHead(t, int64(numSamples)*10, false) + hb, w := newTestHead(t, int64(numSamples)*10, false, false) for i := 0; i < numSamples; i++ { app := hb.Appender(context.Background()) @@ -1082,7 +1088,7 @@ func TestDelete_e2e(t *testing.T) { seriesMap[labels.New(l...).String()] = []tsdbutil.Sample{} } - hb, _ := newTestHead(t, 100000, false) + hb, _ := newTestHead(t, 100000, false, false) defer func() { require.NoError(t, hb.Close()) }() @@ -1271,7 +1277,7 @@ func TestMemSeries_append(t *testing.T) { require.NoError(t, chunkDiskMapper.Close()) }() - s := newMemSeries(labels.Labels{}, 1, 500, defaultIsolationDisabled) + s := newMemSeries(labels.Labels{}, 1, 500, 1, defaultIsolationDisabled) // Add first two samples at the very end of a chunk range and the next two // on and after it. @@ -1325,7 +1331,7 @@ func TestMemSeries_append_atVariableRate(t *testing.T) { require.NoError(t, chunkDiskMapper.Close()) }) - s := newMemSeries(labels.Labels{}, 1, DefaultBlockDuration, defaultIsolationDisabled) + s := newMemSeries(labels.Labels{}, 1, DefaultBlockDuration, 0, defaultIsolationDisabled) // At this slow rate, we will fill the chunk in two block durations. slowRate := (DefaultBlockDuration * 2) / samplesPerChunk @@ -1361,7 +1367,7 @@ func TestMemSeries_append_atVariableRate(t *testing.T) { func TestGCChunkAccess(t *testing.T) { // Put a chunk, select it. GC it and then access it. - h, _ := newTestHead(t, 1000, false) + h, _ := newTestHead(t, 1000, false, false) defer func() { require.NoError(t, h.Close()) }() @@ -1398,22 +1404,22 @@ func TestGCChunkAccess(t *testing.T) { cr, err := h.chunksRange(0, 1500, nil) require.NoError(t, err) - _, err = cr.Chunk(chunks[0].Ref) + _, err = cr.Chunk(chunks[0]) require.NoError(t, err) - _, err = cr.Chunk(chunks[1].Ref) + _, err = cr.Chunk(chunks[1]) require.NoError(t, err) require.NoError(t, h.Truncate(1500)) // Remove a chunk. 
- _, err = cr.Chunk(chunks[0].Ref) + _, err = cr.Chunk(chunks[0]) require.Equal(t, storage.ErrNotFound, err) - _, err = cr.Chunk(chunks[1].Ref) + _, err = cr.Chunk(chunks[1]) require.NoError(t, err) } func TestGCSeriesAccess(t *testing.T) { // Put a series, select it. GC it and then access it. - h, _ := newTestHead(t, 1000, false) + h, _ := newTestHead(t, 1000, false, false) defer func() { require.NoError(t, h.Close()) }() @@ -1450,23 +1456,23 @@ func TestGCSeriesAccess(t *testing.T) { cr, err := h.chunksRange(0, 2000, nil) require.NoError(t, err) - _, err = cr.Chunk(chunks[0].Ref) + _, err = cr.Chunk(chunks[0]) require.NoError(t, err) - _, err = cr.Chunk(chunks[1].Ref) + _, err = cr.Chunk(chunks[1]) require.NoError(t, err) require.NoError(t, h.Truncate(2000)) // Remove the series. require.Equal(t, (*memSeries)(nil), h.series.getByID(1)) - _, err = cr.Chunk(chunks[0].Ref) + _, err = cr.Chunk(chunks[0]) require.Equal(t, storage.ErrNotFound, err) - _, err = cr.Chunk(chunks[1].Ref) + _, err = cr.Chunk(chunks[1]) require.Equal(t, storage.ErrNotFound, err) } func TestUncommittedSamplesNotLostOnTruncate(t *testing.T) { - h, _ := newTestHead(t, 1000, false) + h, _ := newTestHead(t, 1000, false, false) defer func() { require.NoError(t, h.Close()) }() @@ -1496,7 +1502,7 @@ func TestUncommittedSamplesNotLostOnTruncate(t *testing.T) { } func TestRemoveSeriesAfterRollbackAndTruncate(t *testing.T) { - h, _ := newTestHead(t, 1000, false) + h, _ := newTestHead(t, 1000, false, false) defer func() { require.NoError(t, h.Close()) }() @@ -1529,7 +1535,7 @@ func TestRemoveSeriesAfterRollbackAndTruncate(t *testing.T) { func TestHead_LogRollback(t *testing.T) { for _, compress := range []bool{false, true} { t.Run(fmt.Sprintf("compress=%t", compress), func(t *testing.T) { - h, w := newTestHead(t, 1000, compress) + h, w := newTestHead(t, 1000, compress, false) defer func() { require.NoError(t, h.Close()) }() @@ -1606,7 +1612,7 @@ func TestWalRepair_DecodingError(t *testing.T) { opts := 
DefaultHeadOptions() opts.ChunkRange = 1 opts.ChunkDirRoot = w.Dir() - h, err := NewHead(nil, nil, w, opts, nil) + h, err := NewHead(nil, nil, w, nil, opts, nil) require.NoError(t, err) require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal)) initErr := h.Init(math.MinInt64) @@ -1660,7 +1666,8 @@ func TestHeadReadWriterRepair(t *testing.T) { opts := DefaultHeadOptions() opts.ChunkRange = chunkRange opts.ChunkDirRoot = dir - h, err := NewHead(nil, nil, w, opts, nil) + opts.ChunkWriteQueueSize = 1 // We need to set this option so that we use the async queue. Upstream prometheus uses the queue directly. + h, err := NewHead(nil, nil, w, nil, opts, nil) require.NoError(t, err) require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.mmapChunkCorruptionTotal)) require.NoError(t, h.Init(math.MinInt64)) @@ -1715,7 +1722,7 @@ func TestHeadReadWriterRepair(t *testing.T) { } func TestNewWalSegmentOnTruncate(t *testing.T) { - h, wlog := newTestHead(t, 1000, false) + h, wlog := newTestHead(t, 1000, false, false) defer func() { require.NoError(t, h.Close()) }() @@ -1745,7 +1752,7 @@ func TestNewWalSegmentOnTruncate(t *testing.T) { } func TestAddDuplicateLabelName(t *testing.T) { - h, _ := newTestHead(t, 1000, false) + h, _ := newTestHead(t, 1000, false, false) defer func() { require.NoError(t, h.Close()) }() @@ -1828,7 +1835,7 @@ func TestMemSeriesIsolation(t *testing.T) { } // Test isolation without restart of Head. - hb, _ := newTestHead(t, 1000, false) + hb, _ := newTestHead(t, 1000, false, false) i := addSamples(hb) testIsolation(hb, i) @@ -1890,7 +1897,7 @@ func TestMemSeriesIsolation(t *testing.T) { require.NoError(t, hb.Close()) // Test isolation with restart of Head. This is to verify the num samples of chunks after m-map chunk replay. 
- hb, w := newTestHead(t, 1000, false) + hb, w := newTestHead(t, 1000, false, false) i = addSamples(hb) require.NoError(t, hb.Close()) @@ -1899,7 +1906,7 @@ func TestMemSeriesIsolation(t *testing.T) { opts := DefaultHeadOptions() opts.ChunkRange = 1000 opts.ChunkDirRoot = wlog.Dir() - hb, err = NewHead(nil, nil, wlog, opts, nil) + hb, err = NewHead(nil, nil, wlog, nil, opts, nil) defer func() { require.NoError(t, hb.Close()) }() require.NoError(t, err) require.NoError(t, hb.Init(0)) @@ -1943,7 +1950,7 @@ func TestIsolationRollback(t *testing.T) { } // Rollback after a failed append and test if the low watermark has progressed anyway. - hb, _ := newTestHead(t, 1000, false) + hb, _ := newTestHead(t, 1000, false, false) defer func() { require.NoError(t, hb.Close()) }() @@ -1974,7 +1981,7 @@ func TestIsolationLowWatermarkMonotonous(t *testing.T) { t.Skip("skipping test since tsdb isolation is disabled") } - hb, _ := newTestHead(t, 1000, false) + hb, _ := newTestHead(t, 1000, false, false) defer func() { require.NoError(t, hb.Close()) }() @@ -2011,7 +2018,7 @@ func TestIsolationAppendIDZeroIsNoop(t *testing.T) { t.Skip("skipping test since tsdb isolation is disabled") } - h, _ := newTestHead(t, 1000, false) + h, _ := newTestHead(t, 1000, false, false) defer func() { require.NoError(t, h.Close()) }() @@ -2036,7 +2043,7 @@ func TestIsolationWithoutAdd(t *testing.T) { t.Skip("skipping test since tsdb isolation is disabled") } - hb, _ := newTestHead(t, 1000, false) + hb, _ := newTestHead(t, 1000, false, false) defer func() { require.NoError(t, hb.Close()) }() @@ -2131,7 +2138,7 @@ func TestOutOfOrderSamplesMetric(t *testing.T) { } func testHeadSeriesChunkRace(t *testing.T) { - h, _ := newTestHead(t, 1000, false) + h, _ := newTestHead(t, 1000, false, false) defer func() { require.NoError(t, h.Close()) }() @@ -2166,7 +2173,7 @@ func testHeadSeriesChunkRace(t *testing.T) { } func TestHeadLabelNamesValuesWithMinMaxRange(t *testing.T) { - head, _ := newTestHead(t, 1000, false) + 
head, _ := newTestHead(t, 1000, false, false) defer func() { require.NoError(t, head.Close()) }() @@ -2226,7 +2233,7 @@ func TestHeadLabelNamesValuesWithMinMaxRange(t *testing.T) { } func TestHeadLabelValuesWithMatchers(t *testing.T) { - head, _ := newTestHead(t, 1000, false) + head, _ := newTestHead(t, 1000, false, false) t.Cleanup(func() { require.NoError(t, head.Close()) }) app := head.Appender(context.Background()) @@ -2285,7 +2292,7 @@ func TestHeadLabelValuesWithMatchers(t *testing.T) { } func TestHeadLabelNamesWithMatchers(t *testing.T) { - head, _ := newTestHead(t, 1000, false) + head, _ := newTestHead(t, 1000, false, false) defer func() { require.NoError(t, head.Close()) }() @@ -2353,7 +2360,7 @@ func TestHeadLabelNamesWithMatchers(t *testing.T) { } func TestErrReuseAppender(t *testing.T) { - head, _ := newTestHead(t, 1000, false) + head, _ := newTestHead(t, 1000, false, false) defer func() { require.NoError(t, head.Close()) }() @@ -2389,7 +2396,7 @@ func TestErrReuseAppender(t *testing.T) { func TestHeadMintAfterTruncation(t *testing.T) { chunkRange := int64(2000) - head, _ := newTestHead(t, chunkRange, false) + head, _ := newTestHead(t, chunkRange, false, false) app := head.Appender(context.Background()) _, err := app.Append(0, labels.FromStrings("a", "b"), 100, 100) @@ -2423,7 +2430,7 @@ func TestHeadMintAfterTruncation(t *testing.T) { func TestHeadExemplars(t *testing.T) { chunkRange := int64(2000) - head, _ := newTestHead(t, chunkRange, false) + head, _ := newTestHead(t, chunkRange, false, false) app := head.Appender(context.Background()) l := labels.FromStrings("traceId", "123") @@ -2445,7 +2452,7 @@ func TestHeadExemplars(t *testing.T) { func BenchmarkHeadLabelValuesWithMatchers(b *testing.B) { chunkRange := int64(2000) - head, _ := newTestHead(b, chunkRange, false) + head, _ := newTestHead(b, chunkRange, false, false) b.Cleanup(func() { require.NoError(b, head.Close()) }) app := head.Appender(context.Background()) @@ -2483,7 +2490,7 @@ func 
TestMemSafeIteratorSeekIntoBuffer(t *testing.T) { require.NoError(t, chunkDiskMapper.Close()) }() - s := newMemSeries(labels.Labels{}, 1, 500, defaultIsolationDisabled) + s := newMemSeries(labels.Labels{}, 1, 500, 1, defaultIsolationDisabled) for i := 0; i < 7; i++ { ok, _ := s.append(int64(i), float64(i), 0, chunkDiskMapper) @@ -2754,7 +2761,7 @@ func TestWaitForPendingReadersInTimeRange(t *testing.T) { } func TestChunkSnapshot(t *testing.T) { - head, _ := newTestHead(t, 120*4, false) + head, _ := newTestHead(t, 120*4, false, false) defer func() { head.opts.EnableMemorySnapshotOnShutdown = false require.NoError(t, head.Close()) @@ -2833,7 +2840,7 @@ func TestChunkSnapshot(t *testing.T) { openHeadAndCheckReplay := func() { w, err := wal.NewSize(nil, nil, head.wal.Dir(), 32768, false) require.NoError(t, err) - head, err = NewHead(nil, nil, w, head.opts, nil) + head, err = NewHead(nil, nil, w, nil, head.opts, nil) require.NoError(t, err) require.NoError(t, head.Init(math.MinInt64)) @@ -2996,7 +3003,7 @@ func TestChunkSnapshot(t *testing.T) { } func TestSnapshotError(t *testing.T) { - head, _ := newTestHead(t, 120*4, false) + head, _ := newTestHead(t, 120*4, false, false) defer func() { head.opts.EnableMemorySnapshotOnShutdown = false require.NoError(t, head.Close()) @@ -3043,7 +3050,7 @@ func TestSnapshotError(t *testing.T) { w, err := wal.NewSize(nil, nil, head.wal.Dir(), 32768, false) require.NoError(t, err) // Testing https://github.com/prometheus/prometheus/issues/9437 with the registry. 
- head, err = NewHead(prometheus.NewRegistry(), nil, w, head.opts, nil) + head, err = NewHead(prometheus.NewRegistry(), nil, w, nil, head.opts, nil) require.NoError(t, err) require.NoError(t, head.Init(math.MinInt64)) @@ -3102,7 +3109,7 @@ func TestChunkSnapshotReplayBug(t *testing.T) { opts := DefaultHeadOptions() opts.ChunkDirRoot = dir opts.EnableMemorySnapshotOnShutdown = true - head, err := NewHead(nil, nil, wlog, opts, nil) + head, err := NewHead(nil, nil, wlog, nil, opts, nil) require.NoError(t, err) require.NoError(t, head.Init(math.MinInt64)) defer func() { @@ -3136,7 +3143,7 @@ func TestChunkSnapshotTakenAfterIncompleteSnapshot(t *testing.T) { opts := DefaultHeadOptions() opts.ChunkDirRoot = dir opts.EnableMemorySnapshotOnShutdown = true - head, err := NewHead(nil, nil, wlog, opts, nil) + head, err := NewHead(nil, nil, wlog, nil, opts, nil) require.NoError(t, err) require.NoError(t, head.Init(math.MinInt64)) @@ -3159,6 +3166,251 @@ func TestChunkSnapshotTakenAfterIncompleteSnapshot(t *testing.T) { require.Greater(t, offset, 0) } +// TestOOOWalReplay checks the replay at a low level. +// TODO(codesome): Needs test for ooo WAL repair. 
+func TestOOOWalReplay(t *testing.T) { + dir := t.TempDir() + wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true) + require.NoError(t, err) + oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true) + require.NoError(t, err) + + opts := DefaultHeadOptions() + opts.ChunkRange = 1000 + opts.ChunkDirRoot = dir + opts.OutOfOrderTimeWindow.Store(30 * time.Minute.Milliseconds()) + + h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil) + require.NoError(t, err) + require.NoError(t, h.Init(0)) + + var expOOOSamples []sample + l := labels.FromStrings("foo", "bar") + appendSample := func(mins int64, isOOO bool) { + app := h.Appender(context.Background()) + ts, v := mins*time.Minute.Milliseconds(), float64(mins) + _, err := app.Append(0, l, ts, v) + require.NoError(t, err) + require.NoError(t, app.Commit()) + + if isOOO { + expOOOSamples = append(expOOOSamples, sample{t: ts, v: v}) + } + } + + // In-order sample. + appendSample(60, false) + + // Out of order samples. + appendSample(40, true) + appendSample(35, true) + appendSample(50, true) + appendSample(55, true) + appendSample(59, true) + appendSample(31, true) + + // Check that Head's time ranges are set properly. + require.Equal(t, 60*time.Minute.Milliseconds(), h.MinTime()) + require.Equal(t, 60*time.Minute.Milliseconds(), h.MaxTime()) + require.Equal(t, 31*time.Minute.Milliseconds(), h.MinOOOTime()) + require.Equal(t, 59*time.Minute.Milliseconds(), h.MaxOOOTime()) + + // Restart head. + require.NoError(t, h.Close()) + wlog, err = wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true) + require.NoError(t, err) + oooWlog, err = wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true) + require.NoError(t, err) + h, err = NewHead(nil, nil, wlog, oooWlog, opts, nil) + require.NoError(t, err) + require.NoError(t, h.Init(0)) // Replay happens here. + + // Get the ooo samples from the Head. 
+ ms, ok, err := h.getOrCreate(l.Hash(), l) + require.NoError(t, err) + require.False(t, ok) + require.NotNil(t, ms) + + xor, err := ms.oooHeadChunk.chunk.ToXOR() + require.NoError(t, err) + + it := xor.Iterator(nil) + actOOOSamples := make([]sample, 0, len(expOOOSamples)) + for it.Next() { + ts, v := it.At() + actOOOSamples = append(actOOOSamples, sample{t: ts, v: v}) + } + + // OOO chunk will be sorted. Hence sort the expected samples. + sort.Slice(expOOOSamples, func(i, j int) bool { + return expOOOSamples[i].t < expOOOSamples[j].t + }) + + require.Equal(t, expOOOSamples, actOOOSamples) + + require.NoError(t, h.Close()) +} + +// TestOOOMmapReplay checks the replay at a low level. +func TestOOOMmapReplay(t *testing.T) { + dir := t.TempDir() + wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true) + require.NoError(t, err) + oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true) + require.NoError(t, err) + + opts := DefaultHeadOptions() + opts.ChunkRange = 1000 + opts.ChunkDirRoot = dir + opts.OutOfOrderCapMax.Store(30) + opts.OutOfOrderTimeWindow.Store(1000 * time.Minute.Milliseconds()) + + h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil) + require.NoError(t, err) + require.NoError(t, h.Init(0)) + + l := labels.FromStrings("foo", "bar") + appendSample := func(mins int64) { + app := h.Appender(context.Background()) + ts, v := mins*time.Minute.Milliseconds(), float64(mins) + _, err := app.Append(0, l, ts, v) + require.NoError(t, err) + require.NoError(t, app.Commit()) + } + + // In-order sample. + appendSample(200) + + // Out of order samples. 92 samples to create 3 m-map chunks. + for mins := int64(100); mins <= 191; mins++ { + appendSample(mins) + } + + ms, ok, err := h.getOrCreate(l.Hash(), l) + require.NoError(t, err) + require.False(t, ok) + require.NotNil(t, ms) + + require.Len(t, ms.oooMmappedChunks, 3) + // Verify that we can access the chunks without error. 
+ for _, m := range ms.oooMmappedChunks { + chk, err := h.chunkDiskMapper.Chunk(m.ref) + require.NoError(t, err) + require.Equal(t, int(m.numSamples), chk.NumSamples()) + } + + expMmapChunks := make([]*mmappedChunk, 3) + copy(expMmapChunks, ms.oooMmappedChunks) + + // Restart head. + require.NoError(t, h.Close()) + + wlog, err = wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true) + require.NoError(t, err) + oooWlog, err = wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true) + require.NoError(t, err) + h, err = NewHead(nil, nil, wlog, oooWlog, opts, nil) + require.NoError(t, err) + require.NoError(t, h.Init(0)) // Replay happens here. + + // Get the mmap chunks from the Head. + ms, ok, err = h.getOrCreate(l.Hash(), l) + require.NoError(t, err) + require.False(t, ok) + require.NotNil(t, ms) + + require.Len(t, ms.oooMmappedChunks, len(expMmapChunks)) + // Verify that we can access the chunks without error. + for _, m := range ms.oooMmappedChunks { + chk, err := h.chunkDiskMapper.Chunk(m.ref) + require.NoError(t, err) + require.Equal(t, int(m.numSamples), chk.NumSamples()) + } + + actMmapChunks := make([]*mmappedChunk, len(expMmapChunks)) + copy(actMmapChunks, ms.oooMmappedChunks) + + require.Equal(t, expMmapChunks, actMmapChunks) + + require.NoError(t, h.Close()) +} + +func TestHeadInit_DiscardChunksWithUnsupportedEncoding(t *testing.T) { + h, _ := newTestHead(t, 1000, false, false) + defer func() { + require.NoError(t, h.Close()) + }() + + require.NoError(t, h.Init(0)) + + ctx := context.Background() + app := h.Appender(ctx) + seriesLabels := labels.FromStrings("a", "1") + var seriesRef storage.SeriesRef + var err error + for i := 0; i < 400; i++ { + seriesRef, err = app.Append(0, seriesLabels, int64(i), float64(i)) + require.NoError(t, err) + } + + require.NoError(t, app.Commit()) + require.Greater(t, prom_testutil.ToFloat64(h.metrics.chunksCreated), 1.0) + + uc := newUnsupportedChunk() + // Make this chunk not overlap with the previous 
and the next + h.chunkDiskMapper.WriteChunk(chunks.HeadSeriesRef(seriesRef), 500, 600, uc, func(err error) { require.NoError(t, err) }) + + app = h.Appender(ctx) + for i := 700; i < 1200; i++ { + _, err := app.Append(0, seriesLabels, int64(i), float64(i)) + require.NoError(t, err) + } + + require.NoError(t, app.Commit()) + require.Greater(t, prom_testutil.ToFloat64(h.metrics.chunksCreated), 4.0) + + series, created, err := h.getOrCreate(seriesLabels.Hash(), seriesLabels) + require.NoError(t, err) + require.False(t, created, "should already exist") + require.NotNil(t, series, "should return the series we created above") + + expChunks := make([]*mmappedChunk, len(series.mmappedChunks)) + copy(expChunks, series.mmappedChunks) + + require.NoError(t, h.Close()) + + wlog, err := wal.NewSize(nil, nil, filepath.Join(h.opts.ChunkDirRoot, "wal"), 32768, false) + require.NoError(t, err) + h, err = NewHead(nil, nil, wlog, nil, h.opts, nil) + require.NoError(t, err) + require.NoError(t, h.Init(0)) + + series, created, err = h.getOrCreate(seriesLabels.Hash(), seriesLabels) + require.NoError(t, err) + require.False(t, created, "should already exist") + require.NotNil(t, series, "should return the series we created above") + + require.Equal(t, expChunks, series.mmappedChunks) +} + +const ( + UnsupportedMask = 0b10000000 + EncUnsupportedXOR = chunkenc.EncXOR | UnsupportedMask +) + +// unsupportedChunk holds a XORChunk and overrides the Encoding() method. +type unsupportedChunk struct { + *chunkenc.XORChunk +} + +func newUnsupportedChunk() *unsupportedChunk { + return &unsupportedChunk{chunkenc.NewXORChunk()} +} + +func (c *unsupportedChunk) Encoding() chunkenc.Encoding { + return EncUnsupportedXOR +} + // Tests https://github.com/prometheus/prometheus/issues/10277. 
func TestMmapPanicAfterMmapReplayCorruption(t *testing.T) { dir := t.TempDir() @@ -3171,7 +3423,7 @@ func TestMmapPanicAfterMmapReplayCorruption(t *testing.T) { opts.EnableExemplarStorage = true opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars) - h, err := NewHead(nil, nil, wlog, opts, nil) + h, err := NewHead(nil, nil, wlog, nil, opts, nil) require.NoError(t, err) require.NoError(t, h.Init(0)) @@ -3205,7 +3457,7 @@ func TestMmapPanicAfterMmapReplayCorruption(t *testing.T) { require.NoError(t, err) require.NoError(t, f.Close()) - h, err = NewHead(nil, nil, wlog, opts, nil) + h, err = NewHead(nil, nil, wlog, nil, opts, nil) require.NoError(t, err) require.NoError(t, h.Init(0)) @@ -3230,7 +3482,7 @@ func TestReplayAfterMmapReplayError(t *testing.T) { opts.EnableMemorySnapshotOnShutdown = true opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars) - h, err = NewHead(nil, nil, wlog, opts, nil) + h, err = NewHead(nil, nil, wlog, nil, opts, nil) require.NoError(t, err) require.NoError(t, h.Init(0)) } @@ -3292,3 +3544,131 @@ func TestReplayAfterMmapReplayError(t *testing.T) { require.NoError(t, h.Close()) } + +func TestOOOAppendWithNoSeries(t *testing.T) { + dir := t.TempDir() + wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true) + require.NoError(t, err) + oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true) + require.NoError(t, err) + + opts := DefaultHeadOptions() + opts.ChunkDirRoot = dir + opts.OutOfOrderCapMax.Store(30) + opts.OutOfOrderTimeWindow.Store(120 * time.Minute.Milliseconds()) + + h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil) + require.NoError(t, err) + t.Cleanup(func() { + require.NoError(t, h.Close()) + }) + require.NoError(t, h.Init(0)) + + appendSample := func(lbls labels.Labels, ts int64) { + app := h.Appender(context.Background()) + _, err := app.Append(0, lbls, ts*time.Minute.Milliseconds(), float64(ts)) + require.NoError(t, err) + require.NoError(t, 
app.Commit()) + } + + verifyOOOSamples := func(lbls labels.Labels, expSamples int) { + ms, created, err := h.getOrCreate(lbls.Hash(), lbls) + require.NoError(t, err) + require.False(t, created) + require.NotNil(t, ms) + + require.Nil(t, ms.headChunk) + require.NotNil(t, ms.oooHeadChunk) + require.Equal(t, expSamples, ms.oooHeadChunk.chunk.NumSamples()) + } + + verifyInOrderSamples := func(lbls labels.Labels, expSamples int) { + ms, created, err := h.getOrCreate(lbls.Hash(), lbls) + require.NoError(t, err) + require.False(t, created) + require.NotNil(t, ms) + + require.Nil(t, ms.oooHeadChunk) + require.NotNil(t, ms.headChunk) + require.Equal(t, expSamples, ms.headChunk.chunk.NumSamples()) + } + + newLabels := func(idx int) labels.Labels { return labels.FromStrings("foo", fmt.Sprintf("%d", idx)) } + + s1 := newLabels(1) + appendSample(s1, 300) // At 300m. + verifyInOrderSamples(s1, 1) + + // At 239m, the sample cannot be appended to in-order chunk since it is + // beyond the minValidTime. So it should go in OOO chunk. + // Series does not exist for s2 yet. + s2 := newLabels(2) + appendSample(s2, 239) // OOO sample. + verifyOOOSamples(s2, 1) + + // Similar for 180m. + s3 := newLabels(3) + appendSample(s3, 180) // OOO sample. + verifyOOOSamples(s3, 1) + + // Now 179m is too old. + s4 := newLabels(4) + app := h.Appender(context.Background()) + _, err = app.Append(0, s4, 179*time.Minute.Milliseconds(), float64(179)) + require.Equal(t, storage.ErrTooOldSample, err) + require.NoError(t, app.Rollback()) + verifyOOOSamples(s3, 1) + + // Samples still go into in-order chunk for samples within + // appendable minValidTime. 
+ s5 := newLabels(5) + appendSample(s5, 240) + verifyInOrderSamples(s5, 1) +} + +func TestHeadMinOOOTimeUpdate(t *testing.T) { + dir := t.TempDir() + wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true) + require.NoError(t, err) + oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true) + require.NoError(t, err) + + opts := DefaultHeadOptions() + opts.ChunkDirRoot = dir + opts.OutOfOrderTimeWindow.Store(10 * time.Minute.Milliseconds()) + + h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil) + require.NoError(t, err) + t.Cleanup(func() { + require.NoError(t, h.Close()) + }) + require.NoError(t, h.Init(0)) + + appendSample := func(ts int64) { + lbls := labels.FromStrings("foo", "bar") + app := h.Appender(context.Background()) + _, err := app.Append(0, lbls, ts*time.Minute.Milliseconds(), float64(ts)) + require.NoError(t, err) + require.NoError(t, app.Commit()) + } + + appendSample(300) // In-order sample. + + require.Equal(t, int64(math.MaxInt64), h.MinOOOTime()) + + appendSample(295) // OOO sample. + require.Equal(t, 295*time.Minute.Milliseconds(), h.MinOOOTime()) + + // Allowed window for OOO is >=290, which is before the earliest ooo sample 295, so it gets set to the lower value. + require.NoError(t, h.truncateOOO(0, 1)) + require.Equal(t, 290*time.Minute.Milliseconds(), h.MinOOOTime()) + + appendSample(310) // In-order sample. + appendSample(305) // OOO sample. + require.Equal(t, 290*time.Minute.Milliseconds(), h.MinOOOTime()) + + // Now the OOO sample 295 was not gc'ed yet. And allowed window for OOO is now >=300. + // So the lowest among them, 295, is set as minOOOTime. 
+ require.NoError(t, h.truncateOOO(0, 2)) + require.Equal(t, 295*time.Minute.Milliseconds(), h.MinOOOTime()) +} diff --git a/tsdb/head_wal.go b/tsdb/head_wal.go index 6ea58bcd4e..8bbe33cc48 100644 --- a/tsdb/head_wal.go +++ b/tsdb/head_wal.go @@ -42,7 +42,7 @@ import ( "github.com/prometheus/prometheus/tsdb/wal" ) -func (h *Head) loadWAL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, mmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (err error) { +func (h *Head) loadWAL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, mmappedChunks, oooMmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (err error) { // Track number of samples that referenced a series we don't know about // for error reporting. var unknownRefs atomic.Uint64 @@ -107,7 +107,7 @@ func (h *Head) loadWAL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.H processors[i].setup() go func(wp *walSubsetProcessor) { - unknown, overlapping := wp.processWALSamples(h, mmappedChunks) + unknown, overlapping := wp.processWALSamples(h, mmappedChunks, oooMmappedChunks) unknownRefs.Add(unknown) mmapOverlappingChunks.Add(overlapping) wg.Done() @@ -343,7 +343,7 @@ Outer: } // resetSeriesWithMMappedChunks is only used during the WAL replay. -func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc []*mmappedChunk, walSeriesRef chunks.HeadSeriesRef) (overlapped bool) { +func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc, oooMmc []*mmappedChunk, walSeriesRef chunks.HeadSeriesRef) (overlapped bool) { if mSeries.ref != walSeriesRef { // Checking if the new m-mapped chunks overlap with the already existing ones. 
if len(mSeries.mmappedChunks) > 0 && len(mmc) > 0 { @@ -368,10 +368,11 @@ func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc []*mmappedCh } } - h.metrics.chunksCreated.Add(float64(len(mmc))) + h.metrics.chunksCreated.Add(float64(len(mmc) + len(oooMmc))) h.metrics.chunksRemoved.Add(float64(len(mSeries.mmappedChunks))) - h.metrics.chunks.Add(float64(len(mmc) - len(mSeries.mmappedChunks))) + h.metrics.chunks.Add(float64(len(mmc) + len(oooMmc) - len(mSeries.mmappedChunks))) mSeries.mmappedChunks = mmc + mSeries.oooMmappedChunks = oooMmc // Cache the last mmapped chunk time, so we can skip calling append() for samples it will reject. if len(mmc) == 0 { mSeries.mmMaxTime = math.MinInt64 @@ -379,6 +380,19 @@ func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc []*mmappedCh mSeries.mmMaxTime = mmc[len(mmc)-1].maxTime h.updateMinMaxTime(mmc[0].minTime, mSeries.mmMaxTime) } + if len(oooMmc) != 0 { + // Mint and maxt can be in any chunk, they are not sorted. + mint, maxt := int64(math.MaxInt64), int64(math.MinInt64) + for _, ch := range oooMmc { + if ch.minTime < mint { + mint = ch.minTime + } + if ch.maxTime > maxt { + maxt = ch.maxTime + } + } + h.updateMinOOOMaxOOOTime(mint, maxt) + } // Any samples replayed till now would already be compacted. Resetting the head chunk. mSeries.nextAt = 0 @@ -421,7 +435,7 @@ func (wp *walSubsetProcessor) reuseBuf() []record.RefSample { // processWALSamples adds the samples it receives to the head and passes // the buffer received to an output channel for reuse. 
-func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (unknownRefs, mmapOverlappingChunks uint64) { +func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks, oooMmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (unknownRefs, mmapOverlappingChunks uint64) { defer close(wp.output) mint, maxt := int64(math.MaxInt64), int64(math.MinInt64) @@ -429,7 +443,8 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks map[chunk for in := range wp.input { if in.existingSeries != nil { mmc := mmappedChunks[in.walSeriesRef] - if h.resetSeriesWithMMappedChunks(in.existingSeries, mmc, in.walSeriesRef) { + oooMmc := oooMmappedChunks[in.walSeriesRef] + if h.resetSeriesWithMMappedChunks(in.existingSeries, mmc, oooMmc, in.walSeriesRef) { mmapOverlappingChunks++ } continue @@ -465,6 +480,292 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks map[chunk return unknownRefs, mmapOverlappingChunks } +func (h *Head) loadWBL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, lastMmapRef chunks.ChunkDiskMapperRef) (err error) { + // Track number of samples, m-map markers, that referenced a series we don't know about + // for error reporting. + var unknownRefs, mmapMarkerUnknownRefs atomic.Uint64 + + lastSeq, lastOff := lastMmapRef.Unpack() + // Start workers that each process samples for a partition of the series ID space. + var ( + wg sync.WaitGroup + n = runtime.GOMAXPROCS(0) + processors = make([]wblSubsetProcessor, n) + + dec record.Decoder + shards = make([][]record.RefSample, n) + + decodedCh = make(chan interface{}, 10) + decodeErr error + samplesPool = sync.Pool{ + New: func() interface{} { + return []record.RefSample{} + }, + } + markersPool = sync.Pool{ + New: func() interface{} { + return []record.RefMmapMarker{} + }, + } + ) + + defer func() { + // For CorruptionErr ensure to terminate all workers before exiting. 
+ // We also wrap it to identify OOO WBL corruption. + _, ok := err.(*wal.CorruptionErr) + if ok { + err = &errLoadWbl{err: err} + for i := 0; i < n; i++ { + processors[i].closeAndDrain() + } + wg.Wait() + } + }() + + wg.Add(n) + for i := 0; i < n; i++ { + processors[i].setup() + + go func(wp *wblSubsetProcessor) { + unknown := wp.processWBLSamples(h) + unknownRefs.Add(unknown) + wg.Done() + }(&processors[i]) + } + + go func() { + defer close(decodedCh) + for r.Next() { + rec := r.Record() + switch dec.Type(rec) { + case record.Samples: + samples := samplesPool.Get().([]record.RefSample)[:0] + samples, err = dec.Samples(rec, samples) + if err != nil { + decodeErr = &wal.CorruptionErr{ + Err: errors.Wrap(err, "decode samples"), + Segment: r.Segment(), + Offset: r.Offset(), + } + return + } + decodedCh <- samples + case record.MmapMarkers: + markers := markersPool.Get().([]record.RefMmapMarker)[:0] + markers, err = dec.MmapMarkers(rec, markers) + if err != nil { + decodeErr = &wal.CorruptionErr{ + Err: errors.Wrap(err, "decode mmap markers"), + Segment: r.Segment(), + Offset: r.Offset(), + } + return + } + decodedCh <- markers + default: + // Noop. + } + } + }() + + // The records are always replayed from the oldest to the newest. + for d := range decodedCh { + switch v := d.(type) { + case []record.RefSample: + samples := v + // We split up the samples into parts of 5000 samples or less. + // With O(300 * #cores) in-flight sample batches, large scrapes could otherwise + // cause thousands of very large in flight buffers occupying large amounts + // of unused memory. 
+			for len(samples) > 0 {
+				m := 5000
+				if len(samples) < m {
+					m = len(samples)
+				}
+				for i := 0; i < n; i++ {
+					shards[i] = processors[i].reuseBuf()
+				}
+				for _, sam := range samples[:m] {
+					if r, ok := multiRef[sam.Ref]; ok {
+						sam.Ref = r
+					}
+					mod := uint64(sam.Ref) % uint64(n)
+					shards[mod] = append(shards[mod], sam)
+				}
+				for i := 0; i < n; i++ {
+					processors[i].input <- shards[i]
+				}
+				samples = samples[m:]
+			}
+			//nolint:staticcheck // Ignore SA6002 relax staticcheck verification.
+			samplesPool.Put(d)
+		case []record.RefMmapMarker:
+			markers := v
+			for _, rm := range markers {
+				seq, off := rm.MmapRef.Unpack()
+				if seq > lastSeq || (seq == lastSeq && off > lastOff) {
+					// This m-map chunk from markers was not present during
+					// the load of mmapped chunks that happened in the head
+					// initialization.
+					continue
+				}
+
+				if r, ok := multiRef[rm.Ref]; ok {
+					rm.Ref = r
+				}
+
+				ms := h.series.getByID(rm.Ref)
+				if ms == nil {
+					mmapMarkerUnknownRefs.Inc()
+					continue
+				}
+				idx := uint64(ms.ref) % uint64(n)
+				// It is possible that some old sample is being processed in processWALSamples that
+				// could cause a race below. So we wait for the goroutine to empty the input buffer and finish
+				// processing all old samples after emptying the buffer.
+				processors[idx].waitUntilIdle()
+				// Lock the subset so we can modify the series object
+				processors[idx].mx.Lock()
+
+				// All samples till now have been m-mapped. Hence clear out the headChunk.
+				// In case some samples slipped through and went into m-map chunks because of changed
+				// chunk size parameters, we are not taking care of that here.
+				// TODO(codesome): see if there is a way to avoid duplicate m-map chunks if
+				// the size of ooo chunk was reduced between restarts.
+				ms.oooHeadChunk = nil
+
+				processors[idx].mx.Unlock()
+			}
+		default:
+			panic(fmt.Errorf("unexpected decodedCh type: %T", d))
+		}
+	}
+
+	if decodeErr != nil {
+		return decodeErr
+	}
+
+	// Signal termination to each worker and wait for it to close its output channel.
+	for i := 0; i < n; i++ {
+		processors[i].closeAndDrain()
+	}
+	wg.Wait()
+
+	if r.Err() != nil {
+		return errors.Wrap(r.Err(), "read records")
+	}
+
+	if unknownRefs.Load() > 0 || mmapMarkerUnknownRefs.Load() > 0 {
+		level.Warn(h.logger).Log("msg", "Unknown series references for ooo WAL replay", "samples", unknownRefs.Load(), "mmap_markers", mmapMarkerUnknownRefs.Load())
+	}
+	return nil
+}
+
+type errLoadWbl struct {
+	err error
+}
+
+func (e errLoadWbl) Error() string {
+	return e.err.Error()
+}
+
+// To support errors.Cause().
+func (e errLoadWbl) Cause() error {
+	return e.err
+}
+
+// To support errors.Unwrap().
+func (e errLoadWbl) Unwrap() error {
+	return e.err
+}
+
+// isErrLoadOOOWal reports whether the error is an errLoadWbl.
+func isErrLoadOOOWal(err error) bool {
+	_, ok := err.(*errLoadWbl)
+	return ok
+}
+
+type wblSubsetProcessor struct {
+	mx     sync.Mutex // Take this lock while modifying series in the subset.
+	input  chan []record.RefSample
+	output chan []record.RefSample
+}
+
+func (wp *wblSubsetProcessor) setup() {
+	wp.output = make(chan []record.RefSample, 300)
+	wp.input = make(chan []record.RefSample, 300)
+}
+
+func (wp *wblSubsetProcessor) closeAndDrain() {
+	close(wp.input)
+	for range wp.output {
+	}
+}
+
+// If there is a buffer in the output chan, return it for reuse, otherwise return nil.
+func (wp *wblSubsetProcessor) reuseBuf() []record.RefSample {
+	select {
+	case buf := <-wp.output:
+		return buf[:0]
+	default:
+	}
+	return nil
+}
+
+// processWBLSamples adds the samples it receives to the head and passes
+// the buffer received to an output channel for reuse.
+// Unlike processWALSamples, out-of-order samples are not discarded based on minValidTime.
+func (wp *wblSubsetProcessor) processWBLSamples(h *Head) (unknownRefs uint64) { + defer close(wp.output) + + // We don't check for minValidTime for ooo samples. + mint, maxt := int64(math.MaxInt64), int64(math.MinInt64) + for samples := range wp.input { + wp.mx.Lock() + for _, s := range samples { + ms := h.series.getByID(s.Ref) + if ms == nil { + unknownRefs++ + continue + } + ok, chunkCreated, _ := ms.insert(s.T, s.V, h.chunkDiskMapper) + if chunkCreated { + h.metrics.chunksCreated.Inc() + h.metrics.chunks.Inc() + } + if ok { + if s.T < mint { + mint = s.T + } + if s.T > maxt { + maxt = s.T + } + } + } + wp.mx.Unlock() + wp.output <- samples + } + + h.updateMinOOOMaxOOOTime(mint, maxt) + + return unknownRefs +} + +func (wp *wblSubsetProcessor) waitUntilIdle() { + select { + case <-wp.output: // Allow output side to drain to avoid deadlock. + default: + } + wp.input <- []record.RefSample{} + for len(wp.input) != 0 { + time.Sleep(10 * time.Microsecond) + select { + case <-wp.output: // Allow output side to drain to avoid deadlock. + default: + } + } +} + const ( chunkSnapshotRecordTypeSeries uint8 = 1 chunkSnapshotRecordTypeTombstones uint8 = 2 diff --git a/tsdb/ooo_head.go b/tsdb/ooo_head.go new file mode 100644 index 0000000000..3af6039912 --- /dev/null +++ b/tsdb/ooo_head.go @@ -0,0 +1,159 @@ +// Copyright 2022 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tsdb + +import ( + "fmt" + "sort" + + "github.com/prometheus/prometheus/tsdb/chunkenc" + "github.com/prometheus/prometheus/tsdb/tombstones" +) + +// OOOChunk maintains samples in time-ascending order. +// Inserts for timestamps already seen, are dropped. +// Samples are stored uncompressed to allow easy sorting. +// Perhaps we can be more efficient later. +type OOOChunk struct { + samples []sample +} + +func NewOOOChunk() *OOOChunk { + return &OOOChunk{samples: make([]sample, 0, 4)} +} + +// Insert inserts the sample such that order is maintained. +// Returns false if insert was not possible due to the same timestamp already existing. +func (o *OOOChunk) Insert(t int64, v float64) bool { + // Find index of sample we should replace. + i := sort.Search(len(o.samples), func(i int) bool { return o.samples[i].t >= t }) + + if i >= len(o.samples) { + // none found. append it at the end + o.samples = append(o.samples, sample{t, v}) + return true + } + + if o.samples[i].t == t { + return false + } + + // Expand length by 1 to make room. use a zero sample, we will overwrite it anyway. 
+ o.samples = append(o.samples, sample{}) + copy(o.samples[i+1:], o.samples[i:]) + o.samples[i] = sample{t, v} + + return true +} + +func (o *OOOChunk) NumSamples() int { + return len(o.samples) +} + +func (o *OOOChunk) ToXOR() (*chunkenc.XORChunk, error) { + x := chunkenc.NewXORChunk() + app, err := x.Appender() + if err != nil { + return nil, err + } + for _, s := range o.samples { + app.Append(s.t, s.v) + } + return x, nil +} + +func (o *OOOChunk) ToXORBetweenTimestamps(mint, maxt int64) (*chunkenc.XORChunk, error) { + x := chunkenc.NewXORChunk() + app, err := x.Appender() + if err != nil { + return nil, err + } + for _, s := range o.samples { + if s.t < mint { + continue + } + if s.t > maxt { + break + } + app.Append(s.t, s.v) + } + return x, nil +} + +var _ BlockReader = &OOORangeHead{} + +// OOORangeHead allows querying Head out of order samples via BlockReader +// interface implementation. +type OOORangeHead struct { + head *Head + // mint and maxt are tracked because when a query is handled we only want + // the timerange of the query and having preexisting pointers to the first + // and last timestamp help with that. + mint, maxt int64 +} + +func NewOOORangeHead(head *Head, mint, maxt int64) *OOORangeHead { + return &OOORangeHead{ + head: head, + mint: mint, + maxt: maxt, + } +} + +func (oh *OOORangeHead) Index() (IndexReader, error) { + return NewOOOHeadIndexReader(oh.head, oh.mint, oh.maxt), nil +} + +func (oh *OOORangeHead) Chunks() (ChunkReader, error) { + return NewOOOHeadChunkReader(oh.head, oh.mint, oh.maxt), nil +} + +func (oh *OOORangeHead) Tombstones() (tombstones.Reader, error) { + // As stated in the design doc https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing + // Tombstones are not supported for out of order metrics. 
+ return tombstones.NewMemTombstones(), nil +} + +func (oh *OOORangeHead) Meta() BlockMeta { + var id [16]byte + copy(id[:], "____ooo_head____") + return BlockMeta{ + MinTime: oh.mint, + MaxTime: oh.maxt, + ULID: id, + Stats: BlockStats{ + NumSeries: oh.head.NumSeries(), + }, + } +} + +// Size returns the size taken by the Head block. +func (oh *OOORangeHead) Size() int64 { + return oh.head.Size() +} + +// String returns an human readable representation of the out of order range +// head. It's important to keep this function in order to avoid the struct dump +// when the head is stringified in errors or logs. +func (oh *OOORangeHead) String() string { + return fmt.Sprintf("ooo range head (mint: %d, maxt: %d)", oh.MinTime(), oh.MaxTime()) +} + +func (oh *OOORangeHead) MinTime() int64 { + return oh.mint +} + +func (oh *OOORangeHead) MaxTime() int64 { + return oh.maxt +} diff --git a/tsdb/ooo_head_read.go b/tsdb/ooo_head_read.go new file mode 100644 index 0000000000..f63607dc9c --- /dev/null +++ b/tsdb/ooo_head_read.go @@ -0,0 +1,433 @@ +// Copyright 2022 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tsdb + +import ( + "errors" + "math" + "sort" + + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/tsdb/chunkenc" + "github.com/prometheus/prometheus/tsdb/chunks" + "github.com/prometheus/prometheus/tsdb/index" + "github.com/prometheus/prometheus/tsdb/tombstones" +) + +var _ IndexReader = &OOOHeadIndexReader{} + +// OOOHeadIndexReader implements IndexReader so ooo samples in the head can be +// accessed. +// It also has a reference to headIndexReader so we can leverage on its +// IndexReader implementation for all the methods that remain the same. We +// decided to do this to avoid code duplication. +// The only methods that change are the ones about getting Series and Postings. +type OOOHeadIndexReader struct { + *headIndexReader // A reference to the headIndexReader so we can reuse as many interface implementation as possible. +} + +func NewOOOHeadIndexReader(head *Head, mint, maxt int64) *OOOHeadIndexReader { + hr := &headIndexReader{ + head: head, + mint: mint, + maxt: maxt, + } + return &OOOHeadIndexReader{hr} +} + +func (oh *OOOHeadIndexReader) Series(ref storage.SeriesRef, lbls *labels.Labels, chks *[]chunks.Meta) error { + return oh.series(ref, lbls, chks, 0) +} + +// The passed lastMmapRef tells upto what max m-map chunk that we can consider. +// If it is 0, it means all chunks need to be considered. +// If it is non-0, then the oooHeadChunk must not be considered. +func (oh *OOOHeadIndexReader) series(ref storage.SeriesRef, lbls *labels.Labels, chks *[]chunks.Meta, lastMmapRef chunks.ChunkDiskMapperRef) error { + s := oh.head.series.getByID(chunks.HeadSeriesRef(ref)) + + if s == nil { + oh.head.metrics.seriesNotFound.Inc() + return storage.ErrNotFound + } + *lbls = append((*lbls)[:0], s.lset...) 
+ + if chks == nil { + return nil + } + + s.Lock() + defer s.Unlock() + *chks = (*chks)[:0] + + tmpChks := make([]chunks.Meta, 0, len(s.oooMmappedChunks)) + + // We define these markers to track the last chunk reference while we + // fill the chunk meta. + // These markers are useful to give consistent responses to repeated queries + // even if new chunks that might be overlapping or not are added afterwards. + // Also, lastMinT and lastMaxT are initialized to the max int as a sentinel + // value to know they are unset. + var lastChunkRef chunks.ChunkRef + lastMinT, lastMaxT := int64(math.MaxInt64), int64(math.MaxInt64) + + addChunk := func(minT, maxT int64, ref chunks.ChunkRef) { + // the first time we get called is for the last included chunk. + // set the markers accordingly + if lastMinT == int64(math.MaxInt64) { + lastChunkRef = ref + lastMinT = minT + lastMaxT = maxT + } + + tmpChks = append(tmpChks, chunks.Meta{ + MinTime: minT, + MaxTime: maxT, + Ref: ref, + OOOLastRef: lastChunkRef, + OOOLastMinTime: lastMinT, + OOOLastMaxTime: lastMaxT, + }) + } + + // Collect all chunks that overlap the query range, in order from most recent to most old, + // so we can set the correct markers. 
+ if s.oooHeadChunk != nil { + c := s.oooHeadChunk + if c.OverlapsClosedInterval(oh.mint, oh.maxt) && lastMmapRef == 0 { + ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.oooMmappedChunks)))) + addChunk(c.minTime, c.maxTime, ref) + } + } + for i := len(s.oooMmappedChunks) - 1; i >= 0; i-- { + c := s.oooMmappedChunks[i] + if c.OverlapsClosedInterval(oh.mint, oh.maxt) && (lastMmapRef == 0 || lastMmapRef.GreaterThanOrEqualTo(c.ref)) { + ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(i))) + addChunk(c.minTime, c.maxTime, ref) + } + } + + // There is nothing to do if we did not collect any chunk + if len(tmpChks) == 0 { + return nil + } + + // Next we want to sort all the collected chunks by min time so we can find + // those that overlap. + sort.Sort(metaByMinTimeAndMinRef(tmpChks)) + + // Next we want to iterate the sorted collected chunks and only return the + // chunks Meta the first chunk that overlaps with others. + // Example chunks of a series: 5:(100, 200) 6:(500, 600) 7:(150, 250) 8:(550, 650) + // In the example 5 overlaps with 7 and 6 overlaps with 8 so we only want to + // to return chunk Metas for chunk 5 and chunk 6 + *chks = append(*chks, tmpChks[0]) + maxTime := tmpChks[0].MaxTime // tracks the maxTime of the previous "to be merged chunk" + for _, c := range tmpChks[1:] { + if c.MinTime > maxTime { + *chks = append(*chks, c) + maxTime = c.MaxTime + } else if c.MaxTime > maxTime { + maxTime = c.MaxTime + (*chks)[len(*chks)-1].MaxTime = c.MaxTime + } + } + + return nil +} + +// LabelValues needs to be overridden from the headIndexReader implementation due +// to the check that happens at the beginning where we make sure that the query +// interval overlaps with the head minooot and maxooot. 
+func (oh *OOOHeadIndexReader) LabelValues(name string, matchers ...*labels.Matcher) ([]string, error) { + if oh.maxt < oh.head.MinOOOTime() || oh.mint > oh.head.MaxOOOTime() { + return []string{}, nil + } + + if len(matchers) == 0 { + return oh.head.postings.LabelValues(name), nil + } + + return labelValuesWithMatchers(oh, name, matchers...) +} + +type chunkMetaAndChunkDiskMapperRef struct { + meta chunks.Meta + ref chunks.ChunkDiskMapperRef + origMinT int64 + origMaxT int64 +} + +type byMinTimeAndMinRef []chunkMetaAndChunkDiskMapperRef + +func (b byMinTimeAndMinRef) Len() int { return len(b) } +func (b byMinTimeAndMinRef) Less(i, j int) bool { + if b[i].meta.MinTime == b[j].meta.MinTime { + return b[i].meta.Ref < b[j].meta.Ref + } + return b[i].meta.MinTime < b[j].meta.MinTime +} + +func (b byMinTimeAndMinRef) Swap(i, j int) { b[i], b[j] = b[j], b[i] } + +type metaByMinTimeAndMinRef []chunks.Meta + +func (b metaByMinTimeAndMinRef) Len() int { return len(b) } +func (b metaByMinTimeAndMinRef) Less(i, j int) bool { + if b[i].MinTime == b[j].MinTime { + return b[i].Ref < b[j].Ref + } + return b[i].MinTime < b[j].MinTime +} + +func (b metaByMinTimeAndMinRef) Swap(i, j int) { b[i], b[j] = b[j], b[i] } + +func (oh *OOOHeadIndexReader) Postings(name string, values ...string) (index.Postings, error) { + switch len(values) { + case 0: + return index.EmptyPostings(), nil + case 1: + return oh.head.postings.Get(name, values[0]), nil // TODO(ganesh) Also call GetOOOPostings + default: + // TODO(ganesh) We want to only return postings for out of order series. 
+ res := make([]index.Postings, 0, len(values)) + for _, value := range values { + res = append(res, oh.head.postings.Get(name, value)) // TODO(ganesh) Also call GetOOOPostings + } + return index.Merge(res...), nil + } +} + +type OOOHeadChunkReader struct { + head *Head + mint, maxt int64 +} + +func NewOOOHeadChunkReader(head *Head, mint, maxt int64) *OOOHeadChunkReader { + return &OOOHeadChunkReader{ + head: head, + mint: mint, + maxt: maxt, + } +} + +func (cr OOOHeadChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) { + sid, _ := chunks.HeadChunkRef(meta.Ref).Unpack() + + s := cr.head.series.getByID(sid) + // This means that the series has been garbage collected. + if s == nil { + return nil, storage.ErrNotFound + } + + s.Lock() + c, err := s.oooMergedChunk(meta, cr.head.chunkDiskMapper, cr.mint, cr.maxt) + s.Unlock() + if err != nil { + return nil, err + } + + // This means that the query range did not overlap with the requested chunk. + if len(c.chunks) == 0 { + return nil, storage.ErrNotFound + } + + return c, nil +} + +func (cr OOOHeadChunkReader) Close() error { + return nil +} + +type OOOCompactionHead struct { + oooIR *OOOHeadIndexReader + lastMmapRef chunks.ChunkDiskMapperRef + lastWBLFile int + postings []storage.SeriesRef + chunkRange int64 + mint, maxt int64 // Among all the compactable chunks. +} + +// NewOOOCompactionHead does the following: +// 1. M-maps all the in-memory ooo chunks. +// 2. Compute the expected block ranges while iterating through all ooo series and store it. +// 3. Store the list of postings having ooo series. +// 4. Cuts a new WBL file for the OOO WBL. +// All the above together have a bit of CPU and memory overhead, and can have a bit of impact +// on the sample append latency. So call NewOOOCompactionHead only right before compaction. 
+func NewOOOCompactionHead(head *Head) (*OOOCompactionHead, error) { + newWBLFile, err := head.wbl.NextSegmentSync() + if err != nil { + return nil, err + } + + ch := &OOOCompactionHead{ + chunkRange: head.chunkRange.Load(), + mint: math.MaxInt64, + maxt: math.MinInt64, + lastWBLFile: newWBLFile, + } + + ch.oooIR = NewOOOHeadIndexReader(head, math.MinInt64, math.MaxInt64) + n, v := index.AllPostingsKey() + + // TODO: verify this gets only ooo samples. + p, err := ch.oooIR.Postings(n, v) + if err != nil { + return nil, err + } + p = ch.oooIR.SortedPostings(p) + + var lastSeq, lastOff int + for p.Next() { + seriesRef := p.At() + ms := head.series.getByID(chunks.HeadSeriesRef(seriesRef)) + if ms == nil { + continue + } + + // M-map the in-memory chunk and keep track of the last one. + // Also build the block ranges -> series map. + // TODO: consider having a lock specifically for ooo data. + ms.Lock() + + mmapRef := ms.mmapCurrentOOOHeadChunk(head.chunkDiskMapper) + if mmapRef == 0 && len(ms.oooMmappedChunks) > 0 { + // Nothing was m-mapped. So take the mmapRef from the existing slice if it exists. 
+			mmapRef = ms.oooMmappedChunks[len(ms.oooMmappedChunks)-1].ref
+		}
+		seq, off := mmapRef.Unpack()
+		if seq > lastSeq || (seq == lastSeq && off > lastOff) {
+			ch.lastMmapRef, lastSeq, lastOff = mmapRef, seq, off
+		}
+		if len(ms.oooMmappedChunks) > 0 {
+			ch.postings = append(ch.postings, seriesRef)
+			for _, c := range ms.oooMmappedChunks {
+				if c.minTime < ch.mint {
+					ch.mint = c.minTime
+				}
+				if c.maxTime > ch.maxt {
+					ch.maxt = c.maxTime
+				}
+			}
+		}
+		ms.Unlock()
+	}
+
+	return ch, nil
+}
+
+func (ch *OOOCompactionHead) Index() (IndexReader, error) {
+	return NewOOOCompactionHeadIndexReader(ch), nil
+}
+
+func (ch *OOOCompactionHead) Chunks() (ChunkReader, error) {
+	return NewOOOHeadChunkReader(ch.oooIR.head, ch.oooIR.mint, ch.oooIR.maxt), nil
+}
+
+func (ch *OOOCompactionHead) Tombstones() (tombstones.Reader, error) {
+	return tombstones.NewMemTombstones(), nil
+}
+
+func (ch *OOOCompactionHead) Meta() BlockMeta {
+	var id [16]byte
+	copy(id[:], "ooo_compact_head")
+	return BlockMeta{
+		MinTime: ch.mint,
+		MaxTime: ch.maxt,
+		ULID:    id,
+		Stats: BlockStats{
+			NumSeries: uint64(len(ch.postings)),
+		},
+	}
+}
+
+// CloneForTimeRange clones the OOOCompactionHead such that the IndexReader and ChunkReader
+// obtained from this only looks at the m-map chunks within the given time ranges while not looking
+// beyond the ch.lastMmapRef.
+// Only the methods of the BlockReader interface are valid for the cloned OOOCompactionHead.
+func (ch *OOOCompactionHead) CloneForTimeRange(mint, maxt int64) *OOOCompactionHead { + return &OOOCompactionHead{ + oooIR: NewOOOHeadIndexReader(ch.oooIR.head, mint, maxt), + lastMmapRef: ch.lastMmapRef, + postings: ch.postings, + chunkRange: ch.chunkRange, + mint: ch.mint, + maxt: ch.maxt, + } +} + +func (ch *OOOCompactionHead) Size() int64 { return 0 } +func (ch *OOOCompactionHead) MinTime() int64 { return ch.mint } +func (ch *OOOCompactionHead) MaxTime() int64 { return ch.maxt } +func (ch *OOOCompactionHead) ChunkRange() int64 { return ch.chunkRange } +func (ch *OOOCompactionHead) LastMmapRef() chunks.ChunkDiskMapperRef { return ch.lastMmapRef } +func (ch *OOOCompactionHead) LastWBLFile() int { return ch.lastWBLFile } + +type OOOCompactionHeadIndexReader struct { + ch *OOOCompactionHead +} + +func NewOOOCompactionHeadIndexReader(ch *OOOCompactionHead) IndexReader { + return &OOOCompactionHeadIndexReader{ch: ch} +} + +func (ir *OOOCompactionHeadIndexReader) Symbols() index.StringIter { + return ir.ch.oooIR.Symbols() +} + +func (ir *OOOCompactionHeadIndexReader) Postings(name string, values ...string) (index.Postings, error) { + n, v := index.AllPostingsKey() + if name != n || len(values) != 1 || values[0] != v { + return nil, errors.New("only AllPostingsKey is supported") + } + return index.NewListPostings(ir.ch.postings), nil +} + +func (ir *OOOCompactionHeadIndexReader) SortedPostings(p index.Postings) index.Postings { + // This will already be sorted from the Postings() call above. 
+ return p +} + +func (ir *OOOCompactionHeadIndexReader) Series(ref storage.SeriesRef, lset *labels.Labels, chks *[]chunks.Meta) error { + return ir.ch.oooIR.series(ref, lset, chks, ir.ch.lastMmapRef) +} + +func (ir *OOOCompactionHeadIndexReader) SortedLabelValues(name string, matchers ...*labels.Matcher) ([]string, error) { + return nil, errors.New("not implemented") +} + +func (ir *OOOCompactionHeadIndexReader) LabelValues(name string, matchers ...*labels.Matcher) ([]string, error) { + return nil, errors.New("not implemented") +} + +func (ir *OOOCompactionHeadIndexReader) PostingsForMatchers(concurrent bool, ms ...*labels.Matcher) (index.Postings, error) { + return nil, errors.New("not implemented") +} + +func (ir *OOOCompactionHeadIndexReader) LabelNames(matchers ...*labels.Matcher) ([]string, error) { + return nil, errors.New("not implemented") +} + +func (ir *OOOCompactionHeadIndexReader) LabelValueFor(id storage.SeriesRef, label string) (string, error) { + return "", errors.New("not implemented") +} + +func (ir *OOOCompactionHeadIndexReader) LabelNamesFor(ids ...storage.SeriesRef) ([]string, error) { + return nil, errors.New("not implemented") +} + +func (ir *OOOCompactionHeadIndexReader) Close() error { + return ir.ch.oooIR.Close() +} diff --git a/tsdb/ooo_head_read_test.go b/tsdb/ooo_head_read_test.go new file mode 100644 index 0000000000..486ca31f3f --- /dev/null +++ b/tsdb/ooo_head_read_test.go @@ -0,0 +1,1207 @@ +// Copyright 2022 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package tsdb + +import ( + "context" + "fmt" + "math" + "sort" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/tsdb/chunkenc" + "github.com/prometheus/prometheus/tsdb/chunks" + "github.com/prometheus/prometheus/tsdb/tsdbutil" +) + +type chunkInterval struct { + // because we permutate the order of chunks, we cannot determine at test declaration time which chunkRefs we expect in the Output. + // This ID matches expected output chunks against test input chunks, the test runner will assert the chunkRef for the matching chunk + ID int + mint int64 + maxt int64 +} + +// permutateChunkIntervals returns all possible orders of the given chunkIntervals +func permutateChunkIntervals(in []chunkInterval, out [][]chunkInterval, left, right int) [][]chunkInterval { + if left == right { + inCopy := make([]chunkInterval, len(in)) + copy(inCopy, in) + return append(out, inCopy) + } + for i := left; i <= right; i++ { + in[left], in[i] = in[i], in[left] + out = permutateChunkIntervals(in, out, left+1, right) + in[left], in[i] = in[i], in[left] + } + return out +} + +// TestOOOHeadIndexReader_Series tests that the Series method works as expected. +// However it does so by creating chunks and memory mapping them unlike other +// tests of the head where samples are appended and we let the head memory map. +// We do this because the ingestion path and the appender for out of order +// samples are not ready yet. 
// TestOOOHeadIndexReader_Series exercises OOOHeadIndexReader.Series: given a
// set of out-of-order chunks (mmapped chunks plus, optionally, the open OOO
// head chunk) it must return the chunk metas whose time ranges overlap the
// query interval, with transitively overlapping chunks merged into a single
// meta, and with the OOOLast* markers on every returned meta pointing at the
// last input chunk that overlaps the query range.
// Each case is run for every permutation of the input chunk order, and both
// with and without the last interval being served from the OOO head chunk.
func TestOOOHeadIndexReader_Series(t *testing.T) {
	tests := []struct {
		name                string
		queryMinT           int64
		queryMaxT           int64
		inputChunkIntervals []chunkInterval // chunk ID plus [mint, maxt] for each stored OOO chunk
		expChunks           []chunkInterval // expected (possibly merged) output intervals, ID = chunk whose ref is reported
	}{
		{
			name:      "Empty result and no error when head is empty",
			queryMinT: 0,
			queryMaxT: 100,
			expChunks: nil,
		},
		{
			// Chunk 0 [100,400] lies entirely before the query [500,700].
			name:      "If query interval is bigger than the existing chunks nothing is returned",
			queryMinT: 500,
			queryMaxT: 700,
			inputChunkIntervals: []chunkInterval{
				{0, 100, 400},
			},
			expChunks: nil,
		},
		{
			// Chunk 0 [500,700] lies entirely after the query [100,400].
			name:      "If query interval is smaller than the existing chunks nothing is returned",
			queryMinT: 100,
			queryMaxT: 400,
			inputChunkIntervals: []chunkInterval{
				{0, 500, 700},
			},
			expChunks: nil,
		},
		{
			// Query [100,400] fully contains chunk 0 [150,350]; the chunk comes back as-is.
			name:      "If query interval exceeds the existing chunk, it is returned",
			queryMinT: 100,
			queryMaxT: 400,
			inputChunkIntervals: []chunkInterval{
				{0, 150, 350},
			},
			expChunks: []chunkInterval{
				{0, 150, 350},
			},
		},
		{
			// Chunk 0 [100,400] fully contains the query [150,350]; the whole chunk is returned.
			name:      "If chunk exceeds the query interval, it is returned",
			queryMinT: 150,
			queryMaxT: 350,
			inputChunkIntervals: []chunkInterval{
				{0, 100, 400},
			},
			expChunks: []chunkInterval{
				{0, 100, 400},
			},
		},
		{
			// Chunks 0+2 overlap ([100,200]+[150,250]) and chunks 1+3 overlap
			// ([500,600]+[550,650]); each pair merges into one meta carrying the
			// ref of the earlier chunk of the pair.
			name:      "Pairwise overlaps should return the references of the first of each pair",
			queryMinT: 0,
			queryMaxT: 700,
			inputChunkIntervals: []chunkInterval{
				{0, 100, 200},
				{1, 500, 600},
				{2, 150, 250},
				{3, 550, 650},
			},
			expChunks: []chunkInterval{
				{0, 100, 250},
				{1, 500, 650},
			},
		},
		{
			// Chunks 0..3 chain end-to-start ([100,200][200,300][300,400][400,500]);
			// the shared boundary timestamps make them all overlap into one meta.
			name:      "If all chunks overlap, single big chunk is returned",
			queryMinT: 0,
			queryMaxT: 700,
			inputChunkIntervals: []chunkInterval{
				{0, 100, 200},
				{1, 200, 300},
				{2, 300, 400},
				{3, 400, 500},
			},
			expChunks: []chunkInterval{
				{0, 100, 500},
			},
		},
		{
			// Same chain but with a 1ms gap between consecutive chunks, so nothing merges.
			name:      "If no chunks overlap, all chunks are returned",
			queryMinT: 0,
			queryMaxT: 700,
			inputChunkIntervals: []chunkInterval{
				{0, 100, 199},
				{1, 200, 299},
				{2, 300, 399},
				{3, 400, 499},
			},
			expChunks: []chunkInterval{
				{0, 100, 199},
				{1, 200, 299},
				{2, 300, 399},
				{3, 400, 499},
			},
		},
		{
			// Chunks 0,1,2 chain through pairwise overlaps into [100,350]; chunk 3
			// [450,550] is outside the query [0,400] and is dropped entirely.
			name:      "Triplet with pairwise overlaps, query range covers all, and distractor extra chunk",
			queryMinT: 0,
			queryMaxT: 400,
			inputChunkIntervals: []chunkInterval{
				{0, 100, 200},
				{1, 150, 300},
				{2, 250, 350},
				{3, 450, 550},
			},
			expChunks: []chunkInterval{
				{0, 100, 350},
			},
		},
		{
			// All three chunks overlap the query [100,400] at least partially and
			// chain together; the merged meta carries the ref of chunk 1, whose
			// mint (0) is the smallest.
			name:      "Query interval partially overlaps some chunks",
			queryMinT: 100,
			queryMaxT: 400,
			inputChunkIntervals: []chunkInterval{
				{0, 250, 500},
				{1, 0, 200},
				{2, 150, 300},
			},
			expChunks: []chunkInterval{
				{1, 0, 500},
			},
		},
		{
			// Chunks 0+2 fully overlap into [100,300]; chunks 4,3,1 chain into
			// [600,850] keyed by chunk 4 (smallest mint of that group).
			name:      "A full overlap pair and disjointed triplet",
			queryMinT: 0,
			queryMaxT: 900,
			inputChunkIntervals: []chunkInterval{
				{0, 100, 300},
				{1, 770, 850},
				{2, 150, 250},
				{3, 650, 750},
				{4, 600, 800},
			},
			expChunks: []chunkInterval{
				{0, 100, 300},
				{4, 600, 850},
			},
		},
		{
			// Three disjoint chunks inside the query range come back unmerged.
			name:      "Query range covers 3 disjoint chunks",
			queryMinT: 0,
			queryMaxT: 650,
			inputChunkIntervals: []chunkInterval{
				{0, 100, 150},
				{1, 300, 350},
				{2, 200, 250},
			},
			expChunks: []chunkInterval{
				{0, 100, 150},
				{1, 300, 350},
				{2, 200, 250},
			},
		},
	}

	s1Lset := labels.FromStrings("foo", "bar")
	s1ID := uint64(1)

	for _, tc := range tests {
		var permutations [][]chunkInterval
		if len(tc.inputChunkIntervals) == 0 {
			// Handle the special case of no input chunks: a single "permutation"
			// of nil so the body below still runs once.
			permutations = [][]chunkInterval{
				nil,
			}
		} else {
			permutations = permutateChunkIntervals(tc.inputChunkIntervals, nil, 0, len(tc.inputChunkIntervals)-1)
		}
		for perm, intervals := range permutations {
			for _, headChunk := range []bool{false, true} {
				t.Run(fmt.Sprintf("name=%s, permutation=%d, headChunk=%t", tc.name, perm, headChunk), func(t *testing.T) {
					h, _ := newTestHead(t, 1000, false, true)
					defer func() {
						require.NoError(t, h.Close())
					}()
					require.NoError(t, h.Init(0))

					s1, _, _ := h.getOrCreate(s1ID, s1Lset)

					var lastChunk chunkInterval
					var lastChunkPos int

					// The OOOLast* markers should be set based on whichever is the
					// last chunk/interval that overlaps with the query range.
					for i, interv := range intervals {
						if overlapsClosedInterval(interv.mint, interv.maxt, tc.queryMinT, tc.queryMaxT) {
							lastChunk = interv
							lastChunkPos = i
						}
					}
					// NOTE(review): the series ref is hard-coded to 1 here, which only
					// works because s1ID happens to be 1 — confirm if s1ID ever changes.
					lastChunkRef := chunks.ChunkRef(chunks.NewHeadChunkRef(1, chunks.HeadChunkID(uint64(lastChunkPos))))

					// Define our expected chunks, by looking at the expected
					// ChunkIntervals and setting the markers accordingly.
					var expChunks []chunks.Meta
					for _, e := range tc.expChunks {
						meta := chunks.Meta{
							Chunk:   chunkenc.Chunk(nil),
							MinTime: e.mint,
							MaxTime: e.maxt,
							// Markers based on the last overlapping chunk found above.
							OOOLastMinTime: lastChunk.mint,
							OOOLastMaxTime: lastChunk.maxt,
							OOOLastRef:     lastChunkRef,
						}

						// Ref to whatever Ref the chunk has, that we refer to by ID.
						for ref, c := range intervals {
							if c.ID == e.ID {
								meta.Ref = chunks.ChunkRef(chunks.NewHeadChunkRef(chunks.HeadSeriesRef(s1ID), chunks.HeadChunkID(ref)))
								break
							}
						}
						expChunks = append(expChunks, meta)
					}
					sort.Sort(metaByMinTimeAndMinRef(expChunks)) // We always want the chunks to come back sorted by minTime asc.

					if headChunk && len(intervals) > 0 {
						// Put the last interval in the head chunk instead of an mmapped chunk.
						s1.oooHeadChunk = &oooHeadChunk{
							minTime: intervals[len(intervals)-1].mint,
							maxTime: intervals[len(intervals)-1].maxt,
						}
						intervals = intervals[:len(intervals)-1]
					}

					for _, ic := range intervals {
						s1.oooMmappedChunks = append(s1.oooMmappedChunks, &mmappedChunk{
							minTime: ic.mint,
							maxTime: ic.maxt,
						})
					}

					ir := NewOOOHeadIndexReader(h, tc.queryMinT, tc.queryMaxT)

					var chks []chunks.Meta
					var respLset labels.Labels
					err := ir.Series(storage.SeriesRef(s1ID), &respLset, &chks)
					require.NoError(t, err)
					require.Equal(t, s1Lset, respLset)
					require.Equal(t, expChunks, chks)

					// A ref for a series that does not exist must yield ErrNotFound.
					err = ir.Series(storage.SeriesRef(s1ID+1), &respLset, &chks)
					require.Equal(t, storage.ErrNotFound, err)
				})
			}
		}
	}
}
// TestOOOHeadChunkReader_LabelValues checks that LabelValues on the OOO head
// index reader only reports label values for series that have out-of-order
// samples inside the reader's time range, with and without matchers.
// NOTE(review): despite the name, this exercises the index reader
// (NewOOOHeadIndexReader), not the chunk reader — consider renaming.
func TestOOOHeadChunkReader_LabelValues(t *testing.T) {
	chunkRange := int64(2000)
	head, _ := newTestHead(t, chunkRange, false, true)
	t.Cleanup(func() { require.NoError(t, head.Close()) })

	app := head.Appender(context.Background())

	// Add in-order samples at t=100 for both series.
	_, err := app.Append(0, labels.Labels{
		{Name: "foo", Value: "bar1"},
	}, 100, 1)
	require.NoError(t, err)
	_, err = app.Append(0, labels.Labels{
		{Name: "foo", Value: "bar2"},
	}, 100, 2)
	require.NoError(t, err)

	// Add ooo samples at t=90 for those series.
	_, err = app.Append(0, labels.Labels{
		{Name: "foo", Value: "bar1"},
	}, 90, 1)
	require.NoError(t, err)
	_, err = app.Append(0, labels.Labels{
		{Name: "foo", Value: "bar2"},
	}, 90, 2)
	require.NoError(t, err)

	require.NoError(t, app.Commit())

	cases := []struct {
		name       string
		queryMinT  int64
		queryMaxT  int64
		expValues1 []string // equality matcher foo="bar1"
		expValues2 []string // negated regexp matcher foo!~"^bar."
		expValues3 []string // regexp matcher foo=~"bar."
		expValues4 []string // no matchers
	}{
		{
			name:       "LabelValues calls when ooo head has max query range",
			queryMinT:  math.MinInt64,
			queryMaxT:  math.MaxInt64,
			expValues1: []string{"bar1"},
			expValues2: []string{},
			expValues3: []string{"bar1", "bar2"},
			expValues4: []string{"bar1", "bar2"},
		},
		{
			name:       "LabelValues calls with ooo head query range not overlapping in-order data",
			queryMinT:  90,
			queryMaxT:  90,
			expValues1: []string{"bar1"},
			expValues2: []string{},
			expValues3: []string{"bar1", "bar2"},
			expValues4: []string{"bar1", "bar2"},
		},
		{
			// t=100 only has in-order samples, so the OOO reader sees nothing.
			name:       "LabelValues calls with ooo head query range not overlapping out-of-order data",
			queryMinT:  100,
			queryMaxT:  100,
			expValues1: []string{},
			expValues2: []string{},
			expValues3: []string{},
			expValues4: []string{},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// We first want to test using a head index reader that covers the biggest query interval.
			oh := NewOOOHeadIndexReader(head, tc.queryMinT, tc.queryMaxT)
			matchers := []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar1")}
			values, err := oh.LabelValues("foo", matchers...)
			sort.Strings(values)
			require.NoError(t, err)
			require.Equal(t, tc.expValues1, values)

			matchers = []*labels.Matcher{labels.MustNewMatcher(labels.MatchNotRegexp, "foo", "^bar.")}
			values, err = oh.LabelValues("foo", matchers...)
			sort.Strings(values)
			require.NoError(t, err)
			require.Equal(t, tc.expValues2, values)

			matchers = []*labels.Matcher{labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar.")}
			values, err = oh.LabelValues("foo", matchers...)
			sort.Strings(values)
			require.NoError(t, err)
			require.Equal(t, tc.expValues3, values)

			values, err = oh.LabelValues("foo")
			sort.Strings(values)
			require.NoError(t, err)
			require.Equal(t, tc.expValues4, values)
		})
	}
}

// TestOOOHeadChunkReader_Chunk tests that the Chunk method works as expected.
// It does so by appending out of order samples to the db and then initializing
// an OOOHeadChunkReader to read chunks from it.
+func TestOOOHeadChunkReader_Chunk(t *testing.T) { + opts := DefaultOptions() + opts.OutOfOrderCapMax = 5 + opts.OutOfOrderTimeWindow = 120 * time.Minute.Milliseconds() + + s1 := labels.FromStrings("l", "v1") + minutes := func(m int64) int64 { return m * time.Minute.Milliseconds() } + + appendSample := func(app storage.Appender, l labels.Labels, timestamp int64, value float64) storage.SeriesRef { + ref, err := app.Append(0, l, timestamp, value) + require.NoError(t, err) + return ref + } + + t.Run("Getting a non existing chunk fails with not found error", func(t *testing.T) { + db := newTestDBWithOpts(t, opts) + + cr := NewOOOHeadChunkReader(db.head, 0, 1000) + c, err := cr.Chunk(chunks.Meta{ + Ref: 0x1000000, Chunk: chunkenc.Chunk(nil), MinTime: 100, MaxTime: 300, + }) + require.Equal(t, err, fmt.Errorf("not found")) + require.Equal(t, c, nil) + }) + + tests := []struct { + name string + queryMinT int64 + queryMaxT int64 + firstInOrderSampleAt int64 + inputSamples tsdbutil.SampleSlice + expChunkError bool + expChunksSamples []tsdbutil.SampleSlice + }{ + { + name: "Getting the head when there are no overlapping chunks returns just the samples in the head", + queryMinT: minutes(0), + queryMaxT: minutes(100), + firstInOrderSampleAt: minutes(120), + inputSamples: tsdbutil.SampleSlice{ + sample{t: minutes(30), v: float64(0)}, + sample{t: minutes(40), v: float64(0)}, + }, + expChunkError: false, + // ts (in minutes) 0 10 20 30 40 50 60 70 80 90 100 + // Query Interval [------------------------------------------------------------------------------------------] + // Chunk 0: Current Head [--------] (With 2 samples) + // Output Graphically [--------] (With 2 samples) + expChunksSamples: []tsdbutil.SampleSlice{ + { + sample{t: minutes(30), v: float64(0)}, + sample{t: minutes(40), v: float64(0)}, + }, + }, + }, + { + name: "Getting the head chunk when there are overlapping chunks returns all combined", + queryMinT: minutes(0), + queryMaxT: minutes(100), + 
firstInOrderSampleAt: minutes(120), + inputSamples: tsdbutil.SampleSlice{ + // opts.OOOCapMax is 5 so these will be mmapped to the first mmapped chunk + sample{t: minutes(41), v: float64(0)}, + sample{t: minutes(42), v: float64(0)}, + sample{t: minutes(43), v: float64(0)}, + sample{t: minutes(44), v: float64(0)}, + sample{t: minutes(45), v: float64(0)}, + // The following samples will go to the head chunk, and we want it + // to overlap with the previous chunk + sample{t: minutes(30), v: float64(1)}, + sample{t: minutes(50), v: float64(1)}, + }, + expChunkError: false, + // ts (in minutes) 0 10 20 30 40 50 60 70 80 90 100 + // Query Interval [------------------------------------------------------------------------------------------] + // Chunk 0 [---] (With 5 samples) + // Chunk 1: Current Head [-----------------] (With 2 samples) + // Output Graphically [-----------------] (With 7 samples) + expChunksSamples: []tsdbutil.SampleSlice{ + { + sample{t: minutes(30), v: float64(1)}, + sample{t: minutes(41), v: float64(0)}, + sample{t: minutes(42), v: float64(0)}, + sample{t: minutes(43), v: float64(0)}, + sample{t: minutes(44), v: float64(0)}, + sample{t: minutes(45), v: float64(0)}, + sample{t: minutes(50), v: float64(1)}, + }, + }, + }, + { + name: "Two windows of overlapping chunks get properly converged", + queryMinT: minutes(0), + queryMaxT: minutes(100), + firstInOrderSampleAt: minutes(120), + inputSamples: tsdbutil.SampleSlice{ + // Chunk 0 + sample{t: minutes(10), v: float64(0)}, + sample{t: minutes(12), v: float64(0)}, + sample{t: minutes(14), v: float64(0)}, + sample{t: minutes(16), v: float64(0)}, + sample{t: minutes(20), v: float64(0)}, + // Chunk 1 + sample{t: minutes(20), v: float64(1)}, + sample{t: minutes(22), v: float64(1)}, + sample{t: minutes(24), v: float64(1)}, + sample{t: minutes(26), v: float64(1)}, + sample{t: minutes(29), v: float64(1)}, + // Chunk 2 + sample{t: minutes(30), v: float64(2)}, + sample{t: minutes(32), v: float64(2)}, + sample{t: 
minutes(34), v: float64(2)}, + sample{t: minutes(36), v: float64(2)}, + sample{t: minutes(40), v: float64(2)}, + // Head + sample{t: minutes(40), v: float64(3)}, + sample{t: minutes(50), v: float64(3)}, + }, + expChunkError: false, + // ts (in minutes) 0 10 20 30 40 50 60 70 80 90 100 + // Query Interval [------------------------------------------------------------------------------------------] + // Chunk 0 [--------] + // Chunk 1 [-------] + // Chunk 2 [--------] + // Chunk 3: Current Head [--------] + // Output Graphically [----------------][-----------------] + expChunksSamples: []tsdbutil.SampleSlice{ + { + sample{t: minutes(10), v: float64(0)}, + sample{t: minutes(12), v: float64(0)}, + sample{t: minutes(14), v: float64(0)}, + sample{t: minutes(16), v: float64(0)}, + sample{t: minutes(20), v: float64(1)}, + sample{t: minutes(22), v: float64(1)}, + sample{t: minutes(24), v: float64(1)}, + sample{t: minutes(26), v: float64(1)}, + sample{t: minutes(29), v: float64(1)}, + }, + { + sample{t: minutes(30), v: float64(2)}, + sample{t: minutes(32), v: float64(2)}, + sample{t: minutes(34), v: float64(2)}, + sample{t: minutes(36), v: float64(2)}, + sample{t: minutes(40), v: float64(3)}, + sample{t: minutes(50), v: float64(3)}, + }, + }, + }, + { + name: "Two windows of overlapping chunks in descending order get properly converged", + queryMinT: minutes(0), + queryMaxT: minutes(100), + firstInOrderSampleAt: minutes(120), + inputSamples: tsdbutil.SampleSlice{ + // Chunk 0 + sample{t: minutes(40), v: float64(0)}, + sample{t: minutes(42), v: float64(0)}, + sample{t: minutes(44), v: float64(0)}, + sample{t: minutes(46), v: float64(0)}, + sample{t: minutes(50), v: float64(0)}, + // Chunk 1 + sample{t: minutes(30), v: float64(1)}, + sample{t: minutes(32), v: float64(1)}, + sample{t: minutes(34), v: float64(1)}, + sample{t: minutes(36), v: float64(1)}, + sample{t: minutes(40), v: float64(1)}, + // Chunk 2 + sample{t: minutes(20), v: float64(2)}, + sample{t: minutes(22), v: 
float64(2)}, + sample{t: minutes(24), v: float64(2)}, + sample{t: minutes(26), v: float64(2)}, + sample{t: minutes(29), v: float64(2)}, + // Head + sample{t: minutes(10), v: float64(3)}, + sample{t: minutes(20), v: float64(3)}, + }, + expChunkError: false, + // ts (in minutes) 0 10 20 30 40 50 60 70 80 90 100 + // Query Interval [------------------------------------------------------------------------------------------] + // Chunk 0 [--------] + // Chunk 1 [--------] + // Chunk 2 [-------] + // Chunk 3: Current Head [--------] + // Output Graphically [----------------][-----------------] + expChunksSamples: []tsdbutil.SampleSlice{ + { + sample{t: minutes(10), v: float64(3)}, + sample{t: minutes(20), v: float64(2)}, + sample{t: minutes(22), v: float64(2)}, + sample{t: minutes(24), v: float64(2)}, + sample{t: minutes(26), v: float64(2)}, + sample{t: minutes(29), v: float64(2)}, + }, + { + sample{t: minutes(30), v: float64(1)}, + sample{t: minutes(32), v: float64(1)}, + sample{t: minutes(34), v: float64(1)}, + sample{t: minutes(36), v: float64(1)}, + sample{t: minutes(40), v: float64(0)}, + sample{t: minutes(42), v: float64(0)}, + sample{t: minutes(44), v: float64(0)}, + sample{t: minutes(46), v: float64(0)}, + sample{t: minutes(50), v: float64(0)}, + }, + }, + }, + { + name: "If chunks are not overlapped they are not converged", + queryMinT: minutes(0), + queryMaxT: minutes(100), + firstInOrderSampleAt: minutes(120), + inputSamples: tsdbutil.SampleSlice{ + // Chunk 0 + sample{t: minutes(10), v: float64(0)}, + sample{t: minutes(12), v: float64(0)}, + sample{t: minutes(14), v: float64(0)}, + sample{t: minutes(16), v: float64(0)}, + sample{t: minutes(18), v: float64(0)}, + // Chunk 1 + sample{t: minutes(20), v: float64(1)}, + sample{t: minutes(22), v: float64(1)}, + sample{t: minutes(24), v: float64(1)}, + sample{t: minutes(26), v: float64(1)}, + sample{t: minutes(28), v: float64(1)}, + // Chunk 2 + sample{t: minutes(30), v: float64(2)}, + sample{t: minutes(32), v: 
float64(2)}, + sample{t: minutes(34), v: float64(2)}, + sample{t: minutes(36), v: float64(2)}, + sample{t: minutes(38), v: float64(2)}, + // Head + sample{t: minutes(40), v: float64(3)}, + sample{t: minutes(42), v: float64(3)}, + }, + expChunkError: false, + // ts (in minutes) 0 10 20 30 40 50 60 70 80 90 100 + // Query Interval [------------------------------------------------------------------------------------------] + // Chunk 0 [-------] + // Chunk 1 [-------] + // Chunk 2 [-------] + // Chunk 3: Current Head [-------] + // Output Graphically [-------][-------][-------][--------] + expChunksSamples: []tsdbutil.SampleSlice{ + { + sample{t: minutes(10), v: float64(0)}, + sample{t: minutes(12), v: float64(0)}, + sample{t: minutes(14), v: float64(0)}, + sample{t: minutes(16), v: float64(0)}, + sample{t: minutes(18), v: float64(0)}, + }, + { + sample{t: minutes(20), v: float64(1)}, + sample{t: minutes(22), v: float64(1)}, + sample{t: minutes(24), v: float64(1)}, + sample{t: minutes(26), v: float64(1)}, + sample{t: minutes(28), v: float64(1)}, + }, + { + sample{t: minutes(30), v: float64(2)}, + sample{t: minutes(32), v: float64(2)}, + sample{t: minutes(34), v: float64(2)}, + sample{t: minutes(36), v: float64(2)}, + sample{t: minutes(38), v: float64(2)}, + }, + { + sample{t: minutes(40), v: float64(3)}, + sample{t: minutes(42), v: float64(3)}, + }, + }, + }, + { + name: "Triplet of chunks overlapping returns a single merged chunk", + queryMinT: minutes(0), + queryMaxT: minutes(100), + firstInOrderSampleAt: minutes(120), + inputSamples: tsdbutil.SampleSlice{ + // Chunk 0 + sample{t: minutes(10), v: float64(0)}, + sample{t: minutes(15), v: float64(0)}, + sample{t: minutes(20), v: float64(0)}, + sample{t: minutes(25), v: float64(0)}, + sample{t: minutes(30), v: float64(0)}, + // Chunk 1 + sample{t: minutes(20), v: float64(1)}, + sample{t: minutes(25), v: float64(1)}, + sample{t: minutes(30), v: float64(1)}, + sample{t: minutes(35), v: float64(1)}, + sample{t: 
minutes(42), v: float64(1)}, + // Chunk 2 Head + sample{t: minutes(32), v: float64(2)}, + sample{t: minutes(50), v: float64(2)}, + }, + expChunkError: false, + // ts (in minutes) 0 10 20 30 40 50 60 70 80 90 100 + // Query Interval [------------------------------------------------------------------------------------------] + // Chunk 0 [-----------------] + // Chunk 1 [--------------------] + // Chunk 2 Current Head [--------------] + // Output Graphically [-----------------------------------] + expChunksSamples: []tsdbutil.SampleSlice{ + { + sample{t: minutes(10), v: float64(0)}, + sample{t: minutes(15), v: float64(0)}, + sample{t: minutes(20), v: float64(1)}, + sample{t: minutes(25), v: float64(1)}, + sample{t: minutes(30), v: float64(1)}, + sample{t: minutes(32), v: float64(2)}, + sample{t: minutes(35), v: float64(1)}, + sample{t: minutes(42), v: float64(1)}, + sample{t: minutes(50), v: float64(2)}, + }, + }, + }, + { + name: "Query interval partially overlaps with a triplet of chunks but still returns a single merged chunk", + queryMinT: minutes(12), + queryMaxT: minutes(33), + firstInOrderSampleAt: minutes(120), + inputSamples: tsdbutil.SampleSlice{ + // Chunk 0 + sample{t: minutes(10), v: float64(0)}, + sample{t: minutes(15), v: float64(0)}, + sample{t: minutes(20), v: float64(0)}, + sample{t: minutes(25), v: float64(0)}, + sample{t: minutes(30), v: float64(0)}, + // Chunk 1 + sample{t: minutes(20), v: float64(1)}, + sample{t: minutes(25), v: float64(1)}, + sample{t: minutes(30), v: float64(1)}, + sample{t: minutes(35), v: float64(1)}, + sample{t: minutes(42), v: float64(1)}, + // Chunk 2 Head + sample{t: minutes(32), v: float64(2)}, + sample{t: minutes(50), v: float64(2)}, + }, + expChunkError: false, + // ts (in minutes) 0 10 20 30 40 50 60 70 80 90 100 + // Query Interval [------------------] + // Chunk 0 [-----------------] + // Chunk 1 [--------------------] + // Chunk 2 Current Head [--------------] + // Output Graphically 
[-----------------------------------] + expChunksSamples: []tsdbutil.SampleSlice{ + { + sample{t: minutes(10), v: float64(0)}, + sample{t: minutes(15), v: float64(0)}, + sample{t: minutes(20), v: float64(1)}, + sample{t: minutes(25), v: float64(1)}, + sample{t: minutes(30), v: float64(1)}, + sample{t: minutes(32), v: float64(2)}, + sample{t: minutes(35), v: float64(1)}, + sample{t: minutes(42), v: float64(1)}, + sample{t: minutes(50), v: float64(2)}, + }, + }, + }, + } + + for _, tc := range tests { + t.Run(fmt.Sprintf("name=%s", tc.name), func(t *testing.T) { + db := newTestDBWithOpts(t, opts) + + app := db.Appender(context.Background()) + s1Ref := appendSample(app, s1, tc.firstInOrderSampleAt, float64(tc.firstInOrderSampleAt/1*time.Minute.Milliseconds())) + require.NoError(t, app.Commit()) + + // OOO few samples for s1. + app = db.Appender(context.Background()) + for _, s := range tc.inputSamples { + appendSample(app, s1, s.T(), s.V()) + } + require.NoError(t, app.Commit()) + + // The Series method is the one that populates the chunk meta OOO + // markers like OOOLastRef. These are then used by the ChunkReader. 
+ ir := NewOOOHeadIndexReader(db.head, tc.queryMinT, tc.queryMaxT) + var chks []chunks.Meta + var respLset labels.Labels + err := ir.Series(s1Ref, &respLset, &chks) + require.NoError(t, err) + require.Equal(t, len(tc.expChunksSamples), len(chks)) + + cr := NewOOOHeadChunkReader(db.head, tc.queryMinT, tc.queryMaxT) + for i := 0; i < len(chks); i++ { + c, err := cr.Chunk(chks[i]) + require.NoError(t, err) + + var resultSamples tsdbutil.SampleSlice + it := c.Iterator(nil) + for it.Next() { + t, v := it.At() + resultSamples = append(resultSamples, sample{t: t, v: v}) + } + require.Equal(t, tc.expChunksSamples[i], resultSamples) + } + }) + } +} + +// TestOOOHeadChunkReader_Chunk_ConsistentQueryResponseDespiteOfHeadExpanding tests +// that if a query comes and performs a Series() call followed by a Chunks() call +// the response is consistent with the data seen by Series() even if the OOO +// head receives more samples before Chunks() is called. +// An example: +// - Response A comes from: Series() then Chunk() +// - Response B comes from : Series(), in parallel new samples added to the head, then Chunk() +// - A == B +func TestOOOHeadChunkReader_Chunk_ConsistentQueryResponseDespiteOfHeadExpanding(t *testing.T) { + opts := DefaultOptions() + opts.OutOfOrderCapMax = 5 + opts.OutOfOrderTimeWindow = 120 * time.Minute.Milliseconds() + + s1 := labels.FromStrings("l", "v1") + minutes := func(m int64) int64 { return m * time.Minute.Milliseconds() } + + appendSample := func(app storage.Appender, l labels.Labels, timestamp int64, value float64) storage.SeriesRef { + ref, err := app.Append(0, l, timestamp, value) + require.NoError(t, err) + return ref + } + + tests := []struct { + name string + queryMinT int64 + queryMaxT int64 + firstInOrderSampleAt int64 + initialSamples tsdbutil.SampleSlice + samplesAfterSeriesCall tsdbutil.SampleSlice + expChunkError bool + expChunksSamples []tsdbutil.SampleSlice + }{ + { + name: "Current head gets old, new and in between sample after Series 
call, they all should be omitted from the result", + queryMinT: minutes(0), + queryMaxT: minutes(100), + firstInOrderSampleAt: minutes(120), + initialSamples: tsdbutil.SampleSlice{ + // Chunk 0 + sample{t: minutes(20), v: float64(0)}, + sample{t: minutes(22), v: float64(0)}, + sample{t: minutes(24), v: float64(0)}, + sample{t: minutes(26), v: float64(0)}, + sample{t: minutes(30), v: float64(0)}, + // Chunk 1 Head + sample{t: minutes(25), v: float64(1)}, + sample{t: minutes(35), v: float64(1)}, + }, + samplesAfterSeriesCall: tsdbutil.SampleSlice{ + sample{t: minutes(10), v: float64(1)}, + sample{t: minutes(32), v: float64(1)}, + sample{t: minutes(50), v: float64(1)}, + }, + expChunkError: false, + // ts (in minutes) 0 10 20 30 40 50 60 70 80 90 100 + // Query Interval [-----------------------------------] + // Chunk 0: [--------] (5 samples) + // Chunk 1: Current Head [-------] (2 samples) + // New samples added after Series() + // Chunk 1: Current Head [-----------------------------------] (5 samples) + // Output Graphically [------------] (With 8 samples, samples newer than lastmint or older than lastmaxt are omitted but the ones in between are kept) + expChunksSamples: []tsdbutil.SampleSlice{ + { + sample{t: minutes(20), v: float64(0)}, + sample{t: minutes(22), v: float64(0)}, + sample{t: minutes(24), v: float64(0)}, + sample{t: minutes(25), v: float64(1)}, + sample{t: minutes(26), v: float64(0)}, + sample{t: minutes(30), v: float64(0)}, + sample{t: minutes(32), v: float64(1)}, // This sample was added after Series() but before Chunk() and its in between the lastmint and maxt so it should be kept + sample{t: minutes(35), v: float64(1)}, + }, + }, + }, + { + name: "After Series() previous head gets mmapped after getting samples, new head gets new samples also overlapping, none of these should appear in the response.", + queryMinT: minutes(0), + queryMaxT: minutes(100), + firstInOrderSampleAt: minutes(120), + initialSamples: tsdbutil.SampleSlice{ + // Chunk 0 + 
sample{t: minutes(20), v: float64(0)}, + sample{t: minutes(22), v: float64(0)}, + sample{t: minutes(24), v: float64(0)}, + sample{t: minutes(26), v: float64(0)}, + sample{t: minutes(30), v: float64(0)}, + // Chunk 1 Head + sample{t: minutes(25), v: float64(1)}, + sample{t: minutes(35), v: float64(1)}, + }, + samplesAfterSeriesCall: tsdbutil.SampleSlice{ + sample{t: minutes(10), v: float64(1)}, + sample{t: minutes(32), v: float64(1)}, + sample{t: minutes(50), v: float64(1)}, + // Chunk 1 gets mmapped and Chunk 2, the new head is born + sample{t: minutes(25), v: float64(2)}, + sample{t: minutes(31), v: float64(2)}, + }, + expChunkError: false, + // ts (in minutes) 0 10 20 30 40 50 60 70 80 90 100 + // Query Interval [-----------------------------------] + // Chunk 0: [--------] (5 samples) + // Chunk 1: Current Head [-------] (2 samples) + // New samples added after Series() + // Chunk 1 (mmapped) [-------------------------] (5 samples) + // Chunk 2: Current Head [-----------] (2 samples) + // Output Graphically [------------] (8 samples) It has 5 from Chunk 0 and 3 from Chunk 1 + expChunksSamples: []tsdbutil.SampleSlice{ + { + sample{t: minutes(20), v: float64(0)}, + sample{t: minutes(22), v: float64(0)}, + sample{t: minutes(24), v: float64(0)}, + sample{t: minutes(25), v: float64(1)}, + sample{t: minutes(26), v: float64(0)}, + sample{t: minutes(30), v: float64(0)}, + sample{t: minutes(32), v: float64(1)}, // This sample was added after Series() but before Chunk() and its in between the lastmint and maxt so it should be kept + sample{t: minutes(35), v: float64(1)}, + }, + }, + }, + } + + for _, tc := range tests { + t.Run(fmt.Sprintf("name=%s", tc.name), func(t *testing.T) { + db := newTestDBWithOpts(t, opts) + + app := db.Appender(context.Background()) + s1Ref := appendSample(app, s1, tc.firstInOrderSampleAt, float64(tc.firstInOrderSampleAt/1*time.Minute.Milliseconds())) + require.NoError(t, app.Commit()) + + // OOO few samples for s1. 
+ app = db.Appender(context.Background()) + for _, s := range tc.initialSamples { + appendSample(app, s1, s.T(), s.V()) + } + require.NoError(t, app.Commit()) + + // The Series method is the one that populates the chunk meta OOO + // markers like OOOLastRef. These are then used by the ChunkReader. + ir := NewOOOHeadIndexReader(db.head, tc.queryMinT, tc.queryMaxT) + var chks []chunks.Meta + var respLset labels.Labels + err := ir.Series(s1Ref, &respLset, &chks) + require.NoError(t, err) + require.Equal(t, len(tc.expChunksSamples), len(chks)) + + // Now we keep receiving ooo samples + // OOO few samples for s1. + app = db.Appender(context.Background()) + for _, s := range tc.samplesAfterSeriesCall { + appendSample(app, s1, s.T(), s.V()) + } + require.NoError(t, app.Commit()) + + cr := NewOOOHeadChunkReader(db.head, tc.queryMinT, tc.queryMaxT) + for i := 0; i < len(chks); i++ { + c, err := cr.Chunk(chks[i]) + require.NoError(t, err) + + var resultSamples tsdbutil.SampleSlice + it := c.Iterator(nil) + for it.Next() { + ts, v := it.At() + resultSamples = append(resultSamples, sample{t: ts, v: v}) + } + require.Equal(t, tc.expChunksSamples[i], resultSamples) + } + }) + } +} + +// TestSortByMinTimeAndMinRef tests that the sort function for chunk metas does sort +// by chunk meta MinTime and in case of same references by the lower reference. 
+func TestSortByMinTimeAndMinRef(t *testing.T) { + tests := []struct { + name string + input []chunkMetaAndChunkDiskMapperRef + exp []chunkMetaAndChunkDiskMapperRef + }{ + { + name: "chunks are ordered by min time", + input: []chunkMetaAndChunkDiskMapperRef{ + { + meta: chunks.Meta{ + Ref: 0, + MinTime: 0, + }, + ref: chunks.ChunkDiskMapperRef(0), + }, + { + meta: chunks.Meta{ + Ref: 1, + MinTime: 1, + }, + ref: chunks.ChunkDiskMapperRef(1), + }, + }, + exp: []chunkMetaAndChunkDiskMapperRef{ + { + meta: chunks.Meta{ + Ref: 0, + MinTime: 0, + }, + ref: chunks.ChunkDiskMapperRef(0), + }, + { + meta: chunks.Meta{ + Ref: 1, + MinTime: 1, + }, + ref: chunks.ChunkDiskMapperRef(1), + }, + }, + }, + { + name: "if same mintime, lower reference goes first", + input: []chunkMetaAndChunkDiskMapperRef{ + { + meta: chunks.Meta{ + Ref: 10, + MinTime: 0, + }, + ref: chunks.ChunkDiskMapperRef(0), + }, + { + meta: chunks.Meta{ + Ref: 5, + MinTime: 0, + }, + ref: chunks.ChunkDiskMapperRef(1), + }, + }, + exp: []chunkMetaAndChunkDiskMapperRef{ + { + meta: chunks.Meta{ + Ref: 5, + MinTime: 0, + }, + ref: chunks.ChunkDiskMapperRef(1), + }, + { + meta: chunks.Meta{ + Ref: 10, + MinTime: 0, + }, + ref: chunks.ChunkDiskMapperRef(0), + }, + }, + }, + } + + for _, tc := range tests { + t.Run(fmt.Sprintf("name=%s", tc.name), func(t *testing.T) { + sort.Sort(byMinTimeAndMinRef(tc.input)) + require.Equal(t, tc.exp, tc.input) + }) + } +} + +// TestSortMetaByMinTimeAndMinRef tests that the sort function for chunk metas does sort +// by chunk meta MinTime and in case of same references by the lower reference. 
+func TestSortMetaByMinTimeAndMinRef(t *testing.T) { + tests := []struct { + name string + inputMetas []chunks.Meta + expMetas []chunks.Meta + }{ + { + name: "chunks are ordered by min time", + inputMetas: []chunks.Meta{ + { + Ref: 0, + MinTime: 0, + }, + { + Ref: 1, + MinTime: 1, + }, + }, + expMetas: []chunks.Meta{ + { + Ref: 0, + MinTime: 0, + }, + { + Ref: 1, + MinTime: 1, + }, + }, + }, + { + name: "if same mintime, lower reference goes first", + inputMetas: []chunks.Meta{ + { + Ref: 10, + MinTime: 0, + }, + { + Ref: 5, + MinTime: 0, + }, + }, + expMetas: []chunks.Meta{ + { + Ref: 5, + MinTime: 0, + }, + { + Ref: 10, + MinTime: 0, + }, + }, + }, + } + + for _, tc := range tests { + t.Run(fmt.Sprintf("name=%s", tc.name), func(t *testing.T) { + sort.Sort(metaByMinTimeAndMinRef(tc.inputMetas)) + require.Equal(t, tc.expMetas, tc.inputMetas) + }) + } +} + +func newTestDBWithOpts(t *testing.T, opts *Options) *DB { + dir := t.TempDir() + + db, err := Open(dir, nil, nil, opts, nil) + require.NoError(t, err) + + t.Cleanup(func() { + require.NoError(t, db.Close()) + }) + + return db +} diff --git a/tsdb/ooo_head_test.go b/tsdb/ooo_head_test.go new file mode 100644 index 0000000000..de078b94c4 --- /dev/null +++ b/tsdb/ooo_head_test.go @@ -0,0 +1,93 @@ +// Copyright 2022 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tsdb + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +const testMaxSize int = 32 + +// Formulas chosen to make testing easy: +func valEven(pos int) int { return pos*2 + 2 } // s[0]=2, s[1]=4, s[2]=6, ..., s[31]=64 - Predictable pre-existing values +func valOdd(pos int) int { return pos*2 + 1 } // s[0]=1, s[1]=3, s[2]=5, ..., s[31]=63 - New values will interject at chosen position because they sort before the pre-existing vals. + +func samplify(v int) sample { return sample{int64(v), float64(v)} } + +func makeEvenSampleSlice(n int) []sample { + s := make([]sample, n) + for i := 0; i < n; i++ { + s[i] = samplify(valEven(i)) + } + return s +} + +// TestOOOInsert tests the following cases: +// - Number of pre-existing samples anywhere from 0 to testMaxSize-1. +// - Insert new sample before first pre-existing samples, after the last, and anywhere in between. +// - With a chunk initial capacity of testMaxSize/8 and testMaxSize, which lets us test non-full and full chunks, and chunks that need to expand themselves. +// Note: In all samples used, t always equals v in numeric value. when we talk about 'value' we just refer to a value that will be used for both sample.t and sample.v. +func TestOOOInsert(t *testing.T) { + for numPreExisting := 0; numPreExisting <= testMaxSize; numPreExisting++ { + // For example, if we have numPreExisting 2, then: + // chunk.samples indexes filled 0 1 + // chunk.samples with these values 2 4 // valEven + // we want to test inserting at index 0 1 2 // insertPos=0..numPreExisting + // we can do this by using values 1, 3 5 // valOdd(insertPos) + + for insertPos := 0; insertPos <= numPreExisting; insertPos++ { + chunk := NewOOOChunk() + chunk.samples = makeEvenSampleSlice(numPreExisting) + newSample := samplify(valOdd(insertPos)) + chunk.Insert(newSample.t, newSample.v) + + var expSamples []sample + // Our expected new samples slice, will be first the original samples. 
+ for i := 0; i < insertPos; i++ { + expSamples = append(expSamples, samplify(valEven(i))) + } + // Then the new sample. + expSamples = append(expSamples, newSample) + // Followed by any original samples that were pushed back by the new one. + for i := insertPos; i < numPreExisting; i++ { + expSamples = append(expSamples, samplify(valEven(i))) + } + + require.Equal(t, expSamples, chunk.samples, "numPreExisting %d, insertPos %d", numPreExisting, insertPos) + } + } +} + +// TestOOOInsertDuplicate tests the correct behavior when inserting a sample that is a duplicate of any +// pre-existing samples, with between 1 and testMaxSize pre-existing samples and +// with a chunk initial capacity of testMaxSize/8 and testMaxSize, which lets us test non-full and full chunks, and chunks that need to expand themselves. +func TestOOOInsertDuplicate(t *testing.T) { + for num := 1; num <= testMaxSize; num++ { + for dupPos := 0; dupPos < num; dupPos++ { + chunk := NewOOOChunk() + chunk.samples = makeEvenSampleSlice(num) + + dupSample := chunk.samples[dupPos] + dupSample.v = 0.123 + + ok := chunk.Insert(dupSample.t, dupSample.v) + + expSamples := makeEvenSampleSlice(num) // We expect no change. 
+ require.False(t, ok) + require.Equal(t, expSamples, chunk.samples, "num %d, dupPos %d", num, dupPos) + } + } +} diff --git a/tsdb/querier.go b/tsdb/querier.go index 522adb87cd..5141a2c1d2 100644 --- a/tsdb/querier.go +++ b/tsdb/querier.go @@ -569,7 +569,7 @@ func (p *populateWithDelGenericSeriesIterator) next() bool { p.i++ p.currChkMeta = p.chks[p.i] - p.currChkMeta.Chunk, p.err = p.chunks.Chunk(p.currChkMeta.Ref) + p.currChkMeta.Chunk, p.err = p.chunks.Chunk(p.currChkMeta) if p.err != nil { p.err = errors.Wrapf(p.err, "cannot populate chunk %d", p.currChkMeta.Ref) return false @@ -898,7 +898,7 @@ func newNopChunkReader() ChunkReader { } } -func (cr nopChunkReader) Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error) { +func (cr nopChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) { return cr.emptyChunk, nil } diff --git a/tsdb/querier_bench_test.go b/tsdb/querier_bench_test.go index 0bd295fbe7..3cf4e19346 100644 --- a/tsdb/querier_bench_test.go +++ b/tsdb/querier_bench_test.go @@ -34,7 +34,7 @@ func BenchmarkQuerier(b *testing.B) { opts := DefaultHeadOptions() opts.ChunkRange = 1000 opts.ChunkDirRoot = chunkDir - h, err := NewHead(nil, nil, nil, opts, nil) + h, err := NewHead(nil, nil, nil, nil, opts, nil) require.NoError(b, err) defer func() { require.NoError(b, h.Close()) @@ -180,7 +180,7 @@ func BenchmarkQuerierSelect(b *testing.B) { opts := DefaultHeadOptions() opts.ChunkRange = 1000 opts.ChunkDirRoot = chunkDir - h, err := NewHead(nil, nil, nil, opts, nil) + h, err := NewHead(nil, nil, nil, nil, opts, nil) require.NoError(b, err) defer h.Close() app := h.Appender(context.Background()) diff --git a/tsdb/querier_test.go b/tsdb/querier_test.go index 7d3cf2dc32..c0ba864510 100644 --- a/tsdb/querier_test.go +++ b/tsdb/querier_test.go @@ -458,7 +458,7 @@ func TestBlockQuerier_AgainstHeadWithOpenChunks(t *testing.T) { t.Run("", func(t *testing.T) { opts := DefaultHeadOptions() opts.ChunkRange = 2 * time.Hour.Milliseconds() - h, err := NewHead(nil, nil, 
nil, opts, nil) + h, err := NewHead(nil, nil, nil, nil, opts, nil) require.NoError(t, err) defer h.Close() @@ -627,10 +627,10 @@ func createFakeReaderAndNotPopulatedChunks(s ...[]tsdbutil.Sample) (*fakeChunksR return f, chks } -func (r *fakeChunksReader) Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error) { - chk, ok := r.chks[ref] +func (r *fakeChunksReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) { + chk, ok := r.chks[meta.Ref] if !ok { - return nil, errors.Errorf("chunk not found at ref %v", ref) + return nil, errors.Errorf("chunk not found at ref %v", meta.Ref) } return chk, nil } @@ -1016,8 +1016,8 @@ func BenchmarkMergedSeriesSet(b *testing.B) { type mockChunkReader map[chunks.ChunkRef]chunkenc.Chunk -func (cr mockChunkReader) Chunk(id chunks.ChunkRef) (chunkenc.Chunk, error) { - chk, ok := cr[id] +func (cr mockChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) { + chk, ok := cr[meta.Ref] if ok { return chk, nil } @@ -1643,7 +1643,7 @@ func TestPostingsForMatchers(t *testing.T) { opts := DefaultHeadOptions() opts.ChunkRange = 1000 opts.ChunkDirRoot = chunkDir - h, err := NewHead(nil, nil, nil, opts, nil) + h, err := NewHead(nil, nil, nil, nil, opts, nil) require.NoError(t, err) defer func() { require.NoError(t, h.Close()) @@ -1944,13 +1944,17 @@ func BenchmarkQueries(b *testing.B) { }, } - queryTypes := make(map[string]storage.Querier) + type qt struct { + typ string + querier storage.Querier + } + var queryTypes []qt // We use a slice instead of map to keep the order of test cases consistent. defer func() { for _, q := range queryTypes { // Can't run a check for error here as some of these will fail as // queryTypes is using the same slice for the different block queriers // and would have been closed in the previous iteration. 
- q.Close() + q.querier.Close() } }() @@ -1991,21 +1995,38 @@ func BenchmarkQueries(b *testing.B) { qs = append(qs, q) } - queryTypes["_1-Block"] = storage.NewMergeQuerier(qs[:1], nil, storage.ChainedSeriesMerge) - queryTypes["_3-Blocks"] = storage.NewMergeQuerier(qs[0:3], nil, storage.ChainedSeriesMerge) - queryTypes["_10-Blocks"] = storage.NewMergeQuerier(qs, nil, storage.ChainedSeriesMerge) + queryTypes = append(queryTypes, qt{"_1-Block", storage.NewMergeQuerier(qs[:1], nil, storage.ChainedSeriesMerge)}) + queryTypes = append(queryTypes, qt{"_3-Blocks", storage.NewMergeQuerier(qs[0:3], nil, storage.ChainedSeriesMerge)}) + queryTypes = append(queryTypes, qt{"_10-Blocks", storage.NewMergeQuerier(qs, nil, storage.ChainedSeriesMerge)}) chunkDir := b.TempDir() head := createHead(b, nil, series, chunkDir) - qHead, err := NewBlockQuerier(head, 1, nSamples) + qHead, err := NewBlockQuerier(NewRangeHead(head, 1, nSamples), 1, nSamples) require.NoError(b, err) - queryTypes["_Head"] = qHead + queryTypes = append(queryTypes, qt{"_Head", qHead}) - for qtype, querier := range queryTypes { - b.Run(title+qtype+"_nSeries:"+strconv.Itoa(nSeries)+"_nSamples:"+strconv.Itoa(int(nSamples)), func(b *testing.B) { + for _, oooPercentage := range []int{1, 3, 5, 10} { + chunkDir := b.TempDir() + totalOOOSamples := oooPercentage * int(nSamples) / 100 + oooSampleFrequency := int(nSamples) / totalOOOSamples + head := createHeadWithOOOSamples(b, nil, series, chunkDir, oooSampleFrequency) + + qHead, err := NewBlockQuerier(NewRangeHead(head, 1, nSamples), 1, nSamples) + require.NoError(b, err) + qOOOHead, err := NewBlockQuerier(NewOOORangeHead(head, 1, nSamples), 1, nSamples) + require.NoError(b, err) + + queryTypes = append(queryTypes, qt{ + fmt.Sprintf("_Head_oooPercent:%d", oooPercentage), + storage.NewMergeQuerier([]storage.Querier{qHead, qOOOHead}, nil, storage.ChainedSeriesMerge), + }) + } + + for _, q := range queryTypes { + 
b.Run(title+q.typ+"_nSeries:"+strconv.Itoa(nSeries)+"_nSamples:"+strconv.Itoa(int(nSamples)), func(b *testing.B) { expExpansions, err := strconv.Atoi(string(title[len(title)-1])) require.NoError(b, err) - benchQuery(b, expExpansions, querier, selectors) + benchQuery(b, expExpansions, q.querier, selectors) }) } require.NoError(b, head.Close()) @@ -2025,6 +2046,7 @@ func benchQuery(b *testing.B, expExpansions int, q storage.Querier, selectors la s.Labels() it := s.Iterator() for it.Next() { + _, _ = it.At() } actualExpansions++ } diff --git a/tsdb/record/record.go b/tsdb/record/record.go index ee7169a457..162414a3ce 100644 --- a/tsdb/record/record.go +++ b/tsdb/record/record.go @@ -43,6 +43,8 @@ const ( Tombstones Type = 3 // Exemplars is used to match WAL records of type Exemplars. Exemplars Type = 4 + // MmapMarkers is used to match OOO WBL records of type MmapMarkers. + MmapMarkers Type = 5 // Metadata is used to match WAL records of type Metadata. Metadata Type = 6 ) @@ -57,6 +59,8 @@ func (rt Type) String() string { return "exemplars" case Tombstones: return "tombstones" + case MmapMarkers: + return "mmapmarkers" case Metadata: return "metadata" default: @@ -157,6 +161,12 @@ type RefExemplar struct { Labels labels.Labels } +// RefMmapMarker marks that the all the samples of the given series until now have been m-mapped to disk. +type RefMmapMarker struct { + Ref chunks.HeadSeriesRef + MmapRef chunks.ChunkDiskMapperRef +} + // Decoder decodes series, sample, metadata and tombstone records. // The zero value is ready to use. 
type Decoder struct{} @@ -168,7 +178,7 @@ func (d *Decoder) Type(rec []byte) Type { return Unknown } switch t := Type(rec[0]); t { - case Series, Samples, Tombstones, Exemplars, Metadata: + case Series, Samples, Tombstones, Exemplars, MmapMarkers, Metadata: return t } return Unknown @@ -354,6 +364,34 @@ func (d *Decoder) ExemplarsFromBuffer(dec *encoding.Decbuf, exemplars []RefExemp return exemplars, nil } +func (d *Decoder) MmapMarkers(rec []byte, markers []RefMmapMarker) ([]RefMmapMarker, error) { + dec := encoding.Decbuf{B: rec} + t := Type(dec.Byte()) + if t != MmapMarkers { + return nil, errors.New("invalid record type") + } + + if dec.Len() == 0 { + return markers, nil + } + for len(dec.B) > 0 && dec.Err() == nil { + ref := chunks.HeadSeriesRef(dec.Be64()) + mmapRef := chunks.ChunkDiskMapperRef(dec.Be64()) + markers = append(markers, RefMmapMarker{ + Ref: ref, + MmapRef: mmapRef, + }) + } + + if dec.Err() != nil { + return nil, errors.Wrapf(dec.Err(), "decode error after %d mmap markers", len(markers)) + } + if len(dec.B) > 0 { + return nil, errors.Errorf("unexpected %d bytes left in entry", len(dec.B)) + } + return markers, nil +} + // Encoder encodes series, sample, and tombstones records. // The zero value is ready to use. 
type Encoder struct{} @@ -467,3 +505,15 @@ func (e *Encoder) EncodeExemplarsIntoBuffer(exemplars []RefExemplar, buf *encodi EncodeLabels(buf, ex.Labels) } } + +func (e *Encoder) MmapMarkers(markers []RefMmapMarker, b []byte) []byte { + buf := encoding.Encbuf{B: b} + buf.PutByte(byte(MmapMarkers)) + + for _, s := range markers { + buf.PutBE64(uint64(s.Ref)) + buf.PutBE64(uint64(s.MmapRef)) + } + + return buf.Get() +} diff --git a/tsdb/wal/wal.go b/tsdb/wal/wal.go index ace6a99566..191b09ed99 100644 --- a/tsdb/wal/wal.go +++ b/tsdb/wal/wal.go @@ -40,6 +40,7 @@ const ( DefaultSegmentSize = 128 * 1024 * 1024 // 128 MB pageSize = 32 * 1024 // 32KB recordHeaderSize = 7 + WblDirName = "wbl" ) // The table gets initialized with sync.Once but may still cause a race @@ -204,32 +205,32 @@ func newWALMetrics(r prometheus.Registerer) *walMetrics { m := &walMetrics{} m.fsyncDuration = prometheus.NewSummary(prometheus.SummaryOpts{ - Name: "prometheus_tsdb_wal_fsync_duration_seconds", + Name: "fsync_duration_seconds", Help: "Duration of WAL fsync.", Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, }) m.pageFlushes = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "prometheus_tsdb_wal_page_flushes_total", + Name: "page_flushes_total", Help: "Total number of page flushes.", }) m.pageCompletions = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "prometheus_tsdb_wal_completed_pages_total", + Name: "completed_pages_total", Help: "Total number of completed pages.", }) m.truncateFail = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "prometheus_tsdb_wal_truncations_failed_total", + Name: "truncations_failed_total", Help: "Total number of WAL truncations that failed.", }) m.truncateTotal = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "prometheus_tsdb_wal_truncations_total", + Name: "truncations_total", Help: "Total number of WAL truncations attempted.", }) m.currentSegment = prometheus.NewGauge(prometheus.GaugeOpts{ - Name: 
"prometheus_tsdb_wal_segment_current", + Name: "segment_current", Help: "WAL segment index that TSDB is currently writing to.", }) m.writesFailed = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "prometheus_tsdb_wal_writes_failed_total", + Name: "writes_failed_total", Help: "Total number of WAL writes that failed.", }) @@ -274,7 +275,11 @@ func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSi stopc: make(chan chan struct{}), compress: compress, } - w.metrics = newWALMetrics(reg) + prefix := "prometheus_tsdb_wal_" + if filepath.Base(dir) == WblDirName { + prefix = "prometheus_tsdb_out_of_order_wal_" + } + w.metrics = newWALMetrics(prometheus.WrapRegistererWithPrefix(prefix, reg)) _, last, err := Segments(w.Dir()) if err != nil { @@ -459,36 +464,46 @@ func SegmentName(dir string, i int) string { return filepath.Join(dir, fmt.Sprintf("%08d", i)) } -// NextSegment creates the next segment and closes the previous one. -func (w *WAL) NextSegment() error { +// NextSegment creates the next segment and closes the previous one asynchronously. +// It returns the file number of the new file. +func (w *WAL) NextSegment() (int, error) { w.mtx.Lock() defer w.mtx.Unlock() - return w.nextSegment() + return w.nextSegment(true) +} + +// NextSegmentSync creates the next segment and closes the previous one in sync. +// It returns the file number of the new file. +func (w *WAL) NextSegmentSync() (int, error) { + w.mtx.Lock() + defer w.mtx.Unlock() + return w.nextSegment(false) } // nextSegment creates the next segment and closes the previous one. -func (w *WAL) nextSegment() error { +// It returns the file number of the new file. +func (w *WAL) nextSegment(async bool) (int, error) { if w.closed { - return errors.New("wal is closed") + return 0, errors.New("wal is closed") } // Only flush the current page if it actually holds data. 
if w.page.alloc > 0 { if err := w.flushPage(true); err != nil { - return err + return 0, err } } next, err := CreateSegment(w.Dir(), w.segment.Index()+1) if err != nil { - return errors.Wrap(err, "create new segment file") + return 0, errors.Wrap(err, "create new segment file") } prev := w.segment if err := w.setSegment(next); err != nil { - return err + return 0, err } // Don't block further writes by fsyncing the last segment. - w.actorc <- func() { + f := func() { if err := w.fsync(prev); err != nil { level.Error(w.logger).Log("msg", "sync previous segment", "err", err) } @@ -496,7 +511,12 @@ func (w *WAL) nextSegment() error { level.Error(w.logger).Log("msg", "close previous segment", "err", err) } } - return nil + if async { + w.actorc <- f + } else { + f() + } + return next.Index(), nil } func (w *WAL) setSegment(segment *Segment) error { @@ -638,7 +658,7 @@ func (w *WAL) log(rec []byte, final bool) error { left += (pageSize - recordHeaderSize) * (w.pagesPerSegment() - w.donePages - 1) // Free pages in the active segment. if len(rec) > left { - if err := w.nextSegment(); err != nil { + if _, err := w.nextSegment(true); err != nil { return err } } @@ -745,6 +765,13 @@ func (w *WAL) fsync(f *Segment) error { return err } +// Sync forces a file sync on the current wal segment. This function is meant +// to be used only on tests due to different behaviour on Operating Systems +// like windows and linux +func (w *WAL) Sync() error { + return w.fsync(w.segment) +} + // Close flushes all writes and closes active segment. 
func (w *WAL) Close() (err error) { w.mtx.Lock() diff --git a/tsdb/wal/watcher_test.go b/tsdb/wal/watcher_test.go index 0892d972c3..b89f8bead9 100644 --- a/tsdb/wal/watcher_test.go +++ b/tsdb/wal/watcher_test.go @@ -364,14 +364,16 @@ func TestReadCheckpoint(t *testing.T) { err := os.Mkdir(wdir, 0o777) require.NoError(t, err) - os.Create(SegmentName(wdir, 30)) + f, err := os.Create(SegmentName(wdir, 30)) + require.NoError(t, err) + require.NoError(t, f.Close()) enc := record.Encoder{} w, err := NewSize(nil, nil, wdir, 128*pageSize, compress) require.NoError(t, err) - defer func() { + t.Cleanup(func() { require.NoError(t, w.Close()) - }() + }) // Write to the initial segment then checkpoint. for i := 0; i < seriesCount; i++ { @@ -396,8 +398,11 @@ func TestReadCheckpoint(t *testing.T) { require.NoError(t, w.Log(sample)) } } - Checkpoint(log.NewNopLogger(), w, 30, 31, func(x chunks.HeadSeriesRef) bool { return true }, 0) - w.Truncate(32) + _, err = w.NextSegmentSync() + require.NoError(t, err) + _, err = Checkpoint(log.NewNopLogger(), w, 30, 31, func(x chunks.HeadSeriesRef) bool { return true }, 0) + require.NoError(t, err) + require.NoError(t, w.Truncate(32)) // Start read after checkpoint, no more data written. _, _, err = Segments(w.Dir()) diff --git a/web/api/v1/api_test.go b/web/api/v1/api_test.go index a904e47db3..d672807d3f 100644 --- a/web/api/v1/api_test.go +++ b/web/api/v1/api_test.go @@ -2314,7 +2314,7 @@ func (f *fakeDB) Stats(statsByLabelName string) (_ *tsdb.Stats, retErr error) { }() opts := tsdb.DefaultHeadOptions() opts.ChunkRange = 1000 - h, _ := tsdb.NewHead(nil, nil, nil, opts, nil) + h, _ := tsdb.NewHead(nil, nil, nil, nil, opts, nil) return h.Stats(statsByLabelName), nil }