Add out-of-order sample support to the TSDB (#11075)

* Introduce out-of-order TSDB support

This implementation is based on this design doc:
https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing

This commit adds support for accepting out-of-order ("OOO") samples into the
TSDB up to a configurable time allowance. If OOO is enabled, overlapping
queries are automatically enabled.
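For illustration, a minimal sketch (not part of this change) of enabling the
window when embedding the TSDB, using the Options fields added in this diff;
dir and logger are placeholders:

    opts := tsdb.DefaultOptions()
    // The window is in milliseconds, like every other TSDB timestamp.
    opts.OutOfOrderTimeWindow = (30 * time.Minute).Milliseconds()
    db, err := tsdb.Open(dir, logger, nil, opts, nil)
    if err != nil {
        log.Fatal(err)
    }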

Most of the additions have been borrowed from
https://github.com/grafana/mimir-prometheus/
Here is the list of the original commits cherry-picked
from mimir-prometheus into this branch:
- 4b2198d7ec
- 2836e5513f
- 00b379c3a5
- ff0dc75758
- a632c73352
- c6f3d4ab33
- 5e8406a1d4
- abde1e0ba1
- e70e769889
- df59320886

Co-authored-by: Jesus Vazquez <jesus.vazquez@grafana.com>
Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com>
Co-authored-by: Dieter Plaetinck <dieter@grafana.com>
Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* gofumpt files

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Add license header to missing files

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix OOO tests due to existing chunk disk mapper implementation

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix truncate int overflow

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Add Sync method to the WAL and update tests

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* remove useless sync

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Update minOOOTime after truncating Head

* Update minOOOTime after truncating Head

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Add a unit test

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Load OutOfOrderTimeWindow only once per appender

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix OOO Head LabelValues and PostingsForMatchers

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix replay of OOO mmap chunks

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unnecessary err check

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Prevent panic with ApplyConfig

Signed-off-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com>
Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Run OOO compaction after restart if there is OOO data from WBL

Signed-off-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com>
Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Apply Bartek's suggestions

Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Refactor OOO compaction

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Address comments and TODOs

- Added a comment explaining why we need the allow overlapping
  compaction toggle
- Clarified TSDBConfig OutOfOrderTimeWindow doc
- Added an owner to all the TODOs in the code

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Run go format

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix remaining review comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix tests

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Change wbl reference when truncating ooo in TestHeadMinOOOTimeUpdate

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix TestWBLAndMmapReplay test failure on windows

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Address most of the feedback

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Refactor the block meta for out of order

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix windows error

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>
Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
Signed-off-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com>
Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com>
Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com>
Co-authored-by: Dieter Plaetinck <dieter@grafana.com>
Co-authored-by: Oleg Zaytsev <mail@olegzaytsev.com>
Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
Committed by GitHub on behalf of Jesus Vazquez, 2022-09-20 19:05:50 +02:00
commit c1b669bf9b (parent af6167df58)
38 changed files with 6655 additions and 380 deletions


@@ -463,6 +463,9 @@ func main() {
 		}
 		cfg.tsdb.MaxExemplars = int64(cfgFile.StorageConfig.ExemplarsConfig.MaxExemplars)
 	}
+	if cfgFile.StorageConfig.TSDBConfig != nil {
+		cfg.tsdb.OutOfOrderTimeWindow = cfgFile.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
+	}

 	// Now that the validity of the config is established, set the config
 	// success metrics accordingly, although the config isn't really loaded
@@ -1537,6 +1540,7 @@ type tsdbOptions struct {
 	StripeSize                     int
 	MinBlockDuration               model.Duration
 	MaxBlockDuration               model.Duration
+	OutOfOrderTimeWindow           int64
 	EnableExemplarStorage          bool
 	MaxExemplars                   int64
 	EnableMemorySnapshotOnShutdown bool
@@ -1549,7 +1553,8 @@ func (opts tsdbOptions) ToTSDBOptions() tsdb.Options {
 		RetentionDuration:              int64(time.Duration(opts.RetentionDuration) / time.Millisecond),
 		MaxBytes:                       int64(opts.MaxBytes),
 		NoLockfile:                     opts.NoLockfile,
-		AllowOverlappingBlocks:         opts.AllowOverlappingBlocks,
+		AllowOverlappingCompaction:     opts.AllowOverlappingBlocks,
+		AllowOverlappingQueries:        opts.AllowOverlappingBlocks,
 		WALCompression:                 opts.WALCompression,
 		HeadChunksWriteQueueSize:       opts.HeadChunksWriteQueueSize,
 		StripeSize:                     opts.StripeSize,
@@ -1558,6 +1563,7 @@ func (opts tsdbOptions) ToTSDBOptions() tsdb.Options {
 		EnableExemplarStorage:          opts.EnableExemplarStorage,
 		MaxExemplars:                   opts.MaxExemplars,
 		EnableMemorySnapshotOnShutdown: opts.EnableMemorySnapshotOnShutdown,
+		OutOfOrderTimeWindow:           opts.OutOfOrderTimeWindow,
 	}
 }


@@ -117,7 +117,8 @@ func TestBackfillRuleIntegration(t *testing.T) {
 			}

 			opts := tsdb.DefaultOptions()
-			opts.AllowOverlappingBlocks = true
+			opts.AllowOverlappingQueries = true
+			opts.AllowOverlappingCompaction = true
 			db, err := tsdb.Open(tmpDir, nil, nil, opts, nil)
 			require.NoError(t, err)
@@ -245,7 +246,8 @@ func TestBackfillLabels(t *testing.T) {
 	}

 	opts := tsdb.DefaultOptions()
-	opts.AllowOverlappingBlocks = true
+	opts.AllowOverlappingQueries = true
+	opts.AllowOverlappingCompaction = true
 	db, err := tsdb.Open(tmpDir, nil, nil, opts, nil)
 	require.NoError(t, err)


@@ -597,7 +597,7 @@ func analyzeCompaction(block tsdb.BlockReader, indexr tsdb.IndexReader) (err err
 	for _, chk := range chks {
 		// Load the actual data of the chunk.
-		chk, err := chunkr.Chunk(chk.Ref)
+		chk, err := chunkr.Chunk(chk)
 		if err != nil {
 			return err
 		}


@@ -501,9 +501,37 @@ func (c *ScrapeConfig) MarshalYAML() (interface{}, error) {

 // StorageConfig configures runtime reloadable configuration options.
 type StorageConfig struct {
+	TSDBConfig      *TSDBConfig      `yaml:"tsdb,omitempty"`
 	ExemplarsConfig *ExemplarsConfig `yaml:"exemplars,omitempty"`
 }

+// TSDBConfig configures runtime reloadable configuration options.
+type TSDBConfig struct {
+	// OutOfOrderTimeWindow sets how long back in time an out-of-order sample can be inserted
+	// into the TSDB. This flag is typically set while unmarshaling the configuration file and translating
+	// OutOfOrderTimeWindowFlag's duration. The unit of this flag is expected to be the same as any
+	// other timestamp in the TSDB.
+	OutOfOrderTimeWindow int64
+
+	// OutOfOrderTimeWindowFlag holds the parsed duration from the config file.
+	// During unmarshall, this is converted into milliseconds and stored in OutOfOrderTimeWindow.
+	// This should not be used directly and must be converted into OutOfOrderTimeWindow.
+	OutOfOrderTimeWindowFlag model.Duration `yaml:"out_of_order_time_window,omitempty"`
+}
+
+// UnmarshalYAML implements the yaml.Unmarshaler interface.
+func (t *TSDBConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	*t = TSDBConfig{}
+	type plain TSDBConfig
+	if err := unmarshal((*plain)(t)); err != nil {
+		return err
+	}
+	t.OutOfOrderTimeWindow = time.Duration(t.OutOfOrderTimeWindowFlag).Milliseconds()
+	return nil
+}
+
 type TracingClientType string

 const (
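A sketch (not part of this diff) of what the unmarshaling above produces,
assuming gopkg.in/yaml.v2 is imported as yaml:

    var t config.TSDBConfig
    if err := yaml.Unmarshal([]byte("out_of_order_time_window: 30m"), &t); err != nil {
        panic(err)
    }
    // t.OutOfOrderTimeWindowFlag holds the parsed 30m duration;
    // t.OutOfOrderTimeWindow is 1800000, i.e. 30m in milliseconds.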


@@ -27,10 +27,15 @@ import (
 // The errors exposed.
 var (
 	ErrNotFound = errors.New("not found")
-	ErrOutOfOrderSample = errors.New("out of order sample")
+	// ErrOutOfOrderSample is when out of order support is disabled and the sample is out of order.
+	ErrOutOfOrderSample = errors.New("out of order sample")
+	// ErrOutOfBounds is when out of order support is disabled and the sample is older than the min valid time for the append.
+	ErrOutOfBounds = errors.New("out of bounds")
+	// ErrTooOldSample is when out of order support is enabled but the sample is outside the time window allowed.
+	ErrTooOldSample = errors.New("too old sample")
+	// ErrDuplicateSampleForTimestamp is when the sample has same timestamp but different value.
 	ErrDuplicateSampleForTimestamp = errors.New("duplicate sample for timestamp")
-	ErrOutOfBounds = errors.New("out of bounds")
 	ErrOutOfOrderExemplar = errors.New("out of order exemplar")
 	ErrDuplicateExemplar = errors.New("duplicate exemplar")
 	ErrExemplarLabelLength = fmt.Errorf("label length for exemplar exceeds maximum of %d UTF-8 characters", exemplar.ExemplarMaxLabelSetLength)
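A sketch (not part of this diff) of how an ingestion path might tell the new
cases apart; the append call itself is illustrative:

    _, err := app.Append(0, lset, ts, val)
    switch {
    case errors.Is(err, storage.ErrOutOfOrderSample):
        // OOO disabled and the sample is older than the newest sample of the series.
    case errors.Is(err, storage.ErrTooOldSample):
        // OOO enabled, but the sample falls outside the configured time window.
    case errors.Is(err, storage.ErrDuplicateSampleForTimestamp):
        // Same timestamp as an existing sample, different value.
    }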


@@ -717,3 +717,56 @@ func (h *chunkIteratorHeap) Pop() interface{} {
 	*h = old[0 : n-1]
 	return x
 }
+
+// NewConcatenatingChunkSeriesMerger returns a VerticalChunkSeriesMergeFunc that simply concatenates the
+// chunks from the series. The resultant stream of chunks for a series might be overlapping and unsorted.
+func NewConcatenatingChunkSeriesMerger() VerticalChunkSeriesMergeFunc {
+	return func(series ...ChunkSeries) ChunkSeries {
+		if len(series) == 0 {
+			return nil
+		}
+		return &ChunkSeriesEntry{
+			Lset: series[0].Labels(),
+			ChunkIteratorFn: func() chunks.Iterator {
+				iterators := make([]chunks.Iterator, 0, len(series))
+				for _, s := range series {
+					iterators = append(iterators, s.Iterator())
+				}
+				return &concatenatingChunkIterator{
+					iterators: iterators,
+				}
+			},
+		}
+	}
+}
+
+type concatenatingChunkIterator struct {
+	iterators []chunks.Iterator
+	idx       int
+	curr      chunks.Meta
+}
+
+func (c *concatenatingChunkIterator) At() chunks.Meta {
+	return c.curr
+}
+
+func (c *concatenatingChunkIterator) Next() bool {
+	if c.idx >= len(c.iterators) {
+		return false
+	}
+	if c.iterators[c.idx].Next() {
+		c.curr = c.iterators[c.idx].At()
+		return true
+	}
+	c.idx++
+	return c.Next()
+}
+
+func (c *concatenatingChunkIterator) Err() error {
+	errs := tsdb_errors.NewMulti()
+	for _, iter := range c.iterators {
+		errs.Add(iter.Err())
+	}
+	return errs.Err()
+}
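Unlike the compacting merger, nothing here is deduplicated or re-sorted; a
sketch (not part of this diff), with seriesA and seriesB standing in for any
ChunkSeries with identical labels:

    merge := NewConcatenatingChunkSeriesMerger()
    it := merge(seriesA, seriesB).Iterator()
    for it.Next() {
        _ = it.At() // chunks of seriesA first, then seriesB; possibly overlapping and unsorted
    }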


@@ -499,6 +499,140 @@ func TestCompactingChunkSeriesMerger(t *testing.T) {
 	}
 }

+func TestConcatenatingChunkSeriesMerger(t *testing.T) {
+	m := NewConcatenatingChunkSeriesMerger()
+
+	for _, tc := range []struct {
+		name     string
+		input    []ChunkSeries
+		expected ChunkSeries
+	}{
+		{
+			name: "single empty series",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
+		},
+		{
+			name: "single series",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}}),
+		},
+		{
+			name: "two empty series",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil, nil),
+		},
+		{
+			name: "two non overlapping",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{5, 5}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{5, 5}}, []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}),
+		},
+		{
+			name: "two overlapping",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{8, 8}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{8, 8}},
+				[]tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}},
+			),
+		},
+		{
+			name: "two duplicated",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{5, 5}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}},
+				[]tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{5, 5}},
+			),
+		},
+		{
+			name: "three overlapping",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{6, 6}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{0, 0}, sample{4, 4}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}},
+				[]tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{6, 6}},
+				[]tsdbutil.Sample{sample{0, 0}, sample{4, 4}},
+			),
+		},
+		{
+			name: "three in chained overlap",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{4, 4}, sample{6, 66}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{6, 6}, sample{10, 10}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}},
+				[]tsdbutil.Sample{sample{4, 4}, sample{6, 66}},
+				[]tsdbutil.Sample{sample{6, 6}, sample{10, 10}},
+			),
+		},
+		{
+			name: "three in chained overlap complex",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{0, 0}, sample{5, 5}}, []tsdbutil.Sample{sample{10, 10}, sample{15, 15}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{2, 2}, sample{20, 20}}, []tsdbutil.Sample{sample{25, 25}, sample{30, 30}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{18, 18}, sample{26, 26}}, []tsdbutil.Sample{sample{31, 31}, sample{35, 35}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				[]tsdbutil.Sample{sample{0, 0}, sample{5, 5}}, []tsdbutil.Sample{sample{10, 10}, sample{15, 15}},
+				[]tsdbutil.Sample{sample{2, 2}, sample{20, 20}}, []tsdbutil.Sample{sample{25, 25}, sample{30, 30}},
+				[]tsdbutil.Sample{sample{18, 18}, sample{26, 26}}, []tsdbutil.Sample{sample{31, 31}, sample{35, 35}},
+			),
+		},
+		{
+			name: "110 overlapping",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(0, 110)), // [0 - 110)
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(60, 50)), // [60 - 110)
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				tsdbutil.GenerateSamples(0, 110),
+				tsdbutil.GenerateSamples(60, 50),
+			),
+		},
+		{
+			name: "150 overlapping samples, simply concatenated and no splits",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(0, 90)),  // [0 - 90)
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(60, 90)), // [90 - 150)
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				tsdbutil.GenerateSamples(0, 90),
+				tsdbutil.GenerateSamples(60, 90),
+			),
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			merged := m(tc.input...)
+			require.Equal(t, tc.expected.Labels(), merged.Labels())
+			actChks, actErr := ExpandChunks(merged.Iterator())
+			expChks, expErr := ExpandChunks(tc.expected.Iterator())
+
+			require.Equal(t, expErr, actErr)
+			require.Equal(t, expChks, actChks)
+		})
+	}
+}
+
 type mockQuerier struct {
 	LabelQuerier


@@ -567,8 +567,7 @@ func (db *DB) truncate(mint int64) error {
 	// Start a new segment so low ingestion volume instances don't have more WAL
 	// than needed.
-	err = db.wal.NextSegment()
-	if err != nil {
+	if _, err := db.wal.NextSegment(); err != nil {
 		return errors.Wrap(err, "next segment")
 	}


@@ -116,7 +116,7 @@ type ChunkWriter interface {
 // ChunkReader provides reading access of serialized time series data.
 type ChunkReader interface {
 	// Chunk returns the series data chunk with the given reference.
-	Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error)
+	Chunk(meta chunks.Meta) (chunkenc.Chunk, error)

 	// Close releases all underlying resources of the reader.
 	Close() error
@@ -189,12 +189,39 @@ type BlockMetaCompaction struct {
 	// this block.
 	Parents []BlockDesc `json:"parents,omitempty"`
 	Failed  bool        `json:"failed,omitempty"`
+	// Additional information about the compaction, for example, block created from out-of-order chunks.
+	Hints []string `json:"hints,omitempty"`
+}
+
+func (bm *BlockMetaCompaction) SetOutOfOrder() {
+	if bm.containsHint(CompactionHintFromOutOfOrder) {
+		return
+	}
+	bm.Hints = append(bm.Hints, CompactionHintFromOutOfOrder)
+	sort.Strings(bm.Hints)
+}
+
+func (bm *BlockMetaCompaction) FromOutOfOrder() bool {
+	return bm.containsHint(CompactionHintFromOutOfOrder)
+}
+
+func (bm *BlockMetaCompaction) containsHint(hint string) bool {
+	for _, h := range bm.Hints {
+		if h == hint {
+			return true
+		}
+	}
+	return false
 }

 const (
 	indexFilename = "index"
 	metaFilename  = "meta.json"
 	metaVersion1  = 1
+
+	// CompactionHintFromOutOfOrder is a hint noting that the block
+	// was created from out-of-order chunks.
+	CompactionHintFromOutOfOrder = "from-out-of-order"
 )

 func chunkDir(dir string) string { return filepath.Join(dir, "chunks") }
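A sketch (not part of this diff) of consuming the new hint, using readMetaFile
as it appears later in this change:

    meta, _, err := readMetaFile(blockDir)
    if err == nil && meta.Compaction.FromOutOfOrder() {
        // This block was created from out-of-order chunks.
    }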


@@ -27,6 +27,7 @@ import (
 	"testing"

 	"github.com/go-kit/log"
+	prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
 	"github.com/stretchr/testify/require"

 	"github.com/prometheus/prometheus/model/labels"
@@ -487,7 +488,7 @@ func createBlockFromHead(tb testing.TB, dir string, head *Head) string {
 func createHead(tb testing.TB, w *wal.WAL, series []storage.Series, chunkDir string) *Head {
 	opts := DefaultHeadOptions()
 	opts.ChunkDirRoot = chunkDir
-	head, err := NewHead(nil, nil, w, opts, nil)
+	head, err := NewHead(nil, nil, w, nil, opts, nil)
 	require.NoError(tb, err)

 	app := head.Appender(context.Background())
@@ -506,6 +507,66 @@ func createHead(tb testing.TB, w *wal.WAL, series []storage.Series, chunkDir str
 	return head
 }

+func createHeadWithOOOSamples(tb testing.TB, w *wal.WAL, series []storage.Series, chunkDir string, oooSampleFrequency int) *Head {
+	opts := DefaultHeadOptions()
+	opts.ChunkDirRoot = chunkDir
+	opts.OutOfOrderTimeWindow.Store(10000000000)
+	head, err := NewHead(nil, nil, w, nil, opts, nil)
+	require.NoError(tb, err)
+
+	oooSampleLabels := make([]labels.Labels, 0, len(series))
+	oooSamples := make([]tsdbutil.SampleSlice, 0, len(series))
+
+	totalSamples := 0
+	app := head.Appender(context.Background())
+	for _, s := range series {
+		ref := storage.SeriesRef(0)
+		it := s.Iterator()
+		lset := s.Labels()
+		os := tsdbutil.SampleSlice{}
+		count := 0
+		for it.Next() {
+			totalSamples++
+			count++
+			t, v := it.At()
+			if count%oooSampleFrequency == 0 {
+				os = append(os, sample{t: t, v: v})
+				continue
+			}
+			ref, err = app.Append(ref, lset, t, v)
+			require.NoError(tb, err)
+		}
+		require.NoError(tb, it.Err())
+		if len(os) > 0 {
+			oooSampleLabels = append(oooSampleLabels, lset)
+			oooSamples = append(oooSamples, os)
+		}
+	}
+	require.NoError(tb, app.Commit())
+
+	oooSamplesAppended := 0
+	require.Equal(tb, float64(0), prom_testutil.ToFloat64(head.metrics.outOfOrderSamplesAppended))
+	app = head.Appender(context.Background())
+	for i, lset := range oooSampleLabels {
+		ref := storage.SeriesRef(0)
+		for _, sample := range oooSamples[i] {
+			ref, err = app.Append(ref, lset, sample.T(), sample.V())
+			require.NoError(tb, err)
+			oooSamplesAppended++
+		}
+	}
+	require.NoError(tb, app.Commit())
+
+	actOOOAppended := prom_testutil.ToFloat64(head.metrics.outOfOrderSamplesAppended)
+	require.GreaterOrEqual(tb, actOOOAppended, float64(oooSamplesAppended-len(series)))
+	require.LessOrEqual(tb, actOOOAppended, float64(oooSamplesAppended))
+
+	require.Equal(tb, float64(totalSamples), prom_testutil.ToFloat64(head.metrics.samplesAppended))
+
+	return head
+}
+
 const (
 	defaultLabelName  = "labelName"
 	defaultLabelValue = "labelValue"


@@ -39,7 +39,7 @@ type BlockWriter struct {
 }

 // ErrNoSeriesAppended is returned if the series count is zero while flushing blocks.
-var ErrNoSeriesAppended error = errors.New("no series appended, aborting")
+var ErrNoSeriesAppended = errors.New("no series appended, aborting")

 // NewBlockWriter create a new block writer.
 //
@@ -71,7 +71,7 @@ func (w *BlockWriter) initHead() error {
 	opts := DefaultHeadOptions()
 	opts.ChunkRange = w.blockSize
 	opts.ChunkDirRoot = w.chunkDir
-	h, err := NewHead(nil, w.logger, nil, opts, NewHeadStats())
+	h, err := NewHead(nil, w.logger, nil, nil, opts, NewHeadStats())
 	if err != nil {
 		return errors.Wrap(err, "tsdb.NewHead")
 	}


@@ -39,6 +39,21 @@ const (
 	EncXOR
 )

+// Chunk encodings for out-of-order chunks.
+// These encodings must be only used by the Head block for its internal bookkeeping.
+const (
+	OutOfOrderMask = 0b10000000
+	EncOOOXOR      = EncXOR | OutOfOrderMask
+)
+
+func IsOutOfOrderChunk(e Encoding) bool {
+	return (e & OutOfOrderMask) != 0
+}
+
+func IsValidEncoding(e Encoding) bool {
+	return e == EncXOR || e == EncOOOXOR
+}
+
 // Chunk holds a sequence of sample pairs that can be iterated over and appended to.
 type Chunk interface {
 	// Bytes returns the underlying byte slice of the chunk.
@@ -155,7 +170,7 @@ func NewPool() Pool {
 func (p *pool) Get(e Encoding, b []byte) (Chunk, error) {
 	switch e {
-	case EncXOR:
+	case EncXOR, EncOOOXOR:
 		c := p.xor.Get().(*XORChunk)
 		c.b.stream = b
 		c.b.count = 0
@@ -166,7 +181,7 @@ func (p *pool) Get(e Encoding, b []byte) (Chunk, error) {
 func (p *pool) Put(c Chunk) error {
 	switch c.Encoding() {
-	case EncXOR:
+	case EncXOR, EncOOOXOR:
 		xc, ok := c.(*XORChunk)
 		// This may happen often with wrapped chunks. Nothing we can really do about
 		// it but returning an error would cause a lot of allocations again. Thus,
@@ -188,7 +203,7 @@ func (p *pool) Put(c Chunk) error {
 // bytes.
 func FromData(e Encoding, d []byte) (Chunk, error) {
 	switch e {
-	case EncXOR:
+	case EncXOR, EncOOOXOR:
 		return &XORChunk{b: bstream{count: 0, stream: d}}, nil
 	}
 	return nil, errors.Errorf("invalid chunk encoding %q", e)


@@ -457,3 +457,12 @@ func (it *xorIterator) readValue() bool {
 	it.numRead++
 	return true
 }
+
+// OOOXORChunk holds a XORChunk and overrides the Encoding() method.
+type OOOXORChunk struct {
+	*XORChunk
+}
+
+func (c *OOOXORChunk) Encoding() Encoding {
+	return EncOOOXOR
+}
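The mask lets the OOO variant reuse the XOR payload while staying
distinguishable on replay; a small sketch (not part of this diff):

    chk := chunkenc.NewXORChunk()
    ooo := &chunkenc.OOOXORChunk{XORChunk: chk}
    // ooo.Encoding() == chunkenc.EncOOOXOR
    // chunkenc.IsOutOfOrderChunk(ooo.Encoding()) == true
    // chunkenc.IsOutOfOrderChunk(chk.Encoding()) == false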


@@ -121,6 +121,15 @@ type Meta struct {
 	// Time range the data covers.
 	// When MaxTime == math.MaxInt64 the chunk is still open and being appended to.
 	MinTime, MaxTime int64
+
+	// OOOLastRef, OOOLastMinTime and OOOLastMaxTime are kept as markers for
+	// overlapping chunks.
+	// These fields point to the last created out of order Chunk (the head) that existed
+	// when Series() was called and was overlapping.
+	// Series() and Chunk() method responses should be consistent for the same
+	// query even if new data is added in between the calls.
+	OOOLastRef                     ChunkRef
+	OOOLastMinTime, OOOLastMaxTime int64
 }

 // Iterator iterates over the chunks of a single time series.
@@ -556,8 +565,8 @@ func (s *Reader) Size() int64 {
 }

 // Chunk returns a chunk from a given reference.
-func (s *Reader) Chunk(ref ChunkRef) (chunkenc.Chunk, error) {
-	sgmIndex, chkStart := BlockChunkRef(ref).Unpack()
+func (s *Reader) Chunk(meta Meta) (chunkenc.Chunk, error) {
+	sgmIndex, chkStart := BlockChunkRef(meta.Ref).Unpack()

 	if sgmIndex >= len(s.bs) {
 		return nil, errors.Errorf("segment index %d out of range", sgmIndex)


@@ -23,6 +23,6 @@ func TestReaderWithInvalidBuffer(t *testing.T) {
 	b := realByteSlice([]byte{0x81, 0x81, 0x81, 0x81, 0x81, 0x81})
 	r := &Reader{bs: []ByteSlice{b}}

-	_, err := r.Chunk(0)
+	_, err := r.Chunk(Meta{Ref: 0})
 	require.Error(t, err)
 }


@@ -87,6 +87,18 @@ func (ref ChunkDiskMapperRef) Unpack() (seq, offset int) {
 	return seq, offset
 }

+func (ref ChunkDiskMapperRef) GreaterThanOrEqualTo(r ChunkDiskMapperRef) bool {
+	s1, o1 := ref.Unpack()
+	s2, o2 := r.Unpack()
+	return s1 > s2 || (s1 == s2 && o1 >= o2)
+}
+
+func (ref ChunkDiskMapperRef) GreaterThan(r ChunkDiskMapperRef) bool {
+	s1, o1 := ref.Unpack()
+	s2, o2 := r.Unpack()
+	return s1 > s2 || (s1 == s2 && o1 > o2)
+}
+
 // CorruptionErr is an error that's returned when corruption is encountered.
 type CorruptionErr struct {
 	Dir string
@@ -736,7 +748,7 @@ func (cdm *ChunkDiskMapper) Chunk(ref ChunkDiskMapperRef) (chunkenc.Chunk, error
 // and runs the provided function with information about each chunk. It returns on the first error encountered.
 // NOTE: This method needs to be called at least once after creating ChunkDiskMapper
 // to set the maxt of all the file.
-func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16) error) (err error) {
+func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding) error) (err error) {
 	cdm.writePathMtx.Lock()
 	defer cdm.writePathMtx.Unlock()
@@ -799,7 +811,8 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
 			break
 		}
-		idx += ChunkEncodingSize // Skip encoding.
+		chkEnc := chunkenc.Encoding(mmapFile.byteSlice.Range(idx, idx+ChunkEncodingSize)[0])
+		idx += ChunkEncodingSize
 		dataLen, n := binary.Uvarint(mmapFile.byteSlice.Range(idx, idx+MaxChunkLengthFieldSize))
 		idx += n
@@ -834,7 +847,7 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
 			mmapFile.maxt = maxt
 		}

-		if err := f(seriesRef, chunkRef, mint, maxt, numSamples); err != nil {
+		if err := f(seriesRef, chunkRef, mint, maxt, numSamples, chkEnc); err != nil {
 			if cerr, ok := err.(*CorruptionErr); ok {
 				cerr.Dir = cdm.dir.Name()
 				cerr.FileIndex = segID
@@ -857,12 +870,8 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
 	return nil
 }

-// Truncate deletes the head chunk files which are strictly below the mint.
-// mint should be in milliseconds.
-func (cdm *ChunkDiskMapper) Truncate(mint int64) error {
-	if !cdm.fileMaxtSet {
-		return errors.New("maxt of the files are not set")
-	}
+// Truncate deletes the head chunk files whose file number is less than given fileNo.
+func (cdm *ChunkDiskMapper) Truncate(fileNo uint32) error {
 	cdm.readPathMtx.RLock()

 	// Sort the file indices, else if files deletion fails in between,
@@ -875,12 +884,10 @@ func (cdm *ChunkDiskMapper) Truncate(mint int64) error {
 	var removedFiles []int
 	for _, seq := range chkFileIndices {
-		if seq == cdm.curFileSequence || cdm.mmappedChunkFiles[seq].maxt >= mint {
+		if seq == cdm.curFileSequence || uint32(seq) >= fileNo {
 			break
 		}
-		if cdm.mmappedChunkFiles[seq].maxt < mint {
-			removedFiles = append(removedFiles, seq)
-		}
+		removedFiles = append(removedFiles, seq)
 	}
 	cdm.readPathMtx.RUnlock()
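The two helpers order refs by file sequence first, then offset; a sketch (not
part of this diff), with newChunkDiskMapperRef assumed to be the package's
internal (seq, offset) constructor:

    a := newChunkDiskMapperRef(3, 100)
    b := newChunkDiskMapperRef(3, 200)
    c := newChunkDiskMapperRef(4, 0)
    // b.GreaterThan(a)          == true: same file, larger offset.
    // c.GreaterThanOrEqualTo(b) == true: a later file wins regardless of offset.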


@@ -58,6 +58,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) {
 		mint, maxt int64
 		numSamples uint16
 		chunk      chunkenc.Chunk
+		isOOO      bool
 	}
 	expectedData := []expectedDataType{}
@@ -67,7 +68,7 @@
 	for hrw.curFileSequence < 3 || hrw.chkWriter.Buffered() == 0 {
 		addChunks := func(numChunks int) {
 			for i := 0; i < numChunks; i++ {
-				seriesRef, chkRef, mint, maxt, chunk := createChunk(t, totalChunks, hrw)
+				seriesRef, chkRef, mint, maxt, chunk, isOOO := createChunk(t, totalChunks, hrw)
 				totalChunks++
 				expectedData = append(expectedData, expectedDataType{
 					seriesRef: seriesRef,
@@ -76,6 +77,7 @@
 					chunkRef:   chkRef,
 					chunk:      chunk,
 					numSamples: uint16(chunk.NumSamples()),
+					isOOO:      isOOO,
 				})

 				if hrw.curFileSequence != 1 {
@@ -147,7 +149,7 @@
 	hrw = createChunkDiskMapper(t, dir)

 	idx := 0
-	require.NoError(t, hrw.IterateAllChunks(func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16) error {
+	require.NoError(t, hrw.IterateAllChunks(func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding) error {
 		t.Helper()

 		expData := expectedData[idx]
@@ -156,6 +158,7 @@
 		require.Equal(t, expData.maxt, maxt)
 		require.Equal(t, expData.maxt, maxt)
 		require.Equal(t, expData.numSamples, numSamples)
+		require.Equal(t, expData.isOOO, chunkenc.IsOutOfOrderChunk(encoding))

 		actChunk, err := hrw.Chunk(expData.chunkRef)
 		require.NoError(t, err)
@@ -178,9 +181,7 @@ func TestChunkDiskMapper_Truncate(t *testing.T) {
 	}()

 	timeRange := 0
-	fileTimeStep := 100
-	var thirdFileMinT, sixthFileMinT int64
-	addChunk := func() int {
+	addChunk := func() {
 		t.Helper()

 		step := 100
@@ -194,8 +195,6 @@
 		<-awaitCb
 		require.NoError(t, err)
 		timeRange += step
-		return mint
 	}

 	verifyFiles := func(remainingFiles []int) {
@@ -216,17 +215,12 @@
 	// Create segments 1 to 7.
 	for i := 1; i <= 7; i++ {
 		hrw.CutNewFile()
-		mint := int64(addChunk())
-		if i == 3 {
-			thirdFileMinT = mint
-		} else if i == 6 {
-			sixthFileMinT = mint
-		}
+		addChunk()
 	}
 	verifyFiles([]int{1, 2, 3, 4, 5, 6, 7})

 	// Truncating files.
-	require.NoError(t, hrw.Truncate(thirdFileMinT))
+	require.NoError(t, hrw.Truncate(3))

 	// Add a chunk to trigger cutting of new file.
 	addChunk()
@@ -245,11 +239,11 @@
 	verifyFiles([]int{3, 4, 5, 6, 7, 8, 9})

 	// Truncating files after restart.
-	require.NoError(t, hrw.Truncate(sixthFileMinT))
+	require.NoError(t, hrw.Truncate(6))
 	verifyFiles([]int{6, 7, 8, 9})

 	// Truncating a second time without adding a chunk shouldn't create a new file.
-	require.NoError(t, hrw.Truncate(sixthFileMinT+1))
+	require.NoError(t, hrw.Truncate(6))
 	verifyFiles([]int{6, 7, 8, 9})

 	// Add a chunk to trigger cutting of new file.
@@ -257,8 +251,12 @@
 	verifyFiles([]int{6, 7, 8, 9, 10})

+	// Truncation by file number.
+	require.NoError(t, hrw.Truncate(8))
+	verifyFiles([]int{8, 9, 10})
+
 	// Truncating till current time should not delete the current active file.
-	require.NoError(t, hrw.Truncate(int64(timeRange+(2*fileTimeStep))))
+	require.NoError(t, hrw.Truncate(10))

 	// Add a chunk to trigger cutting of new file.
 	addChunk()
@@ -335,8 +333,7 @@ func TestChunkDiskMapper_Truncate_PreservesFileSequence(t *testing.T) {
 	// Truncating files till 2. It should not delete anything after 3 (inclusive)
 	// though files 4 and 6 are empty.
-	file2Maxt := hrw.mmappedChunkFiles[2].maxt
-	require.NoError(t, hrw.Truncate(file2Maxt+1))
+	require.NoError(t, hrw.Truncate(3))
 	verifyFiles([]int{3, 4, 5, 6})

 	// Add chunk, so file 6 is not empty anymore.
@@ -344,8 +341,7 @@
 	verifyFiles([]int{3, 4, 5, 6})

 	// Truncating till file 3 should also delete file 4, because it is empty.
-	file3Maxt := hrw.mmappedChunkFiles[3].maxt
-	require.NoError(t, hrw.Truncate(file3Maxt+1))
+	require.NoError(t, hrw.Truncate(5))
 	addChunk()
 	verifyFiles([]int{5, 6, 7})
@@ -381,7 +377,7 @@ func TestHeadReadWriter_TruncateAfterFailedIterateChunks(t *testing.T) {
 	hrw = createChunkDiskMapper(t, dir)

 	// Forcefully failing IterateAllChunks.
-	require.Error(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16) error {
+	require.Error(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16, _ chunkenc.Encoding) error {
 		return errors.New("random error")
 	}))
@@ -471,7 +467,9 @@ func createChunkDiskMapper(t *testing.T, dir string) *ChunkDiskMapper {
 	hrw, err := NewChunkDiskMapper(nil, dir, chunkenc.NewPool(), DefaultWriteBufferSize, writeQueueSize)
 	require.NoError(t, err)
 	require.False(t, hrw.fileMaxtSet)
-	require.NoError(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16) error { return nil }))
+	require.NoError(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16, _ chunkenc.Encoding) error {
+		return nil
+	}))
 	require.True(t, hrw.fileMaxtSet)

 	return hrw
@@ -488,13 +486,17 @@ func randomChunk(t *testing.T) chunkenc.Chunk {
 	return chunk
 }

-func createChunk(t *testing.T, idx int, hrw *ChunkDiskMapper) (seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, chunk chunkenc.Chunk) {
+func createChunk(t *testing.T, idx int, hrw *ChunkDiskMapper) (seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, chunk chunkenc.Chunk, isOOO bool) {
 	var err error
 	seriesRef = HeadSeriesRef(rand.Int63())
 	mint = int64((idx)*1000 + 1)
 	maxt = int64((idx + 1) * 1000)
 	chunk = randomChunk(t)
 	awaitCb := make(chan struct{})
+	if rand.Intn(2) == 0 {
+		isOOO = true
+		chunk = &chunkenc.OOOXORChunk{XORChunk: chunk.(*chunkenc.XORChunk)}
+	}
 	chunkRef = hrw.WriteChunk(seriesRef, mint, maxt, chunk, func(cbErr error) {
 		require.NoError(t, err)
 		close(awaitCb)


@@ -1080,7 +1080,7 @@ func BenchmarkCompactionFromHead(b *testing.B) {
 	opts := DefaultHeadOptions()
 	opts.ChunkRange = 1000
 	opts.ChunkDirRoot = chunkDir
-	h, err := NewHead(nil, nil, nil, opts, nil)
+	h, err := NewHead(nil, nil, nil, nil, opts, nil)
 	require.NoError(b, err)
 	for ln := 0; ln < labelNames; ln++ {
 		app := h.Appender(context.Background())


@@ -33,6 +33,7 @@ import (
 	"github.com/oklog/ulid"
 	"github.com/pkg/errors"
 	"github.com/prometheus/client_golang/prometheus"
+	"go.uber.org/atomic"
 	"golang.org/x/sync/errgroup"

 	"github.com/prometheus/prometheus/config"
@@ -69,18 +70,19 @@ var ErrNotReady = errors.New("TSDB not ready")
 // millisecond precision timestamps.
 func DefaultOptions() *Options {
 	return &Options{
-		WALSegmentSize:            wal.DefaultSegmentSize,
-		MaxBlockChunkSegmentSize:  chunks.DefaultChunkSegmentSize,
-		RetentionDuration:         int64(15 * 24 * time.Hour / time.Millisecond),
-		MinBlockDuration:          DefaultBlockDuration,
-		MaxBlockDuration:          DefaultBlockDuration,
-		NoLockfile:                false,
-		AllowOverlappingBlocks:    false,
-		WALCompression:            false,
-		StripeSize:                DefaultStripeSize,
-		HeadChunksWriteBufferSize: chunks.DefaultWriteBufferSize,
-		IsolationDisabled:         defaultIsolationDisabled,
-		HeadChunksWriteQueueSize:  chunks.DefaultWriteQueueSize,
+		WALSegmentSize:             wal.DefaultSegmentSize,
+		MaxBlockChunkSegmentSize:   chunks.DefaultChunkSegmentSize,
+		RetentionDuration:          int64(15 * 24 * time.Hour / time.Millisecond),
+		MinBlockDuration:           DefaultBlockDuration,
+		MaxBlockDuration:           DefaultBlockDuration,
+		NoLockfile:                 false,
+		AllowOverlappingCompaction: false,
+		AllowOverlappingQueries:    false,
+		WALCompression:             false,
+		StripeSize:                 DefaultStripeSize,
+		HeadChunksWriteBufferSize:  chunks.DefaultWriteBufferSize,
+		IsolationDisabled:          defaultIsolationDisabled,
+		OutOfOrderCapMax:           DefaultOutOfOrderCapMax,
 	}
 }
@@ -112,9 +114,19 @@ type Options struct {
 	// NoLockfile disables creation and consideration of a lock file.
 	NoLockfile bool

-	// Overlapping blocks are allowed if AllowOverlappingBlocks is true.
-	// This in-turn enables vertical compaction and vertical query merge.
-	AllowOverlappingBlocks bool
+	// Querying on overlapping blocks are allowed if AllowOverlappingQueries is true.
+	// Since querying is a required operation for TSDB, if there are going to be
+	// overlapping blocks, then this should be set to true.
+	// NOTE: Do not use this directly in DB. Use it via DB.AllowOverlappingQueries().
+	AllowOverlappingQueries bool
+
+	// Compaction of overlapping blocks are allowed if AllowOverlappingCompaction is true.
+	// This is an optional flag for overlapping blocks.
+	// The reason why this flag exists is because there are various users of the TSDB
+	// that do not want vertical compaction happening on ingest time. Instead,
+	// they'd rather keep overlapping blocks and let another component do the overlapping compaction later.
+	// For Prometheus, this will always be enabled if overlapping queries is enabled.
+	AllowOverlappingCompaction bool

 	// WALCompression will turn on Snappy compression for records on the WAL.
 	WALCompression bool
@@ -160,6 +172,15 @@ type Options struct {
 	// Disables isolation between reads and in-flight appends.
 	IsolationDisabled bool
+
+	// OutOfOrderTimeWindow specifies how much out of order is allowed, if any.
+	// This can change during run-time, so this value from here should only be used
+	// while initialising.
+	OutOfOrderTimeWindow int64
+
+	// OutOfOrderCapMax is maximum capacity for OOO chunks (in samples).
+	// If it is <=0, the default value is assumed.
+	OutOfOrderCapMax int64
 }

 type BlocksToDeleteFunc func(blocks []*Block) map[ulid.ULID]struct{}
@@ -197,6 +218,13 @@ type DB struct {
 	// Cancel a running compaction when a shutdown is initiated.
 	compactCancel context.CancelFunc

+	// oooWasEnabled is true if out of order support was enabled at least one time
+	// during the time TSDB was up. In which case we need to keep supporting
+	// out-of-order compaction and vertical queries.
+	oooWasEnabled atomic.Bool
+
+	registerer prometheus.Registerer
 }

 type dbMetrics struct {
@@ -372,9 +400,17 @@ func (db *DBReadOnly) FlushWAL(dir string) (returnErr error) {
 	if err != nil {
 		return err
 	}
+	var wbl *wal.WAL
+	wblDir := filepath.Join(db.dir, wal.WblDirName)
+	if _, err := os.Stat(wblDir); !os.IsNotExist(err) {
+		wbl, err = wal.Open(db.logger, wblDir)
+		if err != nil {
+			return err
+		}
+	}
 	opts := DefaultHeadOptions()
 	opts.ChunkDirRoot = db.dir
-	head, err := NewHead(nil, db.logger, w, opts, NewHeadStats())
+	head, err := NewHead(nil, db.logger, w, wbl, opts, NewHeadStats())
 	if err != nil {
 		return err
 	}
@@ -430,7 +466,7 @@ func (db *DBReadOnly) loadDataAsQueryable(maxt int64) (storage.SampleAndChunkQue
 	opts := DefaultHeadOptions()
 	opts.ChunkDirRoot = db.dir
-	head, err := NewHead(nil, db.logger, nil, opts, NewHeadStats())
+	head, err := NewHead(nil, db.logger, nil, nil, opts, NewHeadStats())
 	if err != nil {
 		return nil, err
 	}
@@ -448,9 +484,17 @@ func (db *DBReadOnly) loadDataAsQueryable(maxt int64) (storage.SampleAndChunkQue
 		if err != nil {
 			return nil, err
 		}
+		var wbl *wal.WAL
+		wblDir := filepath.Join(db.dir, wal.WblDirName)
+		if _, err := os.Stat(wblDir); !os.IsNotExist(err) {
+			wbl, err = wal.Open(db.logger, wblDir)
+			if err != nil {
+				return nil, err
+			}
+		}
 		opts := DefaultHeadOptions()
 		opts.ChunkDirRoot = db.dir
-		head, err = NewHead(nil, db.logger, w, opts, NewHeadStats())
+		head, err = NewHead(nil, db.logger, w, wbl, opts, NewHeadStats())
 		if err != nil {
 			return nil, err
 		}
@@ -598,6 +642,15 @@ func validateOpts(opts *Options, rngs []int64) (*Options, []int64) {
 	if opts.MinBlockDuration > opts.MaxBlockDuration {
 		opts.MaxBlockDuration = opts.MinBlockDuration
 	}
+	if opts.OutOfOrderTimeWindow > 0 {
+		opts.AllowOverlappingQueries = true
+	}
+	if opts.OutOfOrderCapMax <= 0 {
+		opts.OutOfOrderCapMax = DefaultOutOfOrderCapMax
+	}
+	if opts.OutOfOrderTimeWindow < 0 {
+		opts.OutOfOrderTimeWindow = 0
+	}

 	if len(rngs) == 0 {
 		// Start with smallest block duration and create exponential buckets until the exceed the
@@ -634,6 +687,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 	}

 	walDir := filepath.Join(dir, "wal")
+	wblDir := filepath.Join(dir, wal.WblDirName)

 	// Migrate old WAL if one exists.
 	if err := MigrateWAL(l, walDir); err != nil {
@@ -656,6 +710,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 		autoCompact:    true,
 		chunkPool:      chunkenc.NewPool(),
 		blocksToDelete: opts.BlocksToDelete,
+		registerer:     r,
 	}
 	defer func() {
 		// Close files if startup fails somewhere.
@@ -694,7 +749,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 	}
 	db.compactCancel = cancel

-	var wlog *wal.WAL
+	var wlog, wblog *wal.WAL
 	segmentSize := wal.DefaultSegmentSize
 	// Wal is enabled.
 	if opts.WALSegmentSize >= 0 {
@@ -706,8 +761,19 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 		if err != nil {
 			return nil, err
 		}
+		// Check if there is a WBL on disk, in which case we should replay that data.
+		wblSize, err := fileutil.DirSize(wblDir)
+		if err != nil && !os.IsNotExist(err) {
+			return nil, err
+		}
+		if opts.OutOfOrderTimeWindow > 0 || wblSize > 0 {
+			wblog, err = wal.NewSize(l, r, wblDir, segmentSize, opts.WALCompression)
+			if err != nil {
+				return nil, err
+			}
+		}
 	}
+	db.oooWasEnabled.Store(opts.OutOfOrderTimeWindow > 0)
 	headOpts := DefaultHeadOptions()
 	headOpts.ChunkRange = rngs[0]
 	headOpts.ChunkDirRoot = dir
@@ -719,11 +785,13 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 	headOpts.EnableExemplarStorage = opts.EnableExemplarStorage
 	headOpts.MaxExemplars.Store(opts.MaxExemplars)
 	headOpts.EnableMemorySnapshotOnShutdown = opts.EnableMemorySnapshotOnShutdown
+	headOpts.OutOfOrderTimeWindow.Store(opts.OutOfOrderTimeWindow)
+	headOpts.OutOfOrderCapMax.Store(opts.OutOfOrderCapMax)
 	if opts.IsolationDisabled {
 		// We only override this flag if isolation is disabled at DB level. We use the default otherwise.
 		headOpts.IsolationDisabled = opts.IsolationDisabled
 	}
-	db.head, err = NewHead(r, l, wlog, headOpts, stats.Head)
+	db.head, err = NewHead(r, l, wlog, wblog, headOpts, stats.Head)
 	if err != nil {
 		return nil, err
 	}
@@ -741,20 +809,36 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 	}

 	// Set the min valid time for the ingested samples
 	// to be no lower than the maxt of the last block.
-	blocks := db.Blocks()
 	minValidTime := int64(math.MinInt64)
-	if len(blocks) > 0 {
-		minValidTime = blocks[len(blocks)-1].Meta().MaxTime
+	// We do not consider blocks created from out-of-order samples for Head's minValidTime
+	// since minValidTime is only for the in-order data and we do not want to discard unnecessary
+	// samples from the Head.
+	inOrderMaxTime, ok := db.inOrderBlocksMaxTime()
+	if ok {
+		minValidTime = inOrderMaxTime
 	}

 	if initErr := db.head.Init(minValidTime); initErr != nil {
 		db.head.metrics.walCorruptionsTotal.Inc()
-		level.Warn(db.logger).Log("msg", "Encountered WAL read error, attempting repair", "err", initErr)
-		if err := wlog.Repair(initErr); err != nil {
-			return nil, errors.Wrap(err, "repair corrupted WAL")
+		isOOOErr := isErrLoadOOOWal(initErr)
+		if isOOOErr {
+			level.Warn(db.logger).Log("msg", "Encountered OOO WAL read error, attempting repair", "err", initErr)
+			if err := wblog.Repair(initErr); err != nil {
+				return nil, errors.Wrap(err, "repair corrupted OOO WAL")
+			}
+		} else {
+			level.Warn(db.logger).Log("msg", "Encountered WAL read error, attempting repair", "err", initErr)
+			if err := wlog.Repair(initErr); err != nil {
+				return nil, errors.Wrap(err, "repair corrupted WAL")
+			}
 		}
 	}

+	if db.head.MinOOOTime() != int64(math.MaxInt64) {
+		// Some OOO data was replayed from the disk that needs compaction and cleanup.
+		db.oooWasEnabled.Store(true)
+	}
+
 	go db.run()

 	return db, nil
@@ -846,8 +930,58 @@ func (db *DB) Appender(ctx context.Context) storage.Appender {
 	return dbAppender{db: db, Appender: db.head.Appender(ctx)}
 }

+// ApplyConfig applies a new config to the DB.
+// Behaviour of 'OutOfOrderTimeWindow' is as follows:
+// OOO enabled = oooTimeWindow > 0. OOO disabled = oooTimeWindow is 0.
+// 1) Before: OOO disabled, Now: OOO enabled =>
+//    - A new WBL is created for the head block.
+//    - OOO compaction is enabled.
+//    - Overlapping queries are enabled.
+//
+// 2) Before: OOO enabled, Now: OOO enabled =>
+//    - Only the time window is updated.
+//
+// 3) Before: OOO enabled, Now: OOO disabled =>
+//    - Time Window set to 0. So no new OOO samples will be allowed.
+//    - OOO WBL will stay and will be eventually cleaned up.
+//    - OOO Compaction and overlapping queries will remain enabled until a restart or until all OOO samples are compacted.
+//
+// 4) Before: OOO disabled, Now: OOO disabled => no-op.
 func (db *DB) ApplyConfig(conf *config.Config) error {
-	return db.head.ApplyConfig(conf)
+	oooTimeWindow := int64(0)
+	if conf.StorageConfig.TSDBConfig != nil {
+		oooTimeWindow = conf.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
+	}
+	if oooTimeWindow < 0 {
+		oooTimeWindow = 0
+	}
+
+	// Create WBL if it was not present and if OOO is enabled with WAL enabled.
+	var wblog *wal.WAL
+	var err error
+	if db.head.wbl != nil {
+		// The existing WBL from the disk might have been replayed while OOO was disabled.
+		wblog = db.head.wbl
+	} else if !db.oooWasEnabled.Load() && oooTimeWindow > 0 && db.opts.WALSegmentSize >= 0 {
+		segmentSize := wal.DefaultSegmentSize
+		// Wal is set to a custom size.
+		if db.opts.WALSegmentSize > 0 {
+			segmentSize = db.opts.WALSegmentSize
+		}
+		oooWalDir := filepath.Join(db.dir, wal.WblDirName)
+		wblog, err = wal.NewSize(db.logger, db.registerer, oooWalDir, segmentSize, db.opts.WALCompression)
+		if err != nil {
+			return err
+		}
+	}
+
+	db.opts.OutOfOrderTimeWindow = oooTimeWindow
+	db.head.ApplyConfig(conf, wblog)
+
+	if !db.oooWasEnabled.Load() {
+		db.oooWasEnabled.Store(oooTimeWindow > 0)
+	}
+	return nil
 }

 // dbAppender wraps the DB's head appender and triggers compactions on commit
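A sketch (not part of this diff) of driving the toggle above from code rather
than through a YAML reload, using the config types added in this change:

    conf := &config.Config{
        StorageConfig: config.StorageConfig{
            TSDBConfig: &config.TSDBConfig{
                OutOfOrderTimeWindow: time.Hour.Milliseconds(), // 1h window, in ms
            },
        },
    }
    if err := db.ApplyConfig(conf); err != nil {
        return err
    }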
@@ -946,6 +1080,14 @@ func (db *DB) Compact() (returnErr error) {
 			"block_range", db.head.chunkRange.Load(),
 		)
 	}

+	if lastBlockMaxt != math.MinInt64 {
+		// The head was compacted, so we compact OOO head as well.
+		if err := db.compactOOOHead(); err != nil {
+			return errors.Wrap(err, "compact ooo head")
+		}
+	}
+
 	return db.compactBlocks()
 }
@@ -964,6 +1106,102 @@ func (db *DB) CompactHead(head *RangeHead) error {
return nil
}
// CompactOOOHead compacts the OOO Head.
func (db *DB) CompactOOOHead() error {
db.cmtx.Lock()
defer db.cmtx.Unlock()
return db.compactOOOHead()
}
func (db *DB) compactOOOHead() error {
if !db.oooWasEnabled.Load() {
return nil
}
oooHead, err := NewOOOCompactionHead(db.head)
if err != nil {
return errors.Wrap(err, "get ooo compaction head")
}
ulids, err := db.compactOOO(db.dir, oooHead)
if err != nil {
return errors.Wrap(err, "compact ooo head")
}
if err := db.reloadBlocks(); err != nil {
errs := tsdb_errors.NewMulti(err)
for _, uid := range ulids {
if errRemoveAll := os.RemoveAll(filepath.Join(db.dir, uid.String())); errRemoveAll != nil {
errs.Add(errRemoveAll)
}
}
return errors.Wrap(errs.Err(), "reloadBlocks blocks after failed compact ooo head")
}
lastWBLFile, minOOOMmapRef := oooHead.LastWBLFile(), oooHead.LastMmapRef()
if lastWBLFile != 0 || minOOOMmapRef != 0 {
if err := db.head.truncateOOO(lastWBLFile, minOOOMmapRef); err != nil {
return errors.Wrap(err, "truncate ooo wbl")
}
}
return nil
}
// compactOOO creates a new block per possible block range in the compactor's directory from the OOO Head given.
// Each ULID in the result corresponds to a block in a unique time range.
func (db *DB) compactOOO(dest string, oooHead *OOOCompactionHead) (_ []ulid.ULID, err error) {
start := time.Now()
blockSize := oooHead.ChunkRange()
oooHeadMint, oooHeadMaxt := oooHead.MinTime(), oooHead.MaxTime()
ulids := make([]ulid.ULID, 0)
defer func() {
if err != nil {
// Best effort removal of created block on any error.
for _, uid := range ulids {
_ = os.RemoveAll(filepath.Join(db.dir, uid.String()))
}
}
}()
for t := blockSize * (oooHeadMint / blockSize); t <= oooHeadMaxt; t = t + blockSize {
mint, maxt := t, t+blockSize
// Block intervals are half-open: [b.MinTime, b.MaxTime), so a block's MaxTime is always one past the last sample it can include.
uid, err := db.compactor.Write(dest, oooHead.CloneForTimeRange(mint, maxt-1), mint, maxt, nil)
if err != nil {
return nil, err
}
if uid.Compare(ulid.ULID{}) != 0 {
ulids = append(ulids, uid)
blockDir := filepath.Join(dest, uid.String())
meta, _, err := readMetaFile(blockDir)
if err != nil {
return ulids, errors.Wrap(err, "read meta")
}
meta.Compaction.SetOutOfOrder()
_, err = writeMetaFile(db.logger, blockDir, meta)
if err != nil {
return ulids, errors.Wrap(err, "write meta")
}
}
}
if len(ulids) == 0 {
level.Info(db.logger).Log(
"msg", "compact ooo head resulted in no blocks",
"duration", time.Since(start),
)
return nil, nil
}
level.Info(db.logger).Log(
"msg", "out-of-order compaction completed",
"duration", time.Since(start),
"ulids", fmt.Sprintf("%v", ulids),
)
return ulids, nil
}
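To make the loop bounds concrete, a standalone sketch of the same alignment arithmetic; blockRanges is a hypothetical helper, not part of this patch.

	package main

	import "fmt"

	// blockRanges mirrors compactOOO's loop: align the first block to a
	// blockSize boundary at or below mint, then step by blockSize.
	func blockRanges(mint, maxt, blockSize int64) [][2]int64 {
		var out [][2]int64
		for t := blockSize * (mint / blockSize); t <= maxt; t += blockSize {
			out = append(out, [2]int64{t, t + blockSize})
		}
		return out
	}

	func main() {
		// With 2h blocks (7_200_000 ms), OOO samples spanning 01:30..03:10
		// produce two blocks: [00:00,02:00) and [02:00,04:00).
		fmt.Println(blockRanges(5_400_000, 11_400_000, 7_200_000))
	}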
// compactHead compacts the given RangeHead.
// The compaction mutex should be held before calling this method.
func (db *DB) compactHead(head *RangeHead) error {

@@ -1038,10 +1276,11 @@ func (db *DB) reload() error {
if err := db.reloadBlocks(); err != nil {
return errors.Wrap(err, "reloadBlocks")
}
maxt, ok := db.inOrderBlocksMaxTime()
if !ok {
return nil
}
if err := db.head.Truncate(maxt); err != nil {
return errors.Wrap(err, "head truncate")
}
return nil
@@ -1121,7 +1360,7 @@ func (db *DB) reloadBlocks() (err error) {
sort.Slice(toLoad, func(i, j int) bool {
return toLoad[i].Meta().MinTime < toLoad[j].Meta().MinTime
})
if !db.AllowOverlappingQueries() {
if err := validateBlockSequence(toLoad); err != nil {
return errors.Wrap(err, "invalid block sequence")
}

@@ -1151,6 +1390,10 @@ func (db *DB) reloadBlocks() (err error) {
return nil
}
func (db *DB) AllowOverlappingQueries() bool {
return db.opts.AllowOverlappingQueries || db.oooWasEnabled.Load()
}
func openBlocks(l log.Logger, dir string, loaded []*Block, chunkPool chunkenc.Pool) (blocks []*Block, corrupted map[ulid.ULID]error, err error) {
bDirs, err := blockDirs(dir)
if err != nil {

@@ -1428,6 +1671,21 @@ func (db *DB) Blocks() []*Block {
return db.blocks
}
// inOrderBlocksMaxTime returns the max time among the blocks that were not totally created
// out of out-of-order data. If the returned boolean is true, it means there is at least
// one such block.
func (db *DB) inOrderBlocksMaxTime() (maxt int64, ok bool) {
maxt, ok = int64(math.MinInt64), false
// If blocks are overlapping, last block might not have the max time. So check all blocks.
for _, b := range db.Blocks() {
if !b.meta.Compaction.FromOutOfOrder() && b.meta.MaxTime > maxt {
ok = true
maxt = b.meta.MaxTime
}
}
return maxt, ok
}
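The exclusion matters because a block built purely from OOO data can have a MaxTime far ahead of what the in-order head has durably persisted; truncating the head to it would drop in-order samples that were never compacted. A toy model (blockMeta is a hypothetical stand-in for the real block meta):

	package main

	import (
		"fmt"
		"math"
	)

	type blockMeta struct {
		maxTime int64
		fromOOO bool // what meta.Compaction.SetOutOfOrder() records in the real meta
	}

	// inOrderMaxTime mirrors inOrderBlocksMaxTime: head truncation must follow
	// only blocks that contain in-order data.
	func inOrderMaxTime(blocks []blockMeta) (int64, bool) {
		maxt, ok := int64(math.MinInt64), false
		for _, b := range blocks {
			if !b.fromOOO && b.maxTime > maxt {
				maxt, ok = b.maxTime, true
			}
		}
		return maxt, ok
	}

	func main() {
		fmt.Println(inOrderMaxTime([]blockMeta{{100, false}, {500, true}, {200, false}}))
		// 200 true: the OOO block's MaxTime of 500 is ignored.
	}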
// Head returns the database's head.
func (db *DB) Head() *Head {
return db.head

@@ -1526,13 +1784,13 @@ func (db *DB) Querier(_ context.Context, mint, maxt int64) (storage.Querier, err
blocks = append(blocks, b)
}
}
var inOrderHeadQuerier storage.Querier
if maxt >= db.head.MinTime() {
rh := NewRangeHead(db.head, mint, maxt)
var err error
inOrderHeadQuerier, err = NewBlockQuerier(rh, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open block querier for head %s", rh)
}

// Getting the querier above registers itself in the queue that the truncation waits on.
@@ -1540,20 +1798,30 @@ func (db *DB) Querier(_ context.Context, mint, maxt int64) (storage.Querier, err
// won't run into a race later since any truncation that comes after will wait on this querier if it overlaps.
shouldClose, getNew, newMint := db.head.IsQuerierCollidingWithTruncation(mint, maxt)
if shouldClose {
if err := inOrderHeadQuerier.Close(); err != nil {
return nil, errors.Wrapf(err, "closing head block querier %s", rh)
}
inOrderHeadQuerier = nil
}
if getNew {
rh := NewRangeHead(db.head, newMint, maxt)
inOrderHeadQuerier, err = NewBlockQuerier(rh, newMint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open block querier for head while getting new querier %s", rh)
}
}
}
var outOfOrderHeadQuerier storage.Querier
if overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime()) {
rh := NewOOORangeHead(db.head, mint, maxt)
var err error
outOfOrderHeadQuerier, err = NewBlockQuerier(rh, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open block querier for ooo head %s", rh)
}
}
blockQueriers := make([]storage.Querier, 0, len(blocks))
for _, b := range blocks {
q, err := NewBlockQuerier(b, mint, maxt)

@@ -1568,14 +1836,18 @@ func (db *DB) Querier(_ context.Context, mint, maxt int64) (storage.Querier, err
}
return nil, errors.Wrapf(err, "open querier for block %s", b)
}
if inOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, inOrderHeadQuerier)
}
if outOfOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, outOfOrderHeadQuerier)
} }
return storage.NewMergeQuerier(blockQueriers, nil, storage.ChainedSeriesMerge), nil
}
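From the caller's perspective the split between heads is invisible; a hedged usage sketch, assuming the storage.Querier API of this tree:

	package main

	import (
		"context"
		"fmt"
		"log"

		"github.com/prometheus/prometheus/model/labels"
		"github.com/prometheus/prometheus/tsdb"
	)

	func main() {
		db, err := tsdb.Open("data", nil, nil, tsdb.DefaultOptions(), nil)
		if err != nil {
			log.Fatal(err)
		}
		defer db.Close()

		// A single querier covers persistent blocks, the in-order head and,
		// once OOO data exists, the OOO head; ChainedSeriesMerge deduplicates
		// samples that appear in more than one of them.
		q, err := db.Querier(context.Background(), 0, 10_000)
		if err != nil {
			log.Fatal(err)
		}
		defer q.Close()

		ss := q.Select(false, nil, labels.MustNewMatcher(labels.MatchEqual, "job", "demo"))
		for ss.Next() {
			fmt.Println(ss.At().Labels())
		}
	}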
// blockChunkQuerierForRange returns individual block chunk queriers from the persistent blocks, in-order head block, and the
// out-of-order head block, overlapping with the given time range.
func (db *DB) blockChunkQuerierForRange(mint, maxt int64) ([]storage.ChunkQuerier, error) {
var blocks []BlockReader

db.mtx.RLock()

@@ -1586,11 +1858,11 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu
blocks = append(blocks, b)
}
}
var inOrderHeadQuerier storage.ChunkQuerier
if maxt >= db.head.MinTime() {
rh := NewRangeHead(db.head, mint, maxt)
var err error
inOrderHeadQuerier, err = NewBlockChunkQuerier(rh, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open querier for head %s", rh)
}
@@ -1600,20 +1872,30 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu
// won't run into a race later since any truncation that comes after will wait on this querier if it overlaps.
shouldClose, getNew, newMint := db.head.IsQuerierCollidingWithTruncation(mint, maxt)
if shouldClose {
if err := inOrderHeadQuerier.Close(); err != nil {
return nil, errors.Wrapf(err, "closing head querier %s", rh)
}
inOrderHeadQuerier = nil
}
if getNew {
rh := NewRangeHead(db.head, newMint, maxt)
inOrderHeadQuerier, err = NewBlockChunkQuerier(rh, newMint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open querier for head while getting new querier %s", rh)
}
}
}
var outOfOrderHeadQuerier storage.ChunkQuerier
if overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime()) {
rh := NewOOORangeHead(db.head, mint, maxt)
var err error
outOfOrderHeadQuerier, err = NewBlockChunkQuerier(rh, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open block chunk querier for ooo head %s", rh)
}
}
blockQueriers := make([]storage.ChunkQuerier, 0, len(blocks))
for _, b := range blocks {
q, err := NewBlockChunkQuerier(b, mint, maxt)

@@ -1628,10 +1910,22 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu
}
return nil, errors.Wrapf(err, "open querier for block %s", b)
}
if inOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, inOrderHeadQuerier)
}
if outOfOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, outOfOrderHeadQuerier)
} }
return blockQueriers, nil
}
// ChunkQuerier returns a new chunk querier over the data partition for the given time range.
func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
blockQueriers, err := db.blockChunkQuerierForRange(mint, maxt)
if err != nil {
return nil, err
}
return storage.NewMergeChunkQuerier(blockQueriers, nil, storage.NewCompactingChunkSeriesMerger(storage.ChainedSeriesMerge)), nil
}

File diff suppressed because it is too large
@@ -25,9 +25,10 @@ import (
"github.com/go-kit/log/level"
"github.com/oklog/ulid"
"github.com/pkg/errors"
"go.uber.org/atomic"

"github.com/prometheus/client_golang/prometheus"

"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/model/exemplar"
"github.com/prometheus/prometheus/model/labels"
@@ -62,15 +63,19 @@ var (
type Head struct {
chunkRange atomic.Int64
numSeries atomic.Uint64
minOOOTime, maxOOOTime atomic.Int64 // TODO(jesusvazquez) These should be updated after garbage collection.
minTime, maxTime atomic.Int64 // Current min and max of the samples included in the head. TODO(jesusvazquez) Ensure these are properly tracked.
minValidTime atomic.Int64 // Mint allowed to be added to the head. It shouldn't be lower than the maxt of the last persisted block.
lastWALTruncationTime atomic.Int64
lastMemoryTruncationTime atomic.Int64
lastSeriesID atomic.Uint64
// All the ooo m-map chunks should be after this. This is used to truncate old ooo m-map chunks.
// This should be typecasted to chunks.ChunkDiskMapperRef after loading.
minOOOMmapRef atomic.Uint64
metrics *headMetrics
opts *HeadOptions
wal, wbl *wal.WAL
exemplarMetrics *ExemplarMetrics
exemplars ExemplarStorage
logger log.Logger
@@ -87,6 +92,7 @@ type Head struct {
deletedMtx sync.Mutex
deleted map[chunks.HeadSeriesRef]int // Deleted series, and what WAL segment they must be kept until.
// TODO(codesome): Extend MemPostings to return only OOOPostings, Set OOOStatus, ... Like an additional map of ooo postings.
postings *index.MemPostings // Postings lists for terms.

tombstones *tombstones.MemTombstones
@@ -130,6 +136,8 @@ type HeadOptions struct {
ChunkPool chunkenc.Pool
ChunkWriteBufferSize int
ChunkWriteQueueSize int
OutOfOrderTimeWindow atomic.Int64
OutOfOrderCapMax atomic.Int64
// StripeSize sets the number of entries in the hash map, it must be a power of 2.
// A larger StripeSize will allocate more memory up-front, but will increase performance when handling a large number of series.

@@ -142,8 +150,13 @@ type HeadOptions struct {
IsolationDisabled bool
}
const (
// DefaultOutOfOrderCapMax is the default maximum size of an in-memory out-of-order chunk.
DefaultOutOfOrderCapMax int64 = 32
)
func DefaultHeadOptions() *HeadOptions {
ho := &HeadOptions{
ChunkRange: DefaultBlockDuration,
ChunkDirRoot: "",
ChunkPool: chunkenc.NewPool(),
@@ -153,6 +166,8 @@ func DefaultHeadOptions() *HeadOptions {
SeriesCallback: &noopSeriesLifecycleCallback{},
IsolationDisabled: defaultIsolationDisabled,
}
ho.OutOfOrderCapMax.Store(DefaultOutOfOrderCapMax)
return ho
}
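A short sketch of setting these options directly; both fields are atomics precisely so that they can also be changed while the Head is live:

	package main

	import (
		"fmt"

		"github.com/prometheus/prometheus/tsdb"
	)

	func main() {
		opts := tsdb.DefaultHeadOptions()
		opts.OutOfOrderTimeWindow.Store(30 * 60 * 1000) // 30 min, in ms
		opts.OutOfOrderCapMax.Store(64)                 // per-series in-memory OOO chunk cap, must be 1..255
		fmt.Println(opts.OutOfOrderCapMax.Load())
	}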
// SeriesLifecycleCallback specifies a list of callbacks that will be called during a lifecycle of a series.

@@ -171,11 +186,23 @@ type SeriesLifecycleCallback interface {
}

// NewHead opens the head block in dir.
func NewHead(r prometheus.Registerer, l log.Logger, wal, wbl *wal.WAL, opts *HeadOptions, stats *HeadStats) (*Head, error) {
var err error
if l == nil {
l = log.NewNopLogger()
}
if opts.OutOfOrderTimeWindow.Load() < 0 {
opts.OutOfOrderTimeWindow.Store(0)
}
// Time window can be set on runtime. So the capMin and capMax should be valid
// even if ooo is not enabled yet.
capMax := opts.OutOfOrderCapMax.Load()
if capMax <= 0 || capMax > 255 {
return nil, errors.Errorf("OOOCapMax of %d is invalid. must be > 0 and <= 255", capMax)
}
if opts.ChunkRange < 1 {
return nil, errors.Errorf("invalid chunk range %d", opts.ChunkRange)
}

@@ -193,6 +220,7 @@ func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, opts *HeadOpti
h := &Head{
wal: wal,
wbl: wbl,
logger: l,
opts: opts,
memChunkPool: sync.Pool{
@@ -254,35 +282,40 @@ func (h *Head) resetInMemoryState() error {
h.chunkRange.Store(h.opts.ChunkRange)
h.minTime.Store(math.MaxInt64)
h.maxTime.Store(math.MinInt64)
h.minOOOTime.Store(math.MaxInt64)
h.maxOOOTime.Store(math.MinInt64)
h.lastWALTruncationTime.Store(math.MinInt64)
h.lastMemoryTruncationTime.Store(math.MinInt64)
return nil
}
type headMetrics struct {
activeAppenders prometheus.Gauge
series prometheus.GaugeFunc
seriesCreated prometheus.Counter
seriesRemoved prometheus.Counter
seriesNotFound prometheus.Counter
chunks prometheus.Gauge
chunksCreated prometheus.Counter
chunksRemoved prometheus.Counter
gcDuration prometheus.Summary
samplesAppended prometheus.Counter
outOfOrderSamplesAppended prometheus.Counter
outOfBoundSamples prometheus.Counter
outOfOrderSamples prometheus.Counter
tooOldSamples prometheus.Counter
walTruncateDuration prometheus.Summary
walCorruptionsTotal prometheus.Counter
dataTotalReplayDuration prometheus.Gauge
headTruncateFail prometheus.Counter
headTruncateTotal prometheus.Counter
checkpointDeleteFail prometheus.Counter
checkpointDeleteTotal prometheus.Counter
checkpointCreationFail prometheus.Counter
checkpointCreationTotal prometheus.Counter
mmapChunkCorruptionTotal prometheus.Counter
snapshotReplayErrorTotal prometheus.Counter // Will be either 0 or 1.
oooHistogram prometheus.Histogram
}
func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {

@@ -333,7 +366,7 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
Name: "prometheus_tsdb_wal_corruptions_total",
Help: "Total number of WAL corruptions.",
}),
dataTotalReplayDuration: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "prometheus_tsdb_data_replay_duration_seconds",
Help: "Time taken to replay the data on disk.",
}),

@@ -341,13 +374,21 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
Name: "prometheus_tsdb_head_samples_appended_total",
Help: "Total number of appended samples.",
}),
outOfOrderSamplesAppended: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_head_out_of_order_samples_appended_total",
Help: "Total number of appended out of order samples.",
}),
outOfBoundSamples: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_out_of_bound_samples_total",
Help: "Total number of out of bound samples ingestion failed attempts with out of order support disabled.",
}),
outOfOrderSamples: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_out_of_order_samples_total",
Help: "Total number of out of order samples ingestion failed attempts due to out of order being disabled.",
}),
tooOldSamples: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_too_old_samples_total",
Help: "Total number of out of order samples ingestion failed attempts with out of support enabled, but sample outside of time window.",
}), }),
headTruncateFail: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_head_truncations_failed_total",

@@ -381,6 +422,19 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
Name: "prometheus_tsdb_snapshot_replay_error_total",
Help: "Total number of snapshot replays that failed.",
}),
oooHistogram: prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "prometheus_tsdb_sample_ooo_delta",
Help: "Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).",
Buckets: []float64{
60 * 10, // 10 min
60 * 30, // 30 min
60 * 60, // 60 min
60 * 60 * 2, // 2h
60 * 60 * 3, // 3h
60 * 60 * 6, // 6h
60 * 60 * 12, // 12h
},
}),
}

if r != nil {

@@ -396,10 +450,12 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
m.gcDuration,
m.walTruncateDuration,
m.walCorruptionsTotal,
m.dataTotalReplayDuration,
m.samplesAppended,
m.outOfOrderSamplesAppended,
m.outOfBoundSamples,
m.outOfOrderSamples,
m.tooOldSamples,
m.headTruncateFail,
m.headTruncateTotal,
m.checkpointDeleteFail,
@@ -517,8 +573,9 @@ func (h *Head) Init(minValidTime int64) error {
}

mmapChunkReplayStart := time.Now()
mmappedChunks, oooMmappedChunks, lastMmapRef, err := h.loadMmappedChunks(refSeries)
if err != nil {
// TODO(codesome): clear out all m-map chunks here for refSeries.
level.Error(h.logger).Log("msg", "Loading on-disk chunks failed", "err", err) level.Error(h.logger).Log("msg", "Loading on-disk chunks failed", "err", err)
if _, ok := errors.Cause(err).(*chunks.CorruptionErr); ok { if _, ok := errors.Cause(err).(*chunks.CorruptionErr); ok {
h.metrics.mmapChunkCorruptionTotal.Inc() h.metrics.mmapChunkCorruptionTotal.Inc()
@ -529,7 +586,7 @@ func (h *Head) Init(minValidTime int64) error {
// If this fails, data will be recovered from WAL. // If this fails, data will be recovered from WAL.
// Hence we wont lose any data (given WAL is not corrupt). // Hence we wont lose any data (given WAL is not corrupt).
mmappedChunks, err = h.removeCorruptedMmappedChunks(err) mmappedChunks, oooMmappedChunks, lastMmapRef, err = h.removeCorruptedMmappedChunks(err)
if err != nil { if err != nil {
return err return err
} }
@@ -572,7 +629,7 @@ func (h *Head) Init(minValidTime int64) error {
// A corrupted checkpoint is a hard error for now and requires user
// intervention. There's likely little data that can be recovered anyway.
if err := h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks, oooMmappedChunks); err != nil {
return errors.Wrap(err, "backfill checkpoint")
}
h.updateWALReplayStatusRead(startFrom)

@@ -605,7 +662,7 @@ func (h *Head) Init(minValidTime int64) error {
if err != nil {
return errors.Wrapf(err, "segment reader (offset=%d)", offset)
}
err = h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks, oooMmappedChunks)
if err := sr.Close(); err != nil {
level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
}

@@ -615,26 +672,94 @@ func (h *Head) Init(minValidTime int64) error {
level.Info(h.logger).Log("msg", "WAL segment loaded", "segment", i, "maxSegment", endAt)
h.updateWALReplayStatusRead(i)
}
walReplayDuration := time.Since(walReplayStart)

wblReplayStart := time.Now()
if h.wbl != nil {
// Replay OOO WAL.
startFrom, endAt, e = wal.Segments(h.wbl.Dir())
if e != nil {
return errors.Wrap(e, "finding OOO WAL segments")
}
h.startWALReplayStatus(startFrom, endAt)
for i := startFrom; i <= endAt; i++ {
s, err := wal.OpenReadSegment(wal.SegmentName(h.wbl.Dir(), i))
if err != nil {
return errors.Wrap(err, fmt.Sprintf("open WBL segment: %d", i))
}
sr := wal.NewSegmentBufReader(s)
err = h.loadWBL(wal.NewReader(sr), multiRef, lastMmapRef)
if err := sr.Close(); err != nil {
level.Warn(h.logger).Log("msg", "Error while closing the wbl segments reader", "err", err)
}
if err != nil {
return err
}
level.Info(h.logger).Log("msg", "WBL segment loaded", "segment", i, "maxSegment", endAt)
h.updateWALReplayStatusRead(i)
}
}
wblReplayDuration := time.Since(wblReplayStart)
totalReplayDuration := time.Since(start)
h.metrics.dataTotalReplayDuration.Set(totalReplayDuration.Seconds())
level.Info(h.logger).Log(
"msg", "WAL replay completed",
"checkpoint_replay_duration", checkpointReplayDuration.String(),
"wal_replay_duration", walReplayDuration.String(),
"wbl_replay_duration", wblReplayDuration.String(),
"total_replay_duration", totalReplayDuration.String(),
)

return nil
}
func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries) (map[chunks.HeadSeriesRef][]*mmappedChunk, map[chunks.HeadSeriesRef][]*mmappedChunk, chunks.ChunkDiskMapperRef, error) {
mmappedChunks := map[chunks.HeadSeriesRef][]*mmappedChunk{}
oooMmappedChunks := map[chunks.HeadSeriesRef][]*mmappedChunk{}
var lastRef, secondLastRef chunks.ChunkDiskMapperRef
if err := h.chunkDiskMapper.IterateAllChunks(func(seriesRef chunks.HeadSeriesRef, chunkRef chunks.ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding) error {
secondLastRef = lastRef
lastRef = chunkRef
isOOO := chunkenc.IsOutOfOrderChunk(encoding)
if !isOOO && maxt < h.minValidTime.Load() {
return nil
}
// We ignore any chunk that doesn't have a valid encoding
if !chunkenc.IsValidEncoding(encoding) {
return nil
}
ms, ok := refSeries[seriesRef]
if isOOO {
if !ok {
oooMmappedChunks[seriesRef] = append(oooMmappedChunks[seriesRef], &mmappedChunk{
ref: chunkRef,
minTime: mint,
maxTime: maxt,
numSamples: numSamples,
})
return nil
}
h.metrics.chunks.Inc()
h.metrics.chunksCreated.Inc()
ms.oooMmappedChunks = append(ms.oooMmappedChunks, &mmappedChunk{
ref: chunkRef,
minTime: mint,
maxTime: maxt,
numSamples: numSamples,
})
return nil
}
if !ok {
slice := mmappedChunks[seriesRef]
if len(slice) > 0 && slice[len(slice)-1].maxTime >= mint {

@@ -677,45 +802,57 @@ func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries)
}
return nil
}); err != nil {
// secondLastRef because the lastRef caused an error.
return nil, nil, secondLastRef, errors.Wrap(err, "iterate on on-disk chunks")
}

return mmappedChunks, oooMmappedChunks, lastRef, nil
}
// removeCorruptedMmappedChunks attempts to delete the corrupted mmapped chunks and if it fails, it clears all the previously
// loaded mmapped chunks.
func (h *Head) removeCorruptedMmappedChunks(err error) (map[chunks.HeadSeriesRef][]*mmappedChunk, map[chunks.HeadSeriesRef][]*mmappedChunk, chunks.ChunkDiskMapperRef, error) {
// We never want to preserve the in-memory series from snapshots if we are repairing m-map chunks.
if err := h.resetInMemoryState(); err != nil {
return map[chunks.HeadSeriesRef][]*mmappedChunk{}, map[chunks.HeadSeriesRef][]*mmappedChunk{}, 0, err
}

level.Info(h.logger).Log("msg", "Deleting mmapped chunk files")
if err := h.chunkDiskMapper.DeleteCorrupted(err); err != nil {
level.Info(h.logger).Log("msg", "Deletion of corrupted mmap chunk files failed, discarding chunk files completely", "err", err)
if err := h.chunkDiskMapper.Truncate(math.MaxUint32); err != nil {
level.Error(h.logger).Log("msg", "Deletion of all mmap chunk files failed", "err", err)
}
return map[chunks.HeadSeriesRef][]*mmappedChunk{}, map[chunks.HeadSeriesRef][]*mmappedChunk{}, 0, nil
}

level.Info(h.logger).Log("msg", "Deletion of mmap chunk files successful, reattempting m-mapping the on-disk chunks")
mmappedChunks, oooMmappedChunks, lastRef, err := h.loadMmappedChunks(make(map[chunks.HeadSeriesRef]*memSeries))
if err != nil {
level.Error(h.logger).Log("msg", "Loading on-disk chunks failed, discarding chunk files completely", "err", err)
if err := h.chunkDiskMapper.Truncate(math.MaxUint32); err != nil {
level.Error(h.logger).Log("msg", "Deletion of all mmap chunk files failed after failed loading", "err", err)
}
mmappedChunks = map[chunks.HeadSeriesRef][]*mmappedChunk{}
}

return mmappedChunks, oooMmappedChunks, lastRef, nil
}
func (h *Head) ApplyConfig(cfg *config.Config, wbl *wal.WAL) {
oooTimeWindow := int64(0)
if cfg.StorageConfig.TSDBConfig != nil {
oooTimeWindow = cfg.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
}
if oooTimeWindow < 0 {
oooTimeWindow = 0
}
h.SetOutOfOrderTimeWindow(oooTimeWindow, wbl)
if !h.opts.EnableExemplarStorage {
return
}

// Head uses opts.MaxExemplars in combination with opts.EnableExemplarStorage

@@ -726,12 +863,21 @@ func (h *Head) ApplyConfig(cfg *config.Config) error {
newSize := h.opts.MaxExemplars.Load()
if prevSize == newSize {
return
}

migrated := h.exemplars.(*CircularExemplarStorage).Resize(newSize)
level.Info(h.logger).Log("msg", "Exemplar storage resized", "from", prevSize, "to", newSize, "migrated", migrated)
}
// SetOutOfOrderTimeWindow updates the out of order related parameters.
// If the Head already has a WBL set, then the wbl will be ignored.
func (h *Head) SetOutOfOrderTimeWindow(oooTimeWindow int64, wbl *wal.WAL) {
if oooTimeWindow > 0 && h.wbl == nil {
h.wbl = wbl
}
h.opts.OutOfOrderTimeWindow.Store(oooTimeWindow)
}
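A sketch of the two transitions DB.ApplyConfig drives through this method (values in milliseconds). Standalone NewHead usage is shown for illustration only and is not how Prometheus wires this up; a nil WBL is legal but means OOO data would not survive a crash:

	package main

	import (
		"log"

		"github.com/prometheus/prometheus/tsdb"
	)

	func main() {
		opts := tsdb.DefaultHeadOptions()
		opts.ChunkDirRoot = "data"
		h, err := tsdb.NewHead(nil, nil, nil, nil, opts, nil)
		if err != nil {
			log.Fatal(err)
		}
		defer h.Close()

		// Enable a 5-minute window. The Head adopts the passed WBL only if it
		// does not already have one.
		h.SetOutOfOrderTimeWindow(5*60*1000, nil)

		// Disable again: new OOO appends are rejected, an existing WBL is kept
		// so already-recorded OOO data still gets compacted and cleaned up.
		h.SetOutOfOrderTimeWindow(0, nil)
	}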
// PostingsCardinalityStats returns top 10 highest cardinality stats by label and value names.

@@ -773,6 +919,27 @@ func (h *Head) updateMinMaxTime(mint, maxt int64) {
}
}
func (h *Head) updateMinOOOMaxOOOTime(mint, maxt int64) {
for {
lt := h.MinOOOTime()
if mint >= lt {
break
}
if h.minOOOTime.CompareAndSwap(lt, mint) {
break
}
}
for {
ht := h.MaxOOOTime()
if maxt <= ht {
break
}
if h.maxOOOTime.CompareAndSwap(ht, maxt) {
break
}
}
}
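These are standard lock-free min/max CAS loops; a self-contained demonstration of the same pattern with the go.uber.org/atomic API used here:

	package main

	import (
		"fmt"
		"math"
		"sync"

		"go.uber.org/atomic"
	)

	// updateMin is the same CAS loop as updateMinOOOMaxOOOTime's first half:
	// safe to call from many appenders concurrently without the Head lock.
	func updateMin(min *atomic.Int64, candidate int64) {
		for {
			cur := min.Load()
			if candidate >= cur {
				return
			}
			if min.CompareAndSwap(cur, candidate) {
				return
			}
		}
	}

	func main() {
		min := atomic.NewInt64(math.MaxInt64)
		var wg sync.WaitGroup
		for _, ts := range []int64{500, 100, 300} {
			wg.Add(1)
			go func(ts int64) { defer wg.Done(); updateMin(min, ts) }(ts)
		}
		wg.Wait()
		fmt.Println(min.Load()) // 100
	}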
// SetMinValidTime sets the minimum timestamp the head can ingest.
func (h *Head) SetMinValidTime(minValidTime int64) {
h.minValidTime.Store(minValidTime)

@@ -838,30 +1005,7 @@ func (h *Head) truncateMemory(mint int64) (err error) {
}
h.metrics.headTruncateTotal.Inc()
return h.truncateSeriesAndChunkDiskMapper("truncateMemory")
}
// WaitForPendingReadersInTimeRange waits for queries overlapping with given range to finish querying.

@@ -950,7 +1094,7 @@ func (h *Head) truncateWAL(mint int64) error {
}
// Start a new segment, so low ingestion volume TSDB don't have more WAL than
// needed.
if _, err := h.wal.NextSegment(); err != nil {
return errors.Wrap(err, "next segment")
}
last-- // Never consider last segment for checkpoint.

@@ -1016,6 +1160,59 @@ func (h *Head) truncateWAL(mint int64) error {
return nil
}
// truncateOOO
// - truncates the OOO WBL files whose index is strictly less than lastWBLFile.
// - garbage collects all the m-map chunks from the memory that are less than or equal to minOOOMmapRef
// and then deletes the series that do not have any data anymore.
func (h *Head) truncateOOO(lastWBLFile int, minOOOMmapRef chunks.ChunkDiskMapperRef) error {
curMinOOOMmapRef := chunks.ChunkDiskMapperRef(h.minOOOMmapRef.Load())
if minOOOMmapRef.GreaterThan(curMinOOOMmapRef) {
h.minOOOMmapRef.Store(uint64(minOOOMmapRef))
if err := h.truncateSeriesAndChunkDiskMapper("truncateOOO"); err != nil {
return err
}
}
return h.wbl.Truncate(lastWBLFile)
}
// truncateSeriesAndChunkDiskMapper is a helper function for truncateMemory and truncateOOO.
// It runs GC on the Head and truncates the ChunkDiskMapper accordingly.
func (h *Head) truncateSeriesAndChunkDiskMapper(caller string) error {
start := time.Now()
headMaxt := h.MaxTime()
actualMint, minOOOTime, minMmapFile := h.gc()
level.Info(h.logger).Log("msg", "Head GC completed", "caller", caller, "duration", time.Since(start))
h.metrics.gcDuration.Observe(time.Since(start).Seconds())
if actualMint > h.minTime.Load() {
// The actual mint of the head is higher than the one asked to truncate.
appendableMinValidTime := h.appendableMinValidTime()
if actualMint < appendableMinValidTime {
h.minTime.Store(actualMint)
h.minValidTime.Store(actualMint)
} else {
// The actual min time is in the appendable window.
// So we set the mint to the appendableMinValidTime.
h.minTime.Store(appendableMinValidTime)
h.minValidTime.Store(appendableMinValidTime)
}
}
if headMaxt-h.opts.OutOfOrderTimeWindow.Load() < minOOOTime {
// The allowed OOO window is lower than the min OOO time seen during GC.
// So it is possible that some OOO sample was inserted that was less than minOOOTime.
// So we play safe and set it to the min that was possible.
minOOOTime = headMaxt - h.opts.OutOfOrderTimeWindow.Load()
}
h.minOOOTime.Store(minOOOTime)
// Truncate the chunk m-mapper.
if err := h.chunkDiskMapper.Truncate(uint32(minMmapFile)); err != nil {
return errors.Wrap(err, "truncate chunks.HeadReadWriter by file number")
}
return nil
}
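A worked example of that clamp, with made-up numbers:

	package main

	import "fmt"

	func main() {
		// Mirrors the clamp in truncateSeriesAndChunkDiskMapper: any timestamp
		// in [headMaxt-window, headMaxt] could have been accepted as an OOO
		// sample before GC ran, so minOOOTime may not sit above that bound.
		var (
			headMaxt      int64 = 1_000_000 // max time of the head, ms
			oooTimeWindow int64 = 300_000   // allowed OOO window, ms
			minOOOTime    int64 = 800_000   // min OOO time seen during GC
		)
		if headMaxt-oooTimeWindow < minOOOTime {
			minOOOTime = headMaxt - oooTimeWindow
		}
		fmt.Println(minOOOTime) // 700000: clamped down to headMaxt-oooTimeWindow
	}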
type Stats struct {
NumSeries uint64
MinTime, MaxTime int64

@@ -1149,14 +1346,20 @@ func (h *Head) Delete(mint, maxt int64, ms ...*labels.Matcher) error {
}

// gc removes data before the minimum timestamp from the head.
// It returns
// * The actual min times of the chunks present in the Head.
// * The min OOO time seen during the GC.
// * Min mmap file number seen in the series (in-order and out-of-order) after gc'ing the series.
func (h *Head) gc() (actualInOrderMint, minOOOTime int64, minMmapFile int) {
// Only data strictly lower than this timestamp must be deleted.
mint := h.MinTime()
// Only ooo m-map chunks strictly lower than or equal to this ref
// must be deleted.
minOOOMmapRef := chunks.ChunkDiskMapperRef(h.minOOOMmapRef.Load())
// Drop old chunks and remember series IDs and hashes if they can be
// deleted entirely.
deleted, chunksRemoved, actualInOrderMint, minOOOTime, minMmapFile := h.series.gc(mint, minOOOMmapRef)
seriesRemoved := len(deleted)

h.metrics.seriesRemoved.Add(float64(seriesRemoved))

@@ -1186,7 +1389,7 @@ func (h *Head) gc() int64 {
h.deletedMtx.Unlock()
}

return actualInOrderMint, minOOOTime, minMmapFile
}
// Tombstones returns a new reader over the head's tombstones

@@ -1224,6 +1427,18 @@ func (h *Head) MaxTime() int64 {
return h.maxTime.Load()
}
// MinOOOTime returns the lowest time bound on visible data in the out of order
// head.
func (h *Head) MinOOOTime() int64 {
return h.minOOOTime.Load()
}
// MaxOOOTime returns the highest timestamp on visible data in the out of order
// head.
func (h *Head) MaxOOOTime() int64 {
return h.maxOOOTime.Load()
}
// compactable returns whether the head has a compactable range.
// The head has a compactable range when the head time range is 1.5 times the chunk range.
// The 0.5 acts as a buffer of the appendable window.

@@ -1241,6 +1456,9 @@ func (h *Head) Close() error {
if h.wal != nil {
errs.Add(h.wal.Close())
}
if h.wbl != nil {
errs.Add(h.wbl.Close())
}
if errs.Err() == nil && h.opts.EnableMemorySnapshotOnShutdown {
errs.Add(h.performChunkSnapshot())
}
@@ -1271,7 +1489,7 @@ func (h *Head) getOrCreate(hash uint64, lset labels.Labels) (*memSeries, bool, e
func (h *Head) getOrCreateWithID(id chunks.HeadSeriesRef, hash uint64, lset labels.Labels) (*memSeries, bool, error) {
s, created, err := h.series.getOrSet(hash, lset, func() *memSeries {
return newMemSeries(lset, id, h.chunkRange.Load(), h.opts.OutOfOrderCapMax.Load(), h.opts.IsolationDisabled)
})
if err != nil {
return nil, false, err

@@ -1333,7 +1551,7 @@ const (
)

// stripeSeries holds series by HeadSeriesRef ("ID") and also by hash of their labels.
// ID-based lookups via getByID() are preferred over getByHash() for performance reasons.
// It locks modulo ranges of IDs and hashes to reduce lock contention.
// The locks are padded to not be on the same cache line. Filling the padded space
// with the maps was profiled to be slower likely due to the additional pointer
@@ -1375,13 +1593,16 @@ func newStripeSeries(stripeSize int, seriesCallback SeriesLifecycleCallback) *st
// note: returning map[chunks.HeadSeriesRef]struct{} would be more accurate,
// but the returned map goes into postings.Delete() which expects a map[storage.SeriesRef]struct
// and there's no easy way to cast maps.
// minMmapFile is the min mmap file number seen in the series (in-order and out-of-order) after gc'ing the series.
func (s *stripeSeries) gc(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) (_ map[storage.SeriesRef]struct{}, _ int, _, _ int64, minMmapFile int) {
var (
deleted = map[storage.SeriesRef]struct{}{}
deletedForCallback = []labels.Labels{}
rmChunks = 0
actualMint int64 = math.MaxInt64
minOOOTime int64 = math.MaxInt64
)
minMmapFile = math.MaxInt32
// Run through all series and truncate old chunks. Mark those with no
// chunks left as deleted and store their ID.
for i := 0; i < s.size; i++ {

@@ -1390,9 +1611,32 @@ func (s *stripeSeries) gc(mint int64) (map[storage.SeriesRef]struct{}, int, int6
for hash, all := range s.hashes[i] {
for _, series := range all {
series.Lock()
rmChunks += series.truncateChunksBefore(mint, minOOOMmapRef)

if len(series.mmappedChunks) > 0 {
seq, _ := series.mmappedChunks[0].ref.Unpack()
if seq < minMmapFile {
minMmapFile = seq
}
}
if len(series.oooMmappedChunks) > 0 {
seq, _ := series.oooMmappedChunks[0].ref.Unpack()
if seq < minMmapFile {
minMmapFile = seq
}
for _, ch := range series.oooMmappedChunks {
if ch.minTime < minOOOTime {
minOOOTime = ch.minTime
}
}
}
if series.oooHeadChunk != nil {
if series.oooHeadChunk.minTime < minOOOTime {
minOOOTime = series.oooHeadChunk.minTime
}
}
if len(series.mmappedChunks) > 0 || len(series.oooMmappedChunks) > 0 ||
series.headChunk != nil || series.oooHeadChunk != nil || series.pendingCommit {
seriesMint := series.minTime()
if seriesMint < actualMint {
actualMint = seriesMint

@@ -1435,7 +1679,7 @@ func (s *stripeSeries) gc(mint int64) (map[storage.SeriesRef]struct{}, int, int6
actualMint = mint
}

return deleted, rmChunks, actualMint, minOOOTime, minMmapFile
}
func (s *stripeSeries) getByID(id chunks.HeadSeriesRef) *memSeries {

@@ -1528,11 +1772,16 @@ type memSeries struct {
//
// pN is the pointer to the mmappedChunk referred to by HeadChunkID=N
mmappedChunks []*mmappedChunk

oooMmappedChunks []*mmappedChunk // Immutable chunks on disk containing OOO samples.
oooHeadChunk *oooHeadChunk // Most recent chunk for ooo samples in memory that's still being built.
firstOOOChunkID chunks.HeadChunkID // HeadOOOChunkID for oooMmappedChunks[0]

headChunk *memChunk // Most recent chunk in memory that's still being built.
firstChunkID chunks.HeadChunkID // HeadChunkID for mmappedChunks[0]
mmMaxTime int64 // Max time of any mmapped chunk, only used during WAL replay.
chunkRange int64
oooCapMax uint8

nextAt int64 // Timestamp at which to cut the next chunk.
@@ -1551,12 +1800,13 @@ type memSeries struct {
pendingCommit bool // Whether there are samples waiting to be committed to this series.
}

func newMemSeries(lset labels.Labels, id chunks.HeadSeriesRef, chunkRange, oooCapMax int64, isolationDisabled bool) *memSeries {
s := &memSeries{
lset: lset,
ref: id,
chunkRange: chunkRange,
nextAt: math.MinInt64,
oooCapMax: uint8(oooCapMax),
}
if !isolationDisabled {
s.txs = newTxRing(4)
@@ -1575,6 +1825,7 @@ func (s *memSeries) minTime() int64 {
}

func (s *memSeries) maxTime() int64 {
// The highest timestamps will always be in the regular (non-OOO) chunks, even if OOO is enabled.
c := s.head()
if c != nil {
return c.maxTime
@@ -1588,26 +1839,39 @@ func (s *memSeries) maxTime() int64 {
// truncateChunksBefore removes all chunks from the series that
// have no timestamp at or after mint.
// Chunk IDs remain unchanged.
func (s *memSeries) truncateChunksBefore(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) int {
var removedInOrder int
if s.headChunk != nil && s.headChunk.maxTime < mint {
// If head chunk is truncated, we can truncate all mmapped chunks.
removedInOrder = 1 + len(s.mmappedChunks)
s.firstChunkID += chunks.HeadChunkID(removedInOrder)
s.headChunk = nil
s.mmappedChunks = nil
}
if len(s.mmappedChunks) > 0 {
for i, c := range s.mmappedChunks {
if c.maxTime >= mint {
break
}
removedInOrder = i + 1
}
s.mmappedChunks = append(s.mmappedChunks[:0], s.mmappedChunks[removedInOrder:]...)
s.firstChunkID += chunks.HeadChunkID(removedInOrder)
}
var removedOOO int
if len(s.oooMmappedChunks) > 0 {
for i, c := range s.oooMmappedChunks {
if c.ref.GreaterThan(minOOOMmapRef) {
break
}
removedOOO = i + 1
}
s.oooMmappedChunks = append(s.oooMmappedChunks[:0], s.oooMmappedChunks[removedOOO:]...)
s.firstOOOChunkID += chunks.HeadChunkID(removedOOO)
}
return removedInOrder + removedOOO
}
// cleanupAppendIDsBelow cleans up older appendIDs. Has to be called after

@@ -1627,6 +1891,16 @@ type memChunk struct {
minTime, maxTime int64
}
type oooHeadChunk struct {
chunk *OOOChunk
minTime, maxTime int64 // can probably be removed and pulled out of the chunk instead
}
// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
func (mc *oooHeadChunk) OverlapsClosedInterval(mint, maxt int64) bool {
return overlapsClosedInterval(mc.minTime, mc.maxTime, mint, maxt)
}
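For reference, the closed-interval overlap test used here reduces to two comparisons. The helper below is a plausible sketch of the package's overlapsClosedInterval (defined elsewhere in this diff's context), not a quotation of it:

	package main

	import "fmt"

	// overlapsClosedInterval treats both ranges as closed on both ends: they
	// overlap iff each one starts no later than the other one ends.
	func overlapsClosedInterval(mint1, maxt1, mint2, maxt2 int64) bool {
		return mint1 <= maxt2 && mint2 <= maxt1
	}

	func main() {
		fmt.Println(overlapsClosedInterval(0, 10, 10, 20)) // true: touching endpoints overlap
		fmt.Println(overlapsClosedInterval(0, 10, 11, 20)) // false
	}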
// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
func (mc *memChunk) OverlapsClosedInterval(mint, maxt int64) bool {
return overlapsClosedInterval(mc.minTime, mc.maxTime, mint, maxt)

@@ -1655,12 +1929,15 @@ func (noopSeriesLifecycleCallback) PostCreation(labels.Labels) {}
func (noopSeriesLifecycleCallback) PostDeletion(...labels.Labels) {}

func (h *Head) Size() int64 {
var walSize, wblSize int64
if h.wal != nil {
walSize, _ = h.wal.Size()
}
if h.wbl != nil {
wblSize, _ = h.wbl.Size()
}
cdmSize, _ := h.chunkDiskMapper.Size()
return walSize + wblSize + cdmSize
}

func (h *RangeHead) Size() int64 {

@@ -137,6 +137,8 @@ func (h *Head) appender() *headAppender {
minValidTime: h.appendableMinValidTime(),
mint: math.MaxInt64,
maxt: math.MinInt64,
headMaxt: h.MaxTime(),
oooTimeWindow: h.opts.OutOfOrderTimeWindow.Load(),
samples: h.getAppendBuffer(),
sampleSeries: h.getSeriesBuffer(),
exemplars: exemplarsBuf,
@@ -252,9 +254,11 @@ type exemplarWithSeriesRef struct {
}

type headAppender struct {
head *Head
minValidTime int64 // No samples below this timestamp are allowed.
mint, maxt int64
headMaxt int64 // We track it here to not take the lock for every sample appended.
oooTimeWindow int64 // Use the same for the entire append, and don't load the atomic for each sample.
series []record.RefSeries // New series held by this appender.
metadata []record.RefMetadata // New metadata held by this appender.
@@ -268,7 +272,9 @@ type headAppender struct {
}

func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64, v float64) (storage.SeriesRef, error) {
// For OOO inserts, this restriction is irrelevant and will be checked later once we confirm the sample is an in-order append.
// If OOO inserts are disabled, we may as well check this as early as we can and avoid more work.
if a.oooTimeWindow == 0 && t < a.minValidTime {
a.head.metrics.outOfBoundSamples.Inc()
return 0, storage.ErrOutOfBounds
}
@ -300,15 +306,25 @@ func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64
}
s.Lock()
// TODO(codesome): If we definitely know at this point that the sample is ooo, then optimise
// to skip that sample from the WAL and write only in the WBL.
_, delta, err := s.appendable(t, v, a.headMaxt, a.minValidTime, a.oooTimeWindow)
if err == nil {
s.pendingCommit = true
}
s.Unlock()
if delta > 0 {
a.head.metrics.oooHistogram.Observe(float64(delta))
}
if err != nil {
switch err {
case storage.ErrOutOfOrderSample:
a.head.metrics.outOfOrderSamples.Inc()
case storage.ErrTooOldSample:
a.head.metrics.tooOldSamples.Inc()
}
return 0, err
}
if t < a.mint {
a.mint = t
@ -326,25 +342,46 @@ func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64
return storage.SeriesRef(s.ref), nil
}
// appendable checks whether the given sample can be appended to the series.
// It returns false with no error if the sample is a valid in-order append.
// The sample belongs to the out-of-order chunk if we return true and no error.
// An error signifies the sample cannot be handled.
func (s *memSeries) appendable(t int64, v float64, headMaxt, minValidTime, oooTimeWindow int64) (isOOO bool, oooDelta int64, err error) {
// Check if we can append in the in-order chunk.
if t >= minValidTime {
if s.head() == nil {
// The series has no sample and was freshly created.
return false, 0, nil
}
msMaxt := s.maxTime()
if t > msMaxt {
return false, 0, nil
}
if t == msMaxt {
// We are allowing exact duplicates as we can encounter them in valid cases
// like federation and erroring out at that time would be extremely noisy.
// This only checks against the latest in-order sample.
// The OOO headchunk has its own method to detect these duplicates.
if math.Float64bits(s.sampleBuf[3].v) != math.Float64bits(v) {
return false, 0, storage.ErrDuplicateSampleForTimestamp
}
// Sample is identical (ts + value) with the most current (highest ts) sample in sampleBuf.
return false, 0, nil
}
}
// The sample cannot go in the in-order chunk. Check if it can go in the out-of-order chunk.
if oooTimeWindow > 0 && t >= headMaxt-oooTimeWindow {
return true, headMaxt - t, nil
}
// The sample fits neither the in-order nor the out-of-order chunk.
if oooTimeWindow > 0 {
return true, headMaxt - t, storage.ErrTooOldSample
}
if t < minValidTime {
return false, headMaxt - t, storage.ErrOutOfBounds
}
return false, headMaxt - t, storage.ErrOutOfOrderSample
}
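For a feel of the decision order above, here is a small standalone sketch. It is illustrative only: it folds the duplicate-timestamp handling into the in-order case, uses made-up millisecond values, and returns a label instead of the real (isOOO, delta, err) triple.

package main

import "fmt"

// classify mirrors the decision order of memSeries.appendable, minus the
// duplicate-value check: in-order first, then the OOO window, then errors.
func classify(t, seriesMaxt, headMaxt, minValidTime, oooTimeWindow int64) string {
	if t >= minValidTime && t >= seriesMaxt {
		return "in-order append"
	}
	if oooTimeWindow > 0 && t >= headMaxt-oooTimeWindow {
		return "out-of-order insert"
	}
	if oooTimeWindow > 0 {
		return "storage.ErrTooOldSample"
	}
	if t < minValidTime {
		return "storage.ErrOutOfBounds"
	}
	return "storage.ErrOutOfOrderSample"
}

func main() {
	// Assumed values: the series' latest sample sits at the head max time.
	const headMaxt, minValidTime, window = 100_000, 60_000, 30_000
	for _, t := range []int64{120_000, 90_000, 75_000, 50_000} {
		fmt.Printf("t=%d -> %s\n", t, classify(t, headMaxt, headMaxt, minValidTime, window))
	}
	// t=120000 -> in-order append
	// t=90000  -> out-of-order insert
	// t=75000  -> out-of-order insert (still >= headMaxt-window)
	// t=50000  -> storage.ErrTooOldSample
}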
// AppendExemplar for headAppender assumes the series ref already exists, and so it doesn't
@ -487,6 +524,7 @@ func exemplarsForEncoding(es []exemplarWithSeriesRef) []record.RefExemplar {
}
// Commit writes to the WAL and adds the data to the Head.
// TODO(codesome): Refactor this method to reduce indentation and make it more readable.
func (a *headAppender) Commit() (err error) {
if a.closed {
return ErrAppenderClosed
@ -517,24 +555,143 @@ func (a *headAppender) Commit() (err error) {
defer a.head.putMetadataBuffer(a.metadata)
defer a.head.iso.closeAppend(a.appendID)
var (
samplesAppended = len(a.samples)
oooAccepted int // Number of samples out of order but accepted: OOO enabled and within the time window.
oooRejected int // Number of samples rejected due to being out of order when OOO support is disabled.
tooOldRejected int // Number of samples rejected due to being out of order but too old: OOO enabled, but outside the time window.
oobRejected int // Number of samples rejected due to being out of bounds: t < minValidTime (OOO support disabled).
inOrderMint int64 = math.MaxInt64
inOrderMaxt int64 = math.MinInt64
ooomint int64 = math.MaxInt64
ooomaxt int64 = math.MinInt64
wblSamples []record.RefSample
oooMmapMarkers map[chunks.HeadSeriesRef]chunks.ChunkDiskMapperRef
oooRecords [][]byte
series *memSeries
enc record.Encoder
)
defer func() {
for i := range oooRecords {
a.head.putBytesBuffer(oooRecords[i][:0])
}
}()
collectOOORecords := func() {
if a.head.wbl == nil {
// WBL is not enabled. So no need to collect.
wblSamples = nil
oooMmapMarkers = nil
return
}
// The m-map happens before adding a new sample. So we collect
// the m-map markers first, and then samples.
// WBL Graphically:
// WBL Before this Commit(): [old samples before this commit for chunk 1]
// WBL After this Commit(): [old samples before this commit for chunk 1][new samples in this commit for chunk 1]mmapmarker1[samples for chunk 2]mmapmarker2[samples for chunk 3]
if oooMmapMarkers != nil {
markers := make([]record.RefMmapMarker, 0, len(oooMmapMarkers))
for ref, mmapRef := range oooMmapMarkers {
markers = append(markers, record.RefMmapMarker{
Ref: ref,
MmapRef: mmapRef,
})
}
r := enc.MmapMarkers(markers, a.head.getBytesBuffer())
oooRecords = append(oooRecords, r)
}
if len(wblSamples) > 0 {
r := enc.Samples(wblSamples, a.head.getBytesBuffer())
oooRecords = append(oooRecords, r)
}
wblSamples = nil
oooMmapMarkers = nil
}
for i, s := range a.samples {
series = a.sampleSeries[i]
series.Lock()
oooSample, _, err := series.appendable(s.T, s.V, a.headMaxt, a.minValidTime, a.oooTimeWindow)
switch err {
case storage.ErrOutOfOrderSample:
samplesAppended--
oooRejected++
case storage.ErrOutOfBounds:
samplesAppended--
oobRejected++
case storage.ErrTooOldSample:
samplesAppended--
tooOldRejected++
case nil:
// Do nothing.
default:
samplesAppended--
}
var ok, chunkCreated bool
if err == nil && oooSample {
// Sample is OOO and OOO handling is enabled
// and the delta is within the OOO tolerance.
var mmapRef chunks.ChunkDiskMapperRef
ok, chunkCreated, mmapRef = series.insert(s.T, s.V, a.head.chunkDiskMapper)
if chunkCreated {
r, ok := oooMmapMarkers[series.ref]
if !ok || r != 0 {
// !ok means there are no markers collected for these samples yet. So we first flush the samples
// before setting this m-map marker.
// r != 0 means we have already m-mapped a chunk for this series in the same Commit().
// Hence, before we m-map again, we should add the samples and m-map markers
// seen till now to the WBL records.
collectOOORecords()
}
if oooMmapMarkers == nil {
oooMmapMarkers = make(map[chunks.HeadSeriesRef]chunks.ChunkDiskMapperRef)
}
oooMmapMarkers[series.ref] = mmapRef
}
if ok {
wblSamples = append(wblSamples, s)
if s.T < ooomint {
ooomint = s.T
}
if s.T > ooomaxt {
ooomaxt = s.T
}
oooAccepted++
} else {
// Sample is an exact duplicate of the last sample.
// NOTE: We can only detect updates if they clash with a sample in the OOOHeadChunk,
// not with samples in already flushed OOO chunks.
// TODO(codesome): Add error reporting? It depends on addressing https://github.com/prometheus/prometheus/discussions/10305.
samplesAppended--
}
} else if err == nil {
ok, chunkCreated = series.append(s.T, s.V, a.appendID, a.head.chunkDiskMapper)
if ok {
if s.T < inOrderMint {
inOrderMint = s.T
}
if s.T > inOrderMaxt {
inOrderMaxt = s.T
}
} else {
// The sample is an exact duplicate, and should be silently dropped.
samplesAppended--
}
}
if chunkCreated {
a.head.metrics.chunks.Inc()
a.head.metrics.chunksCreated.Inc()
}
series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
series.pendingCommit = false
series.Unlock()
}
for i, m := range a.metadata {
@ -544,12 +701,48 @@ func (a *headAppender) Commit() (err error) {
series.Unlock()
}
a.head.metrics.outOfOrderSamples.Add(float64(oooRejected))
a.head.metrics.outOfBoundSamples.Add(float64(oobRejected))
a.head.metrics.tooOldSamples.Add(float64(tooOldRejected))
a.head.metrics.samplesAppended.Add(float64(samplesAppended))
a.head.metrics.outOfOrderSamplesAppended.Add(float64(oooAccepted))
a.head.updateMinMaxTime(inOrderMint, inOrderMaxt)
a.head.updateMinOOOMaxOOOTime(ooomint, ooomaxt)
collectOOORecords()
if a.head.wbl != nil {
if err := a.head.wbl.Log(oooRecords...); err != nil {
// TODO(codesome): Currently WBL logging of ooo samples is best effort here since we cannot try logging
// until we have found what samples become OOO. We can try having a metric for this failure.
// Returning the error here is not correct because we have already put the samples into memory,
// hence the append/insert was a success.
level.Error(a.head.logger).Log("msg", "Failed to log out of order samples into the WBL", "err", err)
}
}
return nil
}
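As a concrete (hypothetical) illustration of the ordering that collectOOORecords enforces above: suppose a single Commit() appends OOO samples s1, s2, s3 to one series, and inserting s3 fills the current OOO head chunk so a new one is cut. The WBL then receives a samples record holding s1 and s2 (flushed before the marker is set), an m-map marker record for the chunk that was just m-mapped, and finally a samples record holding s3. This preserves the invariant sketched in the "WBL Graphically" comment: a marker always follows the samples of the chunk it closes.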
// insert is like append, except it inserts the sample into the out-of-order head chunk. Used for OOO samples.
func (s *memSeries) insert(t int64, v float64, chunkDiskMapper *chunks.ChunkDiskMapper) (inserted, chunkCreated bool, mmapRef chunks.ChunkDiskMapperRef) {
c := s.oooHeadChunk
if c == nil || c.chunk.NumSamples() == int(s.oooCapMax) {
// Note: If no new samples come in then we rely on compaction to clean up stale in-memory OOO chunks.
c, mmapRef = s.cutNewOOOHeadChunk(t, chunkDiskMapper)
chunkCreated = true
}
ok := c.chunk.Insert(t, v)
if ok {
if chunkCreated || t < c.minTime {
c.minTime = t
}
if chunkCreated || t > c.maxTime {
c.maxTime = t
}
}
return ok, chunkCreated, mmapRef
}
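To see how OutOfOrderCapMax drives the chunk cut above, here is a throwaway sketch of the cadence, assuming a cap of 30 (the value TestOOOMmapReplay further down configures): inserting 92 samples m-maps a full chunk three times and leaves two samples in the in-memory OOO head chunk.

package main

import "fmt"

func main() {
	const capMax = 30 // assumed OutOfOrderCapMax, as in TestOOOMmapReplay
	mmapped, inHead := 0, 0
	for i := 1; i <= 92; i++ {
		if inHead == capMax { // a full OOO head chunk is m-mapped before inserting
			mmapped++
			inHead = 0
		}
		inHead++
	}
	fmt.Printf("%d m-mapped OOO chunks, %d samples left in the OOO head chunk\n", mmapped, inHead)
	// Output: 3 m-mapped OOO chunks, 2 samples left in the OOO head chunk
}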
// append adds the sample (t, v) to the series. The caller also has to provide
// the appendID for isolation. (The appendID can be zero, which results in no
// isolation for this append.)
@ -567,7 +760,7 @@ func (s *memSeries) append(t int64, v float64, appendID uint64, chunkDiskMapper
// Out of order sample. Sample timestamp is already in the mmapped chunks, so ignore it.
return false, false
}
// There is no head chunk in this series yet, create the first chunk for the sample.
c = s.cutNewHeadChunk(t, chunkDiskMapper)
chunkCreated = true
}
@ -651,6 +844,36 @@ func (s *memSeries) cutNewHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDis
return s.headChunk
}
func (s *memSeries) cutNewOOOHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDiskMapper) (*oooHeadChunk, chunks.ChunkDiskMapperRef) {
ref := s.mmapCurrentOOOHeadChunk(chunkDiskMapper)
s.oooHeadChunk = &oooHeadChunk{
chunk: NewOOOChunk(),
minTime: mint,
maxTime: math.MinInt64,
}
return s.oooHeadChunk, ref
}
func (s *memSeries) mmapCurrentOOOHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper) chunks.ChunkDiskMapperRef {
if s.oooHeadChunk == nil {
// There is no head chunk, so nothing to m-map here.
return 0
}
xor, _ := s.oooHeadChunk.chunk.ToXOR() // Encode to XorChunk which is more compact and implements all of the needed functionality.
oooXor := &chunkenc.OOOXORChunk{XORChunk: xor}
chunkRef := chunkDiskMapper.WriteChunk(s.ref, s.oooHeadChunk.minTime, s.oooHeadChunk.maxTime, oooXor, handleChunkWriteError)
s.oooMmappedChunks = append(s.oooMmappedChunks, &mmappedChunk{
ref: chunkRef,
numSamples: uint16(xor.NumSamples()),
minTime: s.oooHeadChunk.minTime,
maxTime: s.oooHeadChunk.maxTime,
})
s.oooHeadChunk = nil
return chunkRef
}
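One subtlety worth noting in mmapCurrentOOOHeadChunk: the samples are serialized as a plain XOR chunk, but wrapped in chunkenc.OOOXORChunk before being handed to the chunk disk mapper. The wrapper changes the encoding recorded for the m-mapped chunk, which is presumably what lets replay tell OOO chunks apart from in-order ones and route them into oooMmappedChunks (note the IterateAllChunks callback in newTestHead further down now receives a chunkenc.Encoding).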
func (s *memSeries) mmapCurrentHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper) {
if s.headChunk == nil {
// There is no head chunk, so nothing to m-map here.


@ -30,7 +30,7 @@ func BenchmarkHeadStripeSeriesCreate(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
defer h.Close()
@ -45,7 +45,7 @@ func BenchmarkHeadStripeSeriesCreateParallel(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
defer h.Close()
@ -69,7 +69,7 @@ func BenchmarkHeadStripeSeriesCreate_PreCreationFailure(b *testing.B) {
// Mock the PreCreation() callback to fail on each series.
opts.SeriesCallback = failingSeriesLifecycleCallback{}
h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
defer h.Close()


@ -183,11 +183,20 @@ func (h *headIndexReader) Series(ref storage.SeriesRef, lbls *labels.Labels, chk
return nil
}
// headChunkID returns the HeadChunkID referred to by the given position.
// * 0 <= pos < len(s.mmappedChunks) refers to s.mmappedChunks[pos]
// * pos == len(s.mmappedChunks) refers to s.headChunk
func (s *memSeries) headChunkID(pos int) chunks.HeadChunkID {
return chunks.HeadChunkID(pos) + s.firstChunkID
}
// oooHeadChunkID returns the HeadChunkID referred to by the given position.
// * 0 <= pos < len(s.oooMmappedChunks) refers to s.oooMmappedChunks[pos]
// * pos == len(s.oooMmappedChunks) refers to s.oooHeadChunk
func (s *memSeries) oooHeadChunkID(pos int) chunks.HeadChunkID {
return chunks.HeadChunkID(pos) + s.firstOOOChunkID
}
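A quick worked illustration of the offset arithmetic above, with hypothetical values:

// Hypothetical: firstOOOChunkID = 5, len(s.oooMmappedChunks) = 3.
//   oooHeadChunkID(0) == 5 // s.oooMmappedChunks[0]
//   oooHeadChunkID(1) == 6 // s.oooMmappedChunks[1]
//   oooHeadChunkID(2) == 7 // s.oooMmappedChunks[2]
//   oooHeadChunkID(3) == 8 // s.oooHeadChunk (the in-memory one)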
// LabelValueFor returns label value for the given label name in the series referred to by ID.
func (h *headIndexReader) LabelValueFor(id storage.SeriesRef, label string) (string, error) {
memSeries := h.head.series.getByID(chunks.HeadSeriesRef(id))
@ -258,8 +267,8 @@ func (h *headChunkReader) Close() error {
}
// Chunk returns the chunk for the given chunk meta.
func (h *headChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
sid, cid := chunks.HeadChunkRef(meta.Ref).Unpack()
s := h.head.series.getByID(sid)
// This means that the series has been garbage collected.
@ -330,6 +339,260 @@ func (s *memSeries) chunk(id chunks.HeadChunkID, chunkDiskMapper *chunks.ChunkDi
return mc, true, nil
}
// oooMergedChunk returns the requested chunk based on the given chunks.Meta
// reference from memory or by m-mapping it from the disk. The returned chunk
// might be a merge of all the overlapping chunks, if any, amongst all the
// chunks in the OOOHead.
// This function is not thread safe unless the caller holds a lock.
func (s *memSeries) oooMergedChunk(meta chunks.Meta, cdm *chunks.ChunkDiskMapper, mint, maxt int64) (chunk *mergedOOOChunks, err error) {
_, cid := chunks.HeadChunkRef(meta.Ref).Unpack()
// ix represents the index of the chunk in the s.oooMmappedChunks slice. Chunk IDs are
// incremented by 1 each time a new chunk is created, hence (cid - firstOOOChunkID) gives the slice index.
// The max index for the s.oooMmappedChunks slice can be len(s.oooMmappedChunks)-1, hence if ix
// is len(s.oooMmappedChunks), it represents the next chunk, which is the ooo head chunk.
ix := int(cid) - int(s.firstOOOChunkID)
if ix < 0 || ix > len(s.oooMmappedChunks) {
return nil, storage.ErrNotFound
}
if ix == len(s.oooMmappedChunks) {
if s.oooHeadChunk == nil {
return nil, errors.New("invalid ooo head chunk")
}
}
// We create a temporary slice of chunk metas to hold the information of all
// possible chunks that may overlap with the requested chunk.
tmpChks := make([]chunkMetaAndChunkDiskMapperRef, 0, len(s.oooMmappedChunks))
oooHeadRef := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.oooMmappedChunks))))
if s.oooHeadChunk != nil && s.oooHeadChunk.OverlapsClosedInterval(mint, maxt) {
// We only want to append the head chunk if this chunk existed when
// Series() was called. This brings consistency in case new data
// is added in between Series() and Chunk() calls.
if oooHeadRef == meta.OOOLastRef {
tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{
meta: chunks.Meta{
// Ignoring samples added before and after the last known min and max time for this chunk.
MinTime: meta.OOOLastMinTime,
MaxTime: meta.OOOLastMaxTime,
Ref: oooHeadRef,
},
})
}
}
for i, c := range s.oooMmappedChunks {
chunkRef := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(i)))
// We can skip chunks that came in later than the last known OOOLastRef.
if chunkRef > meta.OOOLastRef {
break
}
if chunkRef == meta.OOOLastRef {
tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{
meta: chunks.Meta{
MinTime: meta.OOOLastMinTime,
MaxTime: meta.OOOLastMaxTime,
Ref: chunkRef,
},
ref: c.ref,
origMinT: c.minTime,
origMaxT: c.maxTime,
})
} else if c.OverlapsClosedInterval(mint, maxt) {
tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{
meta: chunks.Meta{
MinTime: c.minTime,
MaxTime: c.maxTime,
Ref: chunkRef,
},
ref: c.ref,
})
}
}
// Next we want to sort all the collected chunks by min time so we can find
// those that overlap and stop when we know the rest don't.
sort.Sort(byMinTimeAndMinRef(tmpChks))
mc := &mergedOOOChunks{}
absoluteMax := int64(math.MinInt64)
for _, c := range tmpChks {
if c.meta.Ref != meta.Ref && (len(mc.chunks) == 0 || c.meta.MinTime > absoluteMax) {
continue
}
if c.meta.Ref == oooHeadRef {
var xor *chunkenc.XORChunk
// If head chunk min and max time match the meta OOO markers
// that means that the chunk has not expanded so we can append
// it as it is.
if s.oooHeadChunk.minTime == meta.OOOLastMinTime && s.oooHeadChunk.maxTime == meta.OOOLastMaxTime {
xor, err = s.oooHeadChunk.chunk.ToXOR() // TODO(jesus.vazquez) (This is an optimization idea that has no priority and might not be that useful) See if we could use a copy of the underlying slice. That would leave the more expensive ToXOR() function only for the usecase where Bytes() is called.
} else {
// We need to remove samples that are outside of the markers
xor, err = s.oooHeadChunk.chunk.ToXORBetweenTimestamps(meta.OOOLastMinTime, meta.OOOLastMaxTime)
}
if err != nil {
return nil, errors.Wrap(err, "failed to convert ooo head chunk to xor chunk")
}
c.meta.Chunk = xor
} else {
chk, err := cdm.Chunk(c.ref)
if err != nil {
if _, ok := err.(*chunks.CorruptionErr); ok {
return nil, errors.Wrap(err, "invalid ooo mmapped chunk")
}
return nil, err
}
if c.meta.Ref == meta.OOOLastRef &&
(c.origMinT != meta.OOOLastMinTime || c.origMaxT != meta.OOOLastMaxTime) {
// The head chunk expanded and was memory-mapped, so now we need to
// wrap the chunk in one that doesn't allow iterating
// through samples outside the OOOLastMinT and OOOLastMaxT
// markers.
c.meta.Chunk = boundedChunk{chk, meta.OOOLastMinTime, meta.OOOLastMaxTime}
} else {
c.meta.Chunk = chk
}
}
mc.chunks = append(mc.chunks, c.meta)
if c.meta.MaxTime > absoluteMax {
absoluteMax = c.meta.MaxTime
}
}
return mc, nil
}
var _ chunkenc.Chunk = &mergedOOOChunks{}
// mergedOOOChunks holds the list of overlapping chunks. This struct satisfies
// chunkenc.Chunk.
type mergedOOOChunks struct {
chunks []chunks.Meta
}
// Bytes is a very expensive method because it iterates over all the chunks
// in the mergedOOOChunk and builds a new chunk with their samples.
func (o mergedOOOChunks) Bytes() []byte {
xc := chunkenc.NewXORChunk()
app, err := xc.Appender()
if err != nil {
panic(err)
}
it := o.Iterator(nil)
for it.Next() {
t, v := it.At()
app.Append(t, v)
}
return xc.Bytes()
}
func (o mergedOOOChunks) Encoding() chunkenc.Encoding {
return chunkenc.EncXOR
}
func (o mergedOOOChunks) Appender() (chunkenc.Appender, error) {
return nil, errors.New("can't append to mergedOOOChunks")
}
func (o mergedOOOChunks) Iterator(iterator chunkenc.Iterator) chunkenc.Iterator {
iterators := make([]chunkenc.Iterator, 0, len(o.chunks))
for _, c := range o.chunks {
iterators = append(iterators, c.Chunk.Iterator(nil))
}
return storage.NewChainSampleIterator(iterators)
}
func (o mergedOOOChunks) NumSamples() int {
samples := 0
for _, c := range o.chunks {
samples += c.Chunk.NumSamples()
}
return samples
}
func (o mergedOOOChunks) Compact() {}
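mergedOOOChunks delegates the actual merge to storage.NewChainSampleIterator, so the behaviour is easy to poke at in isolation. A minimal standalone sketch (made-up timestamps and values) that merges two overlapping XOR chunks the same way Iterator above does:

package main

import (
	"fmt"

	"github.com/prometheus/prometheus/storage"
	"github.com/prometheus/prometheus/tsdb/chunkenc"
)

// xorChunk builds a chunk with one sample per timestamp, value == timestamp.
func xorChunk(timestamps ...int64) chunkenc.Chunk {
	c := chunkenc.NewXORChunk()
	app, err := c.Appender()
	if err != nil {
		panic(err)
	}
	for _, t := range timestamps {
		app.Append(t, float64(t))
	}
	return c
}

func main() {
	a := xorChunk(1, 3, 5) // overlaps with b
	b := xorChunk(2, 4, 6)
	it := storage.NewChainSampleIterator([]chunkenc.Iterator{a.Iterator(nil), b.Iterator(nil)})
	for it.Next() {
		t, v := it.At()
		fmt.Println(t, v) // samples come out in timestamp order: 1..6
	}
}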
var _ chunkenc.Chunk = &boundedChunk{}
// boundedChunk is an implementation of chunkenc.Chunk that uses a
// boundedIterator, which only iterates through samples whose timestamps are
// >= minT and <= maxT.
type boundedChunk struct {
chunkenc.Chunk
minT int64
maxT int64
}
func (b boundedChunk) Bytes() []byte {
xor := chunkenc.NewXORChunk()
a, _ := xor.Appender()
it := b.Iterator(nil)
for it.Next() {
t, v := it.At()
a.Append(t, v)
}
return xor.Bytes()
}
func (b boundedChunk) Iterator(iterator chunkenc.Iterator) chunkenc.Iterator {
it := b.Chunk.Iterator(iterator)
if it == nil {
panic("iterator shouldn't be nil")
}
return boundedIterator{it, b.minT, b.maxT}
}
var _ chunkenc.Iterator = &boundedIterator{}
// boundedIterator is an implementation of Iterator that only iterates through
// samples whose timestamps are >= minT and <= maxT.
type boundedIterator struct {
chunkenc.Iterator
minT int64
maxT int64
}
// Next, the first time it is called, advances as many positions as necessary
// until it finds a sample within the bounds minT and maxT.
// If there are samples within bounds, it advances through them one by one.
// If there are no samples within bounds, it returns false.
func (b boundedIterator) Next() bool {
for b.Iterator.Next() {
t, _ := b.Iterator.At()
if t < b.minT {
continue
} else if t > b.maxT {
return false
}
return true
}
return false
}
func (b boundedIterator) Seek(t int64) bool {
if t < b.minT {
// We must seek at least up to b.minT if it is asked for something before that.
ok := b.Iterator.Seek(b.minT)
if !ok {
return false
}
t, _ := b.Iterator.At()
return t <= b.maxT
}
if t > b.maxT {
// We seek anyway so that the subsequent Next() calls will also return false.
b.Iterator.Seek(t)
return false
}
return b.Iterator.Seek(t)
}
// safeChunk makes sure that the chunk can be accessed without a race condition
type safeChunk struct {
chunkenc.Chunk
s *memSeries

tsdb/head_read_test.go (new file)

@ -0,0 +1,178 @@
// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"fmt"
"testing"
"github.com/stretchr/testify/require"
"github.com/prometheus/prometheus/tsdb/chunkenc"
)
func TestBoundedChunk(t *testing.T) {
tests := []struct {
name string
inputChunk chunkenc.Chunk
inputMinT int64
inputMaxT int64
initialSeek int64
seekIsASuccess bool
expSamples []sample
}{
{
name: "if there are no samples it returns nothing",
inputChunk: newTestChunk(0),
expSamples: nil,
},
{
name: "bounds represent a single sample",
inputChunk: newTestChunk(10),
expSamples: []sample{
{0, 0},
},
},
{
name: "if there are bounds set only samples within them are returned",
inputChunk: newTestChunk(10),
inputMinT: 1,
inputMaxT: 8,
expSamples: []sample{
{1, 1},
{2, 2},
{3, 3},
{4, 4},
{5, 5},
{6, 6},
{7, 7},
{8, 8},
},
},
{
name: "if bounds set and only maxt is less than actual maxt",
inputChunk: newTestChunk(10),
inputMinT: 0,
inputMaxT: 5,
expSamples: []sample{
{0, 0},
{1, 1},
{2, 2},
{3, 3},
{4, 4},
{5, 5},
},
},
{
name: "if bounds set and only mint is more than actual mint",
inputChunk: newTestChunk(10),
inputMinT: 5,
inputMaxT: 9,
expSamples: []sample{
{5, 5},
{6, 6},
{7, 7},
{8, 8},
{9, 9},
},
},
{
name: "if there are bounds set with seek before mint",
inputChunk: newTestChunk(10),
inputMinT: 3,
inputMaxT: 7,
initialSeek: 1,
seekIsASuccess: true,
expSamples: []sample{
{3, 3},
{4, 4},
{5, 5},
{6, 6},
{7, 7},
},
},
{
name: "if there are bounds set with seek between mint and maxt",
inputChunk: newTestChunk(10),
inputMinT: 3,
inputMaxT: 7,
initialSeek: 5,
seekIsASuccess: true,
expSamples: []sample{
{5, 5},
{6, 6},
{7, 7},
},
},
{
name: "if there are bounds set with seek after maxt",
inputChunk: newTestChunk(10),
inputMinT: 3,
inputMaxT: 7,
initialSeek: 8,
seekIsASuccess: false,
},
}
for _, tc := range tests {
t.Run(fmt.Sprintf("name=%s", tc.name), func(t *testing.T) {
chunk := boundedChunk{tc.inputChunk, tc.inputMinT, tc.inputMaxT}
// Testing Bytes()
expChunk := chunkenc.NewXORChunk()
if tc.inputChunk.NumSamples() > 0 {
app, err := expChunk.Appender()
require.NoError(t, err)
for ts := tc.inputMinT; ts <= tc.inputMaxT; ts++ {
app.Append(ts, float64(ts))
}
}
require.Equal(t, expChunk.Bytes(), chunk.Bytes())
var samples []sample
it := chunk.Iterator(nil)
if tc.initialSeek != 0 {
// Testing Seek()
ok := it.Seek(tc.initialSeek)
require.Equal(t, tc.seekIsASuccess, ok)
if ok {
t, v := it.At()
samples = append(samples, sample{t, v})
}
}
// Testing Next()
for it.Next() {
t, v := it.At()
samples = append(samples, sample{t, v})
}
// it.Next() should keep returning false.
for i := 0; i < 10; i++ {
require.False(t, it.Next())
}
require.Equal(t, tc.expSamples, samples)
})
}
}
func newTestChunk(numSamples int) chunkenc.Chunk {
xor := chunkenc.NewXORChunk()
a, _ := xor.Appender()
for i := 0; i < numSamples; i++ {
a.Append(int64(i), float64(i))
}
return xor
}


@ -49,7 +49,7 @@ import (
"github.com/prometheus/prometheus/tsdb/wal" "github.com/prometheus/prometheus/tsdb/wal"
) )
func newTestHead(t testing.TB, chunkRange int64, compressWAL bool) (*Head, *wal.WAL) { func newTestHead(t testing.TB, chunkRange int64, compressWAL, oooEnabled bool) (*Head, *wal.WAL) {
dir := t.TempDir() dir := t.TempDir()
wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compressWAL) wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compressWAL)
require.NoError(t, err) require.NoError(t, err)
@ -59,18 +59,23 @@ func newTestHead(t testing.TB, chunkRange int64, compressWAL bool) (*Head, *wal.
opts.ChunkDirRoot = dir
opts.EnableExemplarStorage = true
opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars)
if oooEnabled {
opts.OutOfOrderTimeWindow.Store(10 * time.Minute.Milliseconds())
}
h, err := NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, h.chunkDiskMapper.IterateAllChunks(func(_ chunks.HeadSeriesRef, _ chunks.ChunkDiskMapperRef, _, _ int64, _ uint16, _ chunkenc.Encoding) error {
return nil
}))
return h, wlog
}
func BenchmarkCreateSeries(b *testing.B) {
series := genSeries(b.N, 10, 0, 0)
h, _ := newTestHead(b, 10000, false, false)
defer func() {
require.NoError(b, h.Close())
}()
@ -224,7 +229,7 @@ func BenchmarkLoadWAL(b *testing.B) {
require.NoError(b, err)
for k := 0; k < c.batches*c.seriesPerBatch; k++ {
// Create one mmapped chunk per series, with one sample at the given time.
s := newMemSeries(labels.Labels{}, chunks.HeadSeriesRef(k)*101, c.mmappedChunkT, 1, defaultIsolationDisabled)
s.append(c.mmappedChunkT, 42, 0, chunkDiskMapper)
s.mmapCurrentHeadChunk(chunkDiskMapper)
}
@ -255,7 +260,7 @@ func BenchmarkLoadWAL(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = w.Dir()
h, err := NewHead(nil, nil, w, nil, opts, nil)
require.NoError(b, err)
h.Init(0)
}
@ -271,7 +276,7 @@ func BenchmarkLoadWAL(b *testing.B) {
// While appending the samples to the head it concurrently queries them from multiple go routines and verifies that the
// returned results are correct.
func TestHead_HighConcurrencyReadAndWrite(t *testing.T) {
head, _ := newTestHead(t, DefaultBlockDuration, false, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -487,7 +492,7 @@ func TestHead_ReadWAL(t *testing.T) {
},
}
head, w := newTestHead(t, 1000, compress, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -531,7 +536,7 @@ func TestHead_ReadWAL(t *testing.T) {
}
func TestHead_WALMultiRef(t *testing.T) {
head, w := newTestHead(t, 1000, false, false)
require.NoError(t, head.Init(0))
@ -572,7 +577,7 @@ func TestHead_WALMultiRef(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = w.Dir()
head, err = NewHead(nil, nil, w, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, head.Init(0))
defer func() {
@ -591,7 +596,7 @@ func TestHead_WALMultiRef(t *testing.T) {
}
func TestHead_ActiveAppenders(t *testing.T) {
head, _ := newTestHead(t, 1000, false, false)
defer head.Close()
require.NoError(t, head.Init(0))
@ -624,14 +629,14 @@ func TestHead_ActiveAppenders(t *testing.T) {
}
func TestHead_UnknownWALRecord(t *testing.T) {
head, w := newTestHead(t, 1000, false, false)
w.Log([]byte{255, 42})
require.NoError(t, head.Init(0))
require.NoError(t, head.Close())
}
func TestHead_Truncate(t *testing.T) {
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -733,7 +738,7 @@ func TestMemSeries_truncateChunks(t *testing.T) {
},
}
s := newMemSeries(labels.FromStrings("a", "b"), 1, 2000, 1, defaultIsolationDisabled)
for i := 0; i < 4000; i += 5 {
ok, _ := s.append(int64(i), float64(i), 0, chunkDiskMapper)
@ -752,7 +757,7 @@ func TestMemSeries_truncateChunks(t *testing.T) {
require.NotNil(t, chk)
require.NoError(t, err)
s.truncateChunksBefore(2000, 0)
require.Equal(t, int64(2000), s.mmappedChunks[0].minTime)
_, _, err = s.chunk(0, chunkDiskMapper, &memChunkPool)
@ -789,7 +794,7 @@ func TestHeadDeleteSeriesWithoutSamples(t *testing.T) {
{Ref: 50, T: 90, V: 1},
},
}
head, w := newTestHead(t, 1000, compress, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -857,7 +862,8 @@ func TestHeadDeleteSimple(t *testing.T) {
for _, compress := range []bool{false, true} {
t.Run(fmt.Sprintf("compress=%t", compress), func(t *testing.T) {
for _, c := range cases {
head, w := newTestHead(t, 1000, compress, false)
require.NoError(t, head.Init(0))
app := head.Appender(context.Background())
for _, smpl := range smplsAll {
@ -887,7 +893,7 @@ func TestHeadDeleteSimple(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = reloadedW.Dir()
reloadedHead, err := NewHead(nil, nil, reloadedW, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, reloadedHead.Init(0))
@ -937,7 +943,7 @@ func TestHeadDeleteSimple(t *testing.T) {
}
func TestDeleteUntilCurMax(t *testing.T) {
hb, _ := newTestHead(t, 1000000, false, false)
defer func() {
require.NoError(t, hb.Close())
}()
@ -990,7 +996,7 @@ func TestDeletedSamplesAndSeriesStillInWALAfterCheckpoint(t *testing.T) {
numSamples := 10000
// Enough samples to cause a checkpoint.
hb, w := newTestHead(t, int64(numSamples)*10, false, false)
for i := 0; i < numSamples; i++ {
app := hb.Appender(context.Background())
@ -1082,7 +1088,7 @@ func TestDelete_e2e(t *testing.T) {
seriesMap[labels.New(l...).String()] = []tsdbutil.Sample{}
}
hb, _ := newTestHead(t, 100000, false, false)
defer func() {
require.NoError(t, hb.Close())
}()
@ -1271,7 +1277,7 @@ func TestMemSeries_append(t *testing.T) {
require.NoError(t, chunkDiskMapper.Close())
}()
s := newMemSeries(labels.Labels{}, 1, 500, 1, defaultIsolationDisabled)
// Add first two samples at the very end of a chunk range and the next two
// on and after it.
@ -1325,7 +1331,7 @@ func TestMemSeries_append_atVariableRate(t *testing.T) {
require.NoError(t, chunkDiskMapper.Close())
})
s := newMemSeries(labels.Labels{}, 1, DefaultBlockDuration, 0, defaultIsolationDisabled)
// At this slow rate, we will fill the chunk in two block durations.
slowRate := (DefaultBlockDuration * 2) / samplesPerChunk
@ -1361,7 +1367,7 @@ func TestMemSeries_append_atVariableRate(t *testing.T) {
func TestGCChunkAccess(t *testing.T) {
// Put a chunk, select it. GC it and then access it.
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1398,22 +1404,22 @@ func TestGCChunkAccess(t *testing.T) {
cr, err := h.chunksRange(0, 1500, nil)
require.NoError(t, err)
_, err = cr.Chunk(chunks[0])
require.NoError(t, err)
_, err = cr.Chunk(chunks[1])
require.NoError(t, err)
require.NoError(t, h.Truncate(1500)) // Remove a chunk.
_, err = cr.Chunk(chunks[0])
require.Equal(t, storage.ErrNotFound, err)
_, err = cr.Chunk(chunks[1])
require.NoError(t, err)
}
func TestGCSeriesAccess(t *testing.T) {
// Put a series, select it. GC it and then access it.
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1450,23 +1456,23 @@ func TestGCSeriesAccess(t *testing.T) {
cr, err := h.chunksRange(0, 2000, nil)
require.NoError(t, err)
_, err = cr.Chunk(chunks[0])
require.NoError(t, err)
_, err = cr.Chunk(chunks[1])
require.NoError(t, err)
require.NoError(t, h.Truncate(2000)) // Remove the series.
require.Equal(t, (*memSeries)(nil), h.series.getByID(1))
_, err = cr.Chunk(chunks[0])
require.Equal(t, storage.ErrNotFound, err)
_, err = cr.Chunk(chunks[1])
require.Equal(t, storage.ErrNotFound, err)
}
func TestUncommittedSamplesNotLostOnTruncate(t *testing.T) {
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1496,7 +1502,7 @@ func TestUncommittedSamplesNotLostOnTruncate(t *testing.T) {
}
func TestRemoveSeriesAfterRollbackAndTruncate(t *testing.T) {
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1529,7 +1535,7 @@ func TestRemoveSeriesAfterRollbackAndTruncate(t *testing.T) {
func TestHead_LogRollback(t *testing.T) {
for _, compress := range []bool{false, true} {
t.Run(fmt.Sprintf("compress=%t", compress), func(t *testing.T) {
h, w := newTestHead(t, 1000, compress, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1606,7 +1612,7 @@ func TestWalRepair_DecodingError(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1
opts.ChunkDirRoot = w.Dir()
h, err := NewHead(nil, nil, w, nil, opts, nil)
require.NoError(t, err)
require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal))
initErr := h.Init(math.MinInt64)
@ -1660,7 +1666,8 @@ func TestHeadReadWriterRepair(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = chunkRange
opts.ChunkDirRoot = dir
opts.ChunkWriteQueueSize = 1 // We need to set this option so that we use the async queue. Upstream prometheus uses the queue directly.
h, err := NewHead(nil, nil, w, nil, opts, nil)
require.NoError(t, err)
require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.mmapChunkCorruptionTotal))
require.NoError(t, h.Init(math.MinInt64))
@ -1715,7 +1722,7 @@ func TestHeadReadWriterRepair(t *testing.T) {
}
func TestNewWalSegmentOnTruncate(t *testing.T) {
h, wlog := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1745,7 +1752,7 @@ func TestNewWalSegmentOnTruncate(t *testing.T) {
}
func TestAddDuplicateLabelName(t *testing.T) {
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1828,7 +1835,7 @@ func TestMemSeriesIsolation(t *testing.T) {
}
// Test isolation without restart of Head.
hb, _ := newTestHead(t, 1000, false, false)
i := addSamples(hb)
testIsolation(hb, i)
@ -1890,7 +1897,7 @@ func TestMemSeriesIsolation(t *testing.T) {
require.NoError(t, hb.Close())
// Test isolation with restart of Head. This is to verify the num samples of chunks after m-map chunk replay.
hb, w := newTestHead(t, 1000, false, false)
i = addSamples(hb)
require.NoError(t, hb.Close())
@ -1899,7 +1906,7 @@ func TestMemSeriesIsolation(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = wlog.Dir()
hb, err = NewHead(nil, nil, wlog, nil, opts, nil)
defer func() { require.NoError(t, hb.Close()) }()
require.NoError(t, err)
require.NoError(t, hb.Init(0))
@ -1943,7 +1950,7 @@ func TestIsolationRollback(t *testing.T) {
}
// Rollback after a failed append and test if the low watermark has progressed anyway.
hb, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, hb.Close())
}()
@ -1974,7 +1981,7 @@ func TestIsolationLowWatermarkMonotonous(t *testing.T) {
t.Skip("skipping test since tsdb isolation is disabled")
}
hb, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, hb.Close())
}()
@ -2011,7 +2018,7 @@ func TestIsolationAppendIDZeroIsNoop(t *testing.T) {
t.Skip("skipping test since tsdb isolation is disabled")
}
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -2036,7 +2043,7 @@ func TestIsolationWithoutAdd(t *testing.T) {
t.Skip("skipping test since tsdb isolation is disabled")
}
hb, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, hb.Close())
}()
@ -2131,7 +2138,7 @@ func TestOutOfOrderSamplesMetric(t *testing.T) {
}
func testHeadSeriesChunkRace(t *testing.T) {
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -2166,7 +2173,7 @@ func testHeadSeriesChunkRace(t *testing.T) {
}
func TestHeadLabelNamesValuesWithMinMaxRange(t *testing.T) {
head, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -2226,7 +2233,7 @@ func TestHeadLabelNamesValuesWithMinMaxRange(t *testing.T) {
}
func TestHeadLabelValuesWithMatchers(t *testing.T) {
head, _ := newTestHead(t, 1000, false, false)
t.Cleanup(func() { require.NoError(t, head.Close()) })
app := head.Appender(context.Background())
@ -2285,7 +2292,7 @@ func TestHeadLabelValuesWithMatchers(t *testing.T) {
}
func TestHeadLabelNamesWithMatchers(t *testing.T) {
head, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -2353,7 +2360,7 @@ func TestHeadLabelNamesWithMatchers(t *testing.T) {
}
func TestErrReuseAppender(t *testing.T) {
head, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -2389,7 +2396,7 @@ func TestErrReuseAppender(t *testing.T) {
func TestHeadMintAfterTruncation(t *testing.T) {
chunkRange := int64(2000)
head, _ := newTestHead(t, chunkRange, false, false)
app := head.Appender(context.Background())
_, err := app.Append(0, labels.FromStrings("a", "b"), 100, 100)
@ -2423,7 +2430,7 @@ func TestHeadMintAfterTruncation(t *testing.T) {
func TestHeadExemplars(t *testing.T) {
chunkRange := int64(2000)
head, _ := newTestHead(t, chunkRange, false, false)
app := head.Appender(context.Background())
l := labels.FromStrings("traceId", "123")
@ -2445,7 +2452,7 @@ func TestHeadExemplars(t *testing.T) {
func BenchmarkHeadLabelValuesWithMatchers(b *testing.B) {
chunkRange := int64(2000)
head, _ := newTestHead(b, chunkRange, false, false)
b.Cleanup(func() { require.NoError(b, head.Close()) })
app := head.Appender(context.Background())
@ -2483,7 +2490,7 @@ func TestMemSafeIteratorSeekIntoBuffer(t *testing.T) {
require.NoError(t, chunkDiskMapper.Close())
}()
s := newMemSeries(labels.Labels{}, 1, 500, 1, defaultIsolationDisabled)
for i := 0; i < 7; i++ {
ok, _ := s.append(int64(i), float64(i), 0, chunkDiskMapper)
@ -2754,7 +2761,7 @@ func TestWaitForPendingReadersInTimeRange(t *testing.T) {
}
func TestChunkSnapshot(t *testing.T) {
head, _ := newTestHead(t, 120*4, false, false)
defer func() {
head.opts.EnableMemorySnapshotOnShutdown = false
require.NoError(t, head.Close())
@ -2833,7 +2840,7 @@ func TestChunkSnapshot(t *testing.T) {
openHeadAndCheckReplay := func() {
w, err := wal.NewSize(nil, nil, head.wal.Dir(), 32768, false)
require.NoError(t, err)
head, err = NewHead(nil, nil, w, nil, head.opts, nil)
require.NoError(t, err)
require.NoError(t, head.Init(math.MinInt64))
@ -2996,7 +3003,7 @@ func TestChunkSnapshot(t *testing.T) {
}
func TestSnapshotError(t *testing.T) {
head, _ := newTestHead(t, 120*4, false, false)
defer func() {
head.opts.EnableMemorySnapshotOnShutdown = false
require.NoError(t, head.Close())
@ -3043,7 +3050,7 @@ func TestSnapshotError(t *testing.T) {
w, err := wal.NewSize(nil, nil, head.wal.Dir(), 32768, false)
require.NoError(t, err)
// Testing https://github.com/prometheus/prometheus/issues/9437 with the registry.
head, err = NewHead(prometheus.NewRegistry(), nil, w, nil, head.opts, nil)
require.NoError(t, err)
require.NoError(t, head.Init(math.MinInt64))
@ -3102,7 +3109,7 @@ func TestChunkSnapshotReplayBug(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkDirRoot = dir
opts.EnableMemorySnapshotOnShutdown = true
head, err := NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, head.Init(math.MinInt64))
defer func() {
@ -3136,7 +3143,7 @@ func TestChunkSnapshotTakenAfterIncompleteSnapshot(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkDirRoot = dir
opts.EnableMemorySnapshotOnShutdown = true
head, err := NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, head.Init(math.MinInt64))
@ -3159,6 +3166,251 @@ func TestChunkSnapshotTakenAfterIncompleteSnapshot(t *testing.T) {
require.Greater(t, offset, 0)
}
// TestOOOWalReplay checks the replay at a low level.
// TODO(codesome): Needs test for ooo WAL repair.
func TestOOOWalReplay(t *testing.T) {
dir := t.TempDir()
wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = dir
opts.OutOfOrderTimeWindow.Store(30 * time.Minute.Milliseconds())
h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
var expOOOSamples []sample
l := labels.FromStrings("foo", "bar")
appendSample := func(mins int64, isOOO bool) {
app := h.Appender(context.Background())
ts, v := mins*time.Minute.Milliseconds(), float64(mins)
_, err := app.Append(0, l, ts, v)
require.NoError(t, err)
require.NoError(t, app.Commit())
if isOOO {
expOOOSamples = append(expOOOSamples, sample{t: ts, v: v})
}
}
// In-order sample.
appendSample(60, false)
// Out of order samples.
appendSample(40, true)
appendSample(35, true)
appendSample(50, true)
appendSample(55, true)
appendSample(59, true)
appendSample(31, true)
// Check that Head's time ranges are set properly.
require.Equal(t, 60*time.Minute.Milliseconds(), h.MinTime())
require.Equal(t, 60*time.Minute.Milliseconds(), h.MaxTime())
require.Equal(t, 31*time.Minute.Milliseconds(), h.MinOOOTime())
require.Equal(t, 59*time.Minute.Milliseconds(), h.MaxOOOTime())
// Restart head.
require.NoError(t, h.Close())
wlog, err = wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err = wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
h, err = NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0)) // Replay happens here.
// Get the ooo samples from the Head.
ms, ok, err := h.getOrCreate(l.Hash(), l)
require.NoError(t, err)
require.False(t, ok)
require.NotNil(t, ms)
xor, err := ms.oooHeadChunk.chunk.ToXOR()
require.NoError(t, err)
it := xor.Iterator(nil)
actOOOSamples := make([]sample, 0, len(expOOOSamples))
for it.Next() {
ts, v := it.At()
actOOOSamples = append(actOOOSamples, sample{t: ts, v: v})
}
// OOO chunk will be sorted. Hence sort the expected samples.
sort.Slice(expOOOSamples, func(i, j int) bool {
return expOOOSamples[i].t < expOOOSamples[j].t
})
require.Equal(t, expOOOSamples, actOOOSamples)
require.NoError(t, h.Close())
}
// TestOOOMmapReplay checks the replay at a low level.
func TestOOOMmapReplay(t *testing.T) {
dir := t.TempDir()
wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = dir
opts.OutOfOrderCapMax.Store(30)
opts.OutOfOrderTimeWindow.Store(1000 * time.Minute.Milliseconds())
h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
l := labels.FromStrings("foo", "bar")
appendSample := func(mins int64) {
app := h.Appender(context.Background())
ts, v := mins*time.Minute.Milliseconds(), float64(mins)
_, err := app.Append(0, l, ts, v)
require.NoError(t, err)
require.NoError(t, app.Commit())
}
// In-order sample.
appendSample(200)
// Out of order samples. 92 samples to create 3 m-map chunks.
for mins := int64(100); mins <= 191; mins++ {
appendSample(mins)
}
ms, ok, err := h.getOrCreate(l.Hash(), l)
require.NoError(t, err)
require.False(t, ok)
require.NotNil(t, ms)
require.Len(t, ms.oooMmappedChunks, 3)
// Verify that we can access the chunks without error.
for _, m := range ms.oooMmappedChunks {
chk, err := h.chunkDiskMapper.Chunk(m.ref)
require.NoError(t, err)
require.Equal(t, int(m.numSamples), chk.NumSamples())
}
expMmapChunks := make([]*mmappedChunk, 3)
copy(expMmapChunks, ms.oooMmappedChunks)
// Restart head.
require.NoError(t, h.Close())
wlog, err = wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err = wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
h, err = NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0)) // Replay happens here.
// Get the mmap chunks from the Head.
ms, ok, err = h.getOrCreate(l.Hash(), l)
require.NoError(t, err)
require.False(t, ok)
require.NotNil(t, ms)
require.Len(t, ms.oooMmappedChunks, len(expMmapChunks))
// Verify that we can access the chunks without error.
for _, m := range ms.oooMmappedChunks {
chk, err := h.chunkDiskMapper.Chunk(m.ref)
require.NoError(t, err)
require.Equal(t, int(m.numSamples), chk.NumSamples())
}
actMmapChunks := make([]*mmappedChunk, len(expMmapChunks))
copy(actMmapChunks, ms.oooMmappedChunks)
require.Equal(t, expMmapChunks, actMmapChunks)
require.NoError(t, h.Close())
}
func TestHeadInit_DiscardChunksWithUnsupportedEncoding(t *testing.T) {
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
require.NoError(t, h.Init(0))
ctx := context.Background()
app := h.Appender(ctx)
seriesLabels := labels.FromStrings("a", "1")
var seriesRef storage.SeriesRef
var err error
for i := 0; i < 400; i++ {
seriesRef, err = app.Append(0, seriesLabels, int64(i), float64(i))
require.NoError(t, err)
}
require.NoError(t, app.Commit())
require.Greater(t, prom_testutil.ToFloat64(h.metrics.chunksCreated), 1.0)
uc := newUnsupportedChunk()
// Make this chunk not overlap with the previous and the next
h.chunkDiskMapper.WriteChunk(chunks.HeadSeriesRef(seriesRef), 500, 600, uc, func(err error) { require.NoError(t, err) })
app = h.Appender(ctx)
for i := 700; i < 1200; i++ {
_, err := app.Append(0, seriesLabels, int64(i), float64(i))
require.NoError(t, err)
}
require.NoError(t, app.Commit())
require.Greater(t, prom_testutil.ToFloat64(h.metrics.chunksCreated), 4.0)
series, created, err := h.getOrCreate(seriesLabels.Hash(), seriesLabels)
require.NoError(t, err)
require.False(t, created, "should already exist")
require.NotNil(t, series, "should return the series we created above")
expChunks := make([]*mmappedChunk, len(series.mmappedChunks))
copy(expChunks, series.mmappedChunks)
require.NoError(t, h.Close())
wlog, err := wal.NewSize(nil, nil, filepath.Join(h.opts.ChunkDirRoot, "wal"), 32768, false)
require.NoError(t, err)
h, err = NewHead(nil, nil, wlog, nil, h.opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
series, created, err = h.getOrCreate(seriesLabels.Hash(), seriesLabels)
require.NoError(t, err)
require.False(t, created, "should already exist")
require.NotNil(t, series, "should return the series we created above")
require.Equal(t, expChunks, series.mmappedChunks)
}
const (
UnsupportedMask = 0b10000000
EncUnsupportedXOR = chunkenc.EncXOR | UnsupportedMask
)
// unsupportedChunk holds a XORChunk and overrides the Encoding() method.
type unsupportedChunk struct {
*chunkenc.XORChunk
}
func newUnsupportedChunk() *unsupportedChunk {
return &unsupportedChunk{chunkenc.NewXORChunk()}
}
func (c *unsupportedChunk) Encoding() chunkenc.Encoding {
return EncUnsupportedXOR
}
// Tests https://github.com/prometheus/prometheus/issues/10277.
func TestMmapPanicAfterMmapReplayCorruption(t *testing.T) {
dir := t.TempDir()
@@ -3171,7 +3423,7 @@ func TestMmapPanicAfterMmapReplayCorruption(t *testing.T) {
opts.EnableExemplarStorage = true
opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars)
- h, err := NewHead(nil, nil, wlog, opts, nil)
+ h, err := NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
@@ -3205,7 +3457,7 @@ func TestMmapPanicAfterMmapReplayCorruption(t *testing.T) {
require.NoError(t, err)
require.NoError(t, f.Close())
- h, err = NewHead(nil, nil, wlog, opts, nil)
+ h, err = NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
@@ -3230,7 +3482,7 @@ func TestReplayAfterMmapReplayError(t *testing.T) {
opts.EnableMemorySnapshotOnShutdown = true
opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars)
- h, err = NewHead(nil, nil, wlog, opts, nil)
+ h, err = NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
}
@@ -3292,3 +3544,131 @@ func TestReplayAfterMmapReplayError(t *testing.T) {
require.NoError(t, h.Close())
}
func TestOOOAppendWithNoSeries(t *testing.T) {
dir := t.TempDir()
wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
opts := DefaultHeadOptions()
opts.ChunkDirRoot = dir
opts.OutOfOrderCapMax.Store(30)
opts.OutOfOrderTimeWindow.Store(120 * time.Minute.Milliseconds())
h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
t.Cleanup(func() {
require.NoError(t, h.Close())
})
require.NoError(t, h.Init(0))
appendSample := func(lbls labels.Labels, ts int64) {
app := h.Appender(context.Background())
_, err := app.Append(0, lbls, ts*time.Minute.Milliseconds(), float64(ts))
require.NoError(t, err)
require.NoError(t, app.Commit())
}
verifyOOOSamples := func(lbls labels.Labels, expSamples int) {
ms, created, err := h.getOrCreate(lbls.Hash(), lbls)
require.NoError(t, err)
require.False(t, created)
require.NotNil(t, ms)
require.Nil(t, ms.headChunk)
require.NotNil(t, ms.oooHeadChunk)
require.Equal(t, expSamples, ms.oooHeadChunk.chunk.NumSamples())
}
verifyInOrderSamples := func(lbls labels.Labels, expSamples int) {
ms, created, err := h.getOrCreate(lbls.Hash(), lbls)
require.NoError(t, err)
require.False(t, created)
require.NotNil(t, ms)
require.Nil(t, ms.oooHeadChunk)
require.NotNil(t, ms.headChunk)
require.Equal(t, expSamples, ms.headChunk.chunk.NumSamples())
}
newLabels := func(idx int) labels.Labels { return labels.FromStrings("foo", fmt.Sprintf("%d", idx)) }
s1 := newLabels(1)
appendSample(s1, 300) // At 300m.
verifyInOrderSamples(s1, 1)
// At 239m, the sample cannot be appended to the in-order chunk since it is
// older than the appendable minValidTime. So it should go in the OOO chunk.
// Series does not exist for s2 yet.
s2 := newLabels(2)
appendSample(s2, 239) // OOO sample.
verifyOOOSamples(s2, 1)
// Similar for 180m.
s3 := newLabels(3)
appendSample(s3, 180) // OOO sample.
verifyOOOSamples(s3, 1)
// Now 179m is too old.
s4 := newLabels(4)
app := h.Appender(context.Background())
_, err = app.Append(0, s4, 179*time.Minute.Milliseconds(), float64(179))
require.Equal(t, storage.ErrTooOldSample, err)
require.NoError(t, app.Rollback())
verifyOOOSamples(s3, 1)
// Samples still go into in-order chunk for samples within
// appendable minValidTime.
s5 := newLabels(5)
appendSample(s5, 240)
verifyInOrderSamples(s5, 1)
}
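
The routing exercised by this test can be summarized in one predicate. A minimal, self-contained sketch (not part of the diff); inOrderMint is inferred from the test's behavior (samples at 240m still append in-order) rather than taken from the appender code:

package main

import "fmt"

// sampleDestination sketches where the head appender sends a sample:
// at or above the in-order lower bound it goes to the in-order chunk,
// within the OOO time window it goes to the OOO chunk, and anything
// older is rejected with storage.ErrTooOldSample.
func sampleDestination(ts, inOrderMint, headMaxt, oooTimeWindow int64) string {
	switch {
	case ts >= inOrderMint:
		return "in-order head chunk"
	case ts >= headMaxt-oooTimeWindow:
		return "ooo head chunk"
	default:
		return "rejected (storage.ErrTooOldSample)"
	}
}

func main() {
	const min = int64(60_000) // one minute in milliseconds
	// Values from the test: head max time 300m, window 120m, in-order floor 240m.
	for _, ts := range []int64{240, 239, 180, 179} {
		fmt.Printf("%dm -> %s\n", ts, sampleDestination(ts*min, 240*min, 300*min, 120*min))
	}
}
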
func TestHeadMinOOOTimeUpdate(t *testing.T) {
dir := t.TempDir()
wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
opts := DefaultHeadOptions()
opts.ChunkDirRoot = dir
opts.OutOfOrderTimeWindow.Store(10 * time.Minute.Milliseconds())
h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
t.Cleanup(func() {
require.NoError(t, h.Close())
})
require.NoError(t, h.Init(0))
appendSample := func(ts int64) {
lbls := labels.FromStrings("foo", "bar")
app := h.Appender(context.Background())
_, err := app.Append(0, lbls, ts*time.Minute.Milliseconds(), float64(ts))
require.NoError(t, err)
require.NoError(t, app.Commit())
}
appendSample(300) // In-order sample.
require.Equal(t, int64(math.MaxInt64), h.MinOOOTime())
appendSample(295) // OOO sample.
require.Equal(t, 295*time.Minute.Milliseconds(), h.MinOOOTime())
// The allowed OOO window now starts at 290, which is below the earliest OOO sample (295), so minOOOTime is set to 290.
require.NoError(t, h.truncateOOO(0, 1))
require.Equal(t, 290*time.Minute.Milliseconds(), h.MinOOOTime())
appendSample(310) // In-order sample.
appendSample(305) // OOO sample.
require.Equal(t, 290*time.Minute.Milliseconds(), h.MinOOOTime())
// The OOO sample at 295 has not been gc'ed yet, and the allowed OOO window now starts at 300.
// The lower of the two, 295, is set as minOOOTime.
require.NoError(t, h.truncateOOO(0, 2))
require.Equal(t, 295*time.Minute.Milliseconds(), h.MinOOOTime())
}
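
The expectation this test encodes reduces to taking a minimum. A small sketch of that invariant, with hypothetical names (windowLow for the start of the allowed OOO window at truncation time, earliestOOO for the earliest OOO sample still in the head):

package main

import "fmt"

// minOOOTimeAfterTruncate restates the checks above: after truncating the
// OOO head, minOOOTime is the lower of the allowed window's start and the
// earliest OOO sample that survived.
func minOOOTimeAfterTruncate(windowLow, earliestOOO int64) int64 {
	if earliestOOO < windowLow {
		return earliestOOO
	}
	return windowLow
}

func main() {
	fmt.Println(minOOOTimeAfterTruncate(290, 295)) // first truncation: 290
	fmt.Println(minOOOTimeAfterTruncate(300, 295)) // second truncation: 295
}
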

tsdb/head_wal.go

@@ -42,7 +42,7 @@ import (
"github.com/prometheus/prometheus/tsdb/wal"
)
- func (h *Head) loadWAL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, mmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (err error) {
+ func (h *Head) loadWAL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, mmappedChunks, oooMmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (err error) {
// Track number of samples that referenced a series we don't know about
// for error reporting.
var unknownRefs atomic.Uint64
@@ -107,7 +107,7 @@ func (h *Head) loadWAL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.H
processors[i].setup()
go func(wp *walSubsetProcessor) {
- unknown, overlapping := wp.processWALSamples(h, mmappedChunks)
+ unknown, overlapping := wp.processWALSamples(h, mmappedChunks, oooMmappedChunks)
unknownRefs.Add(unknown)
mmapOverlappingChunks.Add(overlapping)
wg.Done()
@@ -343,7 +343,7 @@ Outer:
}
// resetSeriesWithMMappedChunks is only used during the WAL replay.
- func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc []*mmappedChunk, walSeriesRef chunks.HeadSeriesRef) (overlapped bool) {
+ func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc, oooMmc []*mmappedChunk, walSeriesRef chunks.HeadSeriesRef) (overlapped bool) {
if mSeries.ref != walSeriesRef {
// Checking if the new m-mapped chunks overlap with the already existing ones.
if len(mSeries.mmappedChunks) > 0 && len(mmc) > 0 {
@@ -368,10 +368,11 @@ func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc []*mmappedCh
}
}
- h.metrics.chunksCreated.Add(float64(len(mmc)))
+ h.metrics.chunksCreated.Add(float64(len(mmc) + len(oooMmc)))
h.metrics.chunksRemoved.Add(float64(len(mSeries.mmappedChunks)))
- h.metrics.chunks.Add(float64(len(mmc) - len(mSeries.mmappedChunks)))
+ h.metrics.chunks.Add(float64(len(mmc) + len(oooMmc) - len(mSeries.mmappedChunks)))
mSeries.mmappedChunks = mmc
+ mSeries.oooMmappedChunks = oooMmc
// Cache the last mmapped chunk time, so we can skip calling append() for samples it will reject.
if len(mmc) == 0 {
mSeries.mmMaxTime = math.MinInt64
@@ -379,6 +380,19 @@ func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc []*mmappedCh
mSeries.mmMaxTime = mmc[len(mmc)-1].maxTime
h.updateMinMaxTime(mmc[0].minTime, mSeries.mmMaxTime)
}
+ if len(oooMmc) != 0 {
+ // Mint and maxt can be in any chunk, they are not sorted.
+ mint, maxt := int64(math.MaxInt64), int64(math.MinInt64)
+ for _, ch := range oooMmc {
+ if ch.minTime < mint {
+ mint = ch.minTime
+ }
+ if ch.maxTime > maxt {
+ maxt = ch.maxTime
+ }
+ }
+ h.updateMinOOOMaxOOOTime(mint, maxt)
+ }
// Any samples replayed till now would already be compacted. Resetting the head chunk.
mSeries.nextAt = 0
@@ -421,7 +435,7 @@ func (wp *walSubsetProcessor) reuseBuf() []record.RefSample {
// processWALSamples adds the samples it receives to the head and passes
// the buffer received to an output channel for reuse.
- func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (unknownRefs, mmapOverlappingChunks uint64) {
+ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks, oooMmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (unknownRefs, mmapOverlappingChunks uint64) {
defer close(wp.output)
mint, maxt := int64(math.MaxInt64), int64(math.MinInt64)
@@ -429,7 +443,8 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks map[chunk
for in := range wp.input {
if in.existingSeries != nil {
mmc := mmappedChunks[in.walSeriesRef]
+ oooMmc := oooMmappedChunks[in.walSeriesRef]
- if h.resetSeriesWithMMappedChunks(in.existingSeries, mmc, in.walSeriesRef) {
+ if h.resetSeriesWithMMappedChunks(in.existingSeries, mmc, oooMmc, in.walSeriesRef) {
mmapOverlappingChunks++
}
continue
@@ -465,6 +480,292 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks map[chunk
return unknownRefs, mmapOverlappingChunks
}
func (h *Head) loadWBL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, lastMmapRef chunks.ChunkDiskMapperRef) (err error) {
// Track the number of samples and m-map markers that referenced a series
// we don't know about, for error reporting.
var unknownRefs, mmapMarkerUnknownRefs atomic.Uint64
lastSeq, lastOff := lastMmapRef.Unpack()
// Start workers that each process samples for a partition of the series ID space.
var (
wg sync.WaitGroup
n = runtime.GOMAXPROCS(0)
processors = make([]wblSubsetProcessor, n)
dec record.Decoder
shards = make([][]record.RefSample, n)
decodedCh = make(chan interface{}, 10)
decodeErr error
samplesPool = sync.Pool{
New: func() interface{} {
return []record.RefSample{}
},
}
markersPool = sync.Pool{
New: func() interface{} {
return []record.RefMmapMarker{}
},
}
)
defer func() {
// For CorruptionErr ensure to terminate all workers before exiting.
// We also wrap it to identify OOO WBL corruption.
_, ok := err.(*wal.CorruptionErr)
if ok {
err = &errLoadWbl{err: err}
for i := 0; i < n; i++ {
processors[i].closeAndDrain()
}
wg.Wait()
}
}()
wg.Add(n)
for i := 0; i < n; i++ {
processors[i].setup()
go func(wp *wblSubsetProcessor) {
unknown := wp.processWBLSamples(h)
unknownRefs.Add(unknown)
wg.Done()
}(&processors[i])
}
go func() {
defer close(decodedCh)
for r.Next() {
rec := r.Record()
switch dec.Type(rec) {
case record.Samples:
samples := samplesPool.Get().([]record.RefSample)[:0]
samples, err = dec.Samples(rec, samples)
if err != nil {
decodeErr = &wal.CorruptionErr{
Err: errors.Wrap(err, "decode samples"),
Segment: r.Segment(),
Offset: r.Offset(),
}
return
}
decodedCh <- samples
case record.MmapMarkers:
markers := markersPool.Get().([]record.RefMmapMarker)[:0]
markers, err = dec.MmapMarkers(rec, markers)
if err != nil {
decodeErr = &wal.CorruptionErr{
Err: errors.Wrap(err, "decode mmap markers"),
Segment: r.Segment(),
Offset: r.Offset(),
}
return
}
decodedCh <- markers
default:
// Noop.
}
}
}()
// The records are always replayed from the oldest to the newest.
for d := range decodedCh {
switch v := d.(type) {
case []record.RefSample:
samples := v
// We split up the samples into parts of 5000 samples or less.
// With O(300 * #cores) in-flight sample batches, large scrapes could otherwise
// cause thousands of very large in flight buffers occupying large amounts
// of unused memory.
for len(samples) > 0 {
m := 5000
if len(samples) < m {
m = len(samples)
}
for i := 0; i < n; i++ {
shards[i] = processors[i].reuseBuf()
}
for _, sam := range samples[:m] {
if r, ok := multiRef[sam.Ref]; ok {
sam.Ref = r
}
mod := uint64(sam.Ref) % uint64(n)
shards[mod] = append(shards[mod], sam)
}
for i := 0; i < n; i++ {
processors[i].input <- shards[i]
}
samples = samples[m:]
}
//nolint:staticcheck // Ignore SA6002 relax staticcheck verification.
samplesPool.Put(d)
case []record.RefMmapMarker:
markers := v
for _, rm := range markers {
seq, off := rm.MmapRef.Unpack()
if seq > lastSeq || (seq == lastSeq && off > lastOff) {
// This m-map chunk from markers was not present during
// the load of mmapped chunks that happened in the head
// initialization.
continue
}
if r, ok := multiRef[rm.Ref]; ok {
rm.Ref = r
}
ms := h.series.getByID(rm.Ref)
if ms == nil {
mmapMarkerUnknownRefs.Inc()
continue
}
idx := uint64(ms.ref) % uint64(n)
// It is possible that some old sample is still being processed in
// processWBLSamples, which could cause a race below. So we wait for the
// goroutine to empty its input buffer and finish processing all old samples.
processors[idx].waitUntilIdle()
// Lock the subset so we can modify the series object
processors[idx].mx.Lock()
// All samples till now have been m-mapped. Hence clear out the headChunk.
// In case some samples slipped through and went into m-map chunks because of changed
// chunk size parameters, we are not taking care of that here.
// TODO(codesome): see if there is a way to avoid duplicate m-map chunks if
// the size of ooo chunk was reduced between restart.
ms.oooHeadChunk = nil
processors[idx].mx.Unlock()
}
default:
panic(fmt.Errorf("unexpected decodedCh type: %T", d))
}
}
if decodeErr != nil {
return decodeErr
}
// Signal termination to each worker and wait for it to close its output channel.
for i := 0; i < n; i++ {
processors[i].closeAndDrain()
}
wg.Wait()
if r.Err() != nil {
return errors.Wrap(r.Err(), "read records")
}
if unknownRefs.Load() > 0 || mmapMarkerUnknownRefs.Load() > 0 {
level.Warn(h.logger).Log("msg", "Unknown series references for ooo WAL replay", "samples", unknownRefs.Load(), "mmap_markers", mmapMarkerUnknownRefs.Load())
}
return nil
}
type errLoadWbl struct {
err error
}
func (e errLoadWbl) Error() string {
return e.err.Error()
}
// To support errors.Cause().
func (e errLoadWbl) Cause() error {
return e.err
}
// To support errors.Unwrap().
func (e errLoadWbl) Unwrap() error {
return e.err
}
// isErrLoadOOOWal returns true if err is of type *errLoadWbl.
func isErrLoadOOOWal(err error) bool {
_, ok := err.(*errLoadWbl)
return ok
}
type wblSubsetProcessor struct {
mx sync.Mutex // Take this lock while modifying series in the subset.
input chan []record.RefSample
output chan []record.RefSample
}
func (wp *wblSubsetProcessor) setup() {
wp.output = make(chan []record.RefSample, 300)
wp.input = make(chan []record.RefSample, 300)
}
func (wp *wblSubsetProcessor) closeAndDrain() {
close(wp.input)
for range wp.output {
}
}
// If there is a buffer in the output chan, return it for reuse, otherwise return nil.
func (wp *wblSubsetProcessor) reuseBuf() []record.RefSample {
select {
case buf := <-wp.output:
return buf[:0]
default:
}
return nil
}
// processWBLSamples adds the samples it receives to the head and passes
// the buffer received to an output channel for reuse.
func (wp *wblSubsetProcessor) processWBLSamples(h *Head) (unknownRefs uint64) {
defer close(wp.output)
// We don't check for minValidTime for ooo samples.
mint, maxt := int64(math.MaxInt64), int64(math.MinInt64)
for samples := range wp.input {
wp.mx.Lock()
for _, s := range samples {
ms := h.series.getByID(s.Ref)
if ms == nil {
unknownRefs++
continue
}
ok, chunkCreated, _ := ms.insert(s.T, s.V, h.chunkDiskMapper)
if chunkCreated {
h.metrics.chunksCreated.Inc()
h.metrics.chunks.Inc()
}
if ok {
if s.T < mint {
mint = s.T
}
if s.T > maxt {
maxt = s.T
}
}
}
wp.mx.Unlock()
wp.output <- samples
}
h.updateMinOOOMaxOOOTime(mint, maxt)
return unknownRefs
}
func (wp *wblSubsetProcessor) waitUntilIdle() {
select {
case <-wp.output: // Allow output side to drain to avoid deadlock.
default:
}
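// Sending an empty batch acts as a sentinel: once the input channel is
// observed empty below, the worker has picked up everything queued before it.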
wp.input <- []record.RefSample{}
for len(wp.input) != 0 {
time.Sleep(10 * time.Microsecond)
select {
case <-wp.output: // Allow output side to drain to avoid deadlock.
default:
}
}
}
const (
chunkSnapshotRecordTypeSeries uint8 = 1
chunkSnapshotRecordTypeTombstones uint8 = 2

tsdb/ooo_head.go (new file, 159 lines)
@@ -0,0 +1,159 @@
// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"fmt"
"sort"
"github.com/prometheus/prometheus/tsdb/chunkenc"
"github.com/prometheus/prometheus/tsdb/tombstones"
)
// OOOChunk maintains samples in time-ascending order.
// Inserts for timestamps already seen are dropped.
// Samples are stored uncompressed to allow easy sorting.
// Perhaps we can be more efficient later.
type OOOChunk struct {
samples []sample
}
func NewOOOChunk() *OOOChunk {
return &OOOChunk{samples: make([]sample, 0, 4)}
}
// Insert inserts the sample such that order is maintained.
// Returns false if insert was not possible due to the same timestamp already existing.
func (o *OOOChunk) Insert(t int64, v float64) bool {
// Find the index of the first sample with a timestamp >= t.
i := sort.Search(len(o.samples), func(i int) bool { return o.samples[i].t >= t })
if i >= len(o.samples) {
// None found, so append at the end.
o.samples = append(o.samples, sample{t, v})
return true
}
if o.samples[i].t == t {
return false
}
// Expand length by one to make room; use a zero sample, we will overwrite it anyway.
o.samples = append(o.samples, sample{})
copy(o.samples[i+1:], o.samples[i:])
o.samples[i] = sample{t, v}
return true
}
func (o *OOOChunk) NumSamples() int {
return len(o.samples)
}
func (o *OOOChunk) ToXOR() (*chunkenc.XORChunk, error) {
x := chunkenc.NewXORChunk()
app, err := x.Appender()
if err != nil {
return nil, err
}
for _, s := range o.samples {
app.Append(s.t, s.v)
}
return x, nil
}
func (o *OOOChunk) ToXORBetweenTimestamps(mint, maxt int64) (*chunkenc.XORChunk, error) {
x := chunkenc.NewXORChunk()
app, err := x.Appender()
if err != nil {
return nil, err
}
for _, s := range o.samples {
if s.t < mint {
continue
}
if s.t > maxt {
break
}
app.Append(s.t, s.v)
}
return x, nil
}
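
A short usage sketch of the chunk above (illustrative, not part of the file): Insert keeps the samples sorted and reports duplicate timestamps, and ToXOR emits them in ascending time order.

package main

import (
	"fmt"

	"github.com/prometheus/prometheus/tsdb"
)

func main() {
	c := tsdb.NewOOOChunk()
	for _, ts := range []int64{40, 10, 30, 10, 20} {
		// The second insert at t=10 returns false and is dropped.
		fmt.Println(ts, "inserted:", c.Insert(ts, float64(ts)))
	}
	fmt.Println("samples:", c.NumSamples()) // 4

	x, err := c.ToXOR() // compressed copy, iterated in ascending time order
	if err != nil {
		panic(err)
	}
	it := x.Iterator(nil)
	for it.Next() {
		t, v := it.At()
		fmt.Println(t, v)
	}
}
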
var _ BlockReader = &OOORangeHead{}
// OOORangeHead allows querying the Head's out-of-order samples via the
// BlockReader interface implementation.
type OOORangeHead struct {
head *Head
// mint and maxt are tracked because when a query is handled we only want
// the time range of the query, and having preexisting pointers to the first
// and last timestamps helps with that.
mint, maxt int64
}
func NewOOORangeHead(head *Head, mint, maxt int64) *OOORangeHead {
return &OOORangeHead{
head: head,
mint: mint,
maxt: maxt,
}
}
func (oh *OOORangeHead) Index() (IndexReader, error) {
return NewOOOHeadIndexReader(oh.head, oh.mint, oh.maxt), nil
}
func (oh *OOORangeHead) Chunks() (ChunkReader, error) {
return NewOOOHeadChunkReader(oh.head, oh.mint, oh.maxt), nil
}
func (oh *OOORangeHead) Tombstones() (tombstones.Reader, error) {
// As stated in the design doc https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing
// Tombstones are not supported for out of order metrics.
return tombstones.NewMemTombstones(), nil
}
func (oh *OOORangeHead) Meta() BlockMeta {
var id [16]byte
copy(id[:], "____ooo_head____")
return BlockMeta{
MinTime: oh.mint,
MaxTime: oh.maxt,
ULID: id,
Stats: BlockStats{
NumSeries: oh.head.NumSeries(),
},
}
}
// Size returns the size taken by the Head block.
func (oh *OOORangeHead) Size() int64 {
return oh.head.Size()
}
// String returns a human-readable representation of the out-of-order range
// head. It's important to keep this method in order to avoid dumping the
// whole struct when the head is stringified in errors or logs.
func (oh *OOORangeHead) String() string {
return fmt.Sprintf("ooo range head (mint: %d, maxt: %d)", oh.MinTime(), oh.MaxTime())
}
func (oh *OOORangeHead) MinTime() int64 {
return oh.mint
}
func (oh *OOORangeHead) MaxTime() int64 {
return oh.maxt
}
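
For orientation, an illustrative sketch (not code from the PR) of how OOORangeHead plugs into the existing querier machinery; the benchmark changes later in this diff do the equivalent with NewBlockQuerier and NewOOORangeHead:

package tsdb

import "github.com/prometheus/prometheus/storage"

// oooQuerier is a hypothetical helper: wrap the head's OOO data for a time
// range and hand it to the existing block querier as a BlockReader.
func oooQuerier(head *Head, mint, maxt int64) (storage.Querier, error) {
	return NewBlockQuerier(NewOOORangeHead(head, mint, maxt), mint, maxt)
}
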

tsdb/ooo_head_read.go (new file, 433 lines)
@@ -0,0 +1,433 @@
// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"errors"
"math"
"sort"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/tsdb/chunkenc"
"github.com/prometheus/prometheus/tsdb/chunks"
"github.com/prometheus/prometheus/tsdb/index"
"github.com/prometheus/prometheus/tsdb/tombstones"
)
var _ IndexReader = &OOOHeadIndexReader{}
// OOOHeadIndexReader implements IndexReader so ooo samples in the head can be
// accessed.
// It also has a reference to headIndexReader so we can leverage its
// IndexReader implementation for all the methods that remain the same. We
// decided to do this to avoid code duplication.
// The only methods that change are the ones about getting Series and Postings.
type OOOHeadIndexReader struct {
*headIndexReader // A reference to the headIndexReader so we can reuse as many interface implementation as possible.
}
func NewOOOHeadIndexReader(head *Head, mint, maxt int64) *OOOHeadIndexReader {
hr := &headIndexReader{
head: head,
mint: mint,
maxt: maxt,
}
return &OOOHeadIndexReader{hr}
}
func (oh *OOOHeadIndexReader) Series(ref storage.SeriesRef, lbls *labels.Labels, chks *[]chunks.Meta) error {
return oh.series(ref, lbls, chks, 0)
}
// The passed lastMmapRef tells up to which m-map chunk we can consider.
// If it is 0, it means all chunks need to be considered.
// If it is non-0, then the oooHeadChunk must not be considered.
func (oh *OOOHeadIndexReader) series(ref storage.SeriesRef, lbls *labels.Labels, chks *[]chunks.Meta, lastMmapRef chunks.ChunkDiskMapperRef) error {
s := oh.head.series.getByID(chunks.HeadSeriesRef(ref))
if s == nil {
oh.head.metrics.seriesNotFound.Inc()
return storage.ErrNotFound
}
*lbls = append((*lbls)[:0], s.lset...)
if chks == nil {
return nil
}
s.Lock()
defer s.Unlock()
*chks = (*chks)[:0]
tmpChks := make([]chunks.Meta, 0, len(s.oooMmappedChunks))
// We define these markers to track the last chunk reference while we
// fill the chunk meta.
// These markers are useful to give consistent responses to repeated queries
// even if new chunks that might be overlapping or not are added afterwards.
// Also, lastMinT and lastMaxT are initialized to the max int as a sentinel
// value to know they are unset.
var lastChunkRef chunks.ChunkRef
lastMinT, lastMaxT := int64(math.MaxInt64), int64(math.MaxInt64)
addChunk := func(minT, maxT int64, ref chunks.ChunkRef) {
// The first time we get called is for the last included chunk;
// set the markers accordingly.
if lastMinT == int64(math.MaxInt64) {
lastChunkRef = ref
lastMinT = minT
lastMaxT = maxT
}
tmpChks = append(tmpChks, chunks.Meta{
MinTime: minT,
MaxTime: maxT,
Ref: ref,
OOOLastRef: lastChunkRef,
OOOLastMinTime: lastMinT,
OOOLastMaxTime: lastMaxT,
})
}
// Collect all chunks that overlap the query range, in order from most recent to oldest,
// so we can set the correct markers.
if s.oooHeadChunk != nil {
c := s.oooHeadChunk
if c.OverlapsClosedInterval(oh.mint, oh.maxt) && lastMmapRef == 0 {
ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.oooMmappedChunks))))
addChunk(c.minTime, c.maxTime, ref)
}
}
for i := len(s.oooMmappedChunks) - 1; i >= 0; i-- {
c := s.oooMmappedChunks[i]
if c.OverlapsClosedInterval(oh.mint, oh.maxt) && (lastMmapRef == 0 || lastMmapRef.GreaterThanOrEqualTo(c.ref)) {
ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(i)))
addChunk(c.minTime, c.maxTime, ref)
}
}
// There is nothing to do if we did not collect any chunk
if len(tmpChks) == 0 {
return nil
}
// Next we want to sort all the collected chunks by min time so we can find
// those that overlap.
sort.Sort(metaByMinTimeAndMinRef(tmpChks))
// Next we want to iterate the sorted collected chunks and return a single
// chunk Meta for each group of overlapping chunks.
// Example chunks of a series: 5:(100, 200) 6:(500, 600) 7:(150, 250) 8:(550, 650)
// In the example, 5 overlaps with 7 and 6 overlaps with 8, so we only want
// to return chunk Metas for chunks 5 and 6.
*chks = append(*chks, tmpChks[0])
maxTime := tmpChks[0].MaxTime // tracks the maxTime of the previous "to be merged chunk"
for _, c := range tmpChks[1:] {
if c.MinTime > maxTime {
*chks = append(*chks, c)
maxTime = c.MaxTime
} else if c.MaxTime > maxTime {
maxTime = c.MaxTime
(*chks)[len(*chks)-1].MaxTime = c.MaxTime
}
}
return nil
}
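
The loop above is a standard sorted-interval union. A standalone sketch (illustrative types, not the tsdb ones), run on the example from the comment:

package main

import (
	"fmt"
	"sort"
)

type interval struct{ minT, maxT int64 }

// mergeOverlapping mirrors the loop above: after sorting by minT, each
// interval either starts a new output entry or extends the previous one.
func mergeOverlapping(in []interval) []interval {
	if len(in) == 0 {
		return nil
	}
	sort.Slice(in, func(i, j int) bool { return in[i].minT < in[j].minT })
	out := []interval{in[0]}
	for _, c := range in[1:] {
		last := &out[len(out)-1]
		if c.minT > last.maxT {
			out = append(out, c)
		} else if c.maxT > last.maxT {
			last.maxT = c.maxT
		}
	}
	return out
}

func main() {
	// Chunks 5..8 from the comment: 5:(100,200) 6:(500,600) 7:(150,250) 8:(550,650).
	chks := []interval{{100, 200}, {500, 600}, {150, 250}, {550, 650}}
	fmt.Println(mergeOverlapping(chks)) // [{100 250} {500 650}]
}
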
// LabelValues needs to be overridden from the headIndexReader implementation
// due to the check at the beginning where we make sure that the query
// interval overlaps with the head's MinOOOTime and MaxOOOTime.
func (oh *OOOHeadIndexReader) LabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
if oh.maxt < oh.head.MinOOOTime() || oh.mint > oh.head.MaxOOOTime() {
return []string{}, nil
}
if len(matchers) == 0 {
return oh.head.postings.LabelValues(name), nil
}
return labelValuesWithMatchers(oh, name, matchers...)
}
type chunkMetaAndChunkDiskMapperRef struct {
meta chunks.Meta
ref chunks.ChunkDiskMapperRef
origMinT int64
origMaxT int64
}
type byMinTimeAndMinRef []chunkMetaAndChunkDiskMapperRef
func (b byMinTimeAndMinRef) Len() int { return len(b) }
func (b byMinTimeAndMinRef) Less(i, j int) bool {
if b[i].meta.MinTime == b[j].meta.MinTime {
return b[i].meta.Ref < b[j].meta.Ref
}
return b[i].meta.MinTime < b[j].meta.MinTime
}
func (b byMinTimeAndMinRef) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
type metaByMinTimeAndMinRef []chunks.Meta
func (b metaByMinTimeAndMinRef) Len() int { return len(b) }
func (b metaByMinTimeAndMinRef) Less(i, j int) bool {
if b[i].MinTime == b[j].MinTime {
return b[i].Ref < b[j].Ref
}
return b[i].MinTime < b[j].MinTime
}
func (b metaByMinTimeAndMinRef) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
func (oh *OOOHeadIndexReader) Postings(name string, values ...string) (index.Postings, error) {
switch len(values) {
case 0:
return index.EmptyPostings(), nil
case 1:
return oh.head.postings.Get(name, values[0]), nil // TODO(ganesh) Also call GetOOOPostings
default:
// TODO(ganesh) We want to only return postings for out of order series.
res := make([]index.Postings, 0, len(values))
for _, value := range values {
res = append(res, oh.head.postings.Get(name, value)) // TODO(ganesh) Also call GetOOOPostings
}
return index.Merge(res...), nil
}
}
type OOOHeadChunkReader struct {
head *Head
mint, maxt int64
}
func NewOOOHeadChunkReader(head *Head, mint, maxt int64) *OOOHeadChunkReader {
return &OOOHeadChunkReader{
head: head,
mint: mint,
maxt: maxt,
}
}
func (cr OOOHeadChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
sid, _ := chunks.HeadChunkRef(meta.Ref).Unpack()
s := cr.head.series.getByID(sid)
// This means that the series has been garbage collected.
if s == nil {
return nil, storage.ErrNotFound
}
s.Lock()
c, err := s.oooMergedChunk(meta, cr.head.chunkDiskMapper, cr.mint, cr.maxt)
s.Unlock()
if err != nil {
return nil, err
}
// This means that the query range did not overlap with the requested chunk.
if len(c.chunks) == 0 {
return nil, storage.ErrNotFound
}
return c, nil
}
func (cr OOOHeadChunkReader) Close() error {
return nil
}
type OOOCompactionHead struct {
oooIR *OOOHeadIndexReader
lastMmapRef chunks.ChunkDiskMapperRef
lastWBLFile int
postings []storage.SeriesRef
chunkRange int64
mint, maxt int64 // Among all the compactable chunks.
}
// NewOOOCompactionHead does the following:
// 1. M-maps all the in-memory ooo chunks.
// 2. Computes the expected block ranges while iterating through all ooo series and stores them.
// 3. Stores the list of postings having ooo series.
// 4. Cuts a new WBL file for the OOO WBL.
// All of the above together have a bit of CPU and memory overhead, and can impact
// the sample append latency. So call NewOOOCompactionHead only right before compaction.
func NewOOOCompactionHead(head *Head) (*OOOCompactionHead, error) {
newWBLFile, err := head.wbl.NextSegmentSync()
if err != nil {
return nil, err
}
ch := &OOOCompactionHead{
chunkRange: head.chunkRange.Load(),
mint: math.MaxInt64,
maxt: math.MinInt64,
lastWBLFile: newWBLFile,
}
ch.oooIR = NewOOOHeadIndexReader(head, math.MinInt64, math.MaxInt64)
n, v := index.AllPostingsKey()
// TODO: verify this gets only ooo samples.
p, err := ch.oooIR.Postings(n, v)
if err != nil {
return nil, err
}
p = ch.oooIR.SortedPostings(p)
var lastSeq, lastOff int
for p.Next() {
seriesRef := p.At()
ms := head.series.getByID(chunks.HeadSeriesRef(seriesRef))
if ms == nil {
continue
}
// M-map the in-memory chunk and keep track of the last one.
// Also build the block ranges -> series map.
// TODO: consider having a lock specifically for ooo data.
ms.Lock()
mmapRef := ms.mmapCurrentOOOHeadChunk(head.chunkDiskMapper)
if mmapRef == 0 && len(ms.oooMmappedChunks) > 0 {
// Nothing was m-mapped. So take the mmapRef from the existing slice if it exists.
mmapRef = ms.oooMmappedChunks[len(ms.oooMmappedChunks)-1].ref
}
seq, off := mmapRef.Unpack()
if seq > lastSeq || (seq == lastSeq && off > lastOff) {
ch.lastMmapRef, lastSeq, lastOff = mmapRef, seq, off
}
if len(ms.oooMmappedChunks) > 0 {
ch.postings = append(ch.postings, seriesRef)
for _, c := range ms.oooMmappedChunks {
if c.minTime < ch.mint {
ch.mint = c.minTime
}
if c.maxTime > ch.maxt {
ch.maxt = c.maxTime
}
}
}
ms.Unlock()
}
return ch, nil
}
func (ch *OOOCompactionHead) Index() (IndexReader, error) {
return NewOOOCompactionHeadIndexReader(ch), nil
}
func (ch *OOOCompactionHead) Chunks() (ChunkReader, error) {
return NewOOOHeadChunkReader(ch.oooIR.head, ch.oooIR.mint, ch.oooIR.maxt), nil
}
func (ch *OOOCompactionHead) Tombstones() (tombstones.Reader, error) {
return tombstones.NewMemTombstones(), nil
}
func (ch *OOOCompactionHead) Meta() BlockMeta {
var id [16]byte
copy(id[:], "copy(id[:], \"ooo_compact_head\")")
return BlockMeta{
MinTime: ch.mint,
MaxTime: ch.maxt,
ULID: id,
Stats: BlockStats{
NumSeries: uint64(len(ch.postings)),
},
}
}
// CloneForTimeRange clones the OOOCompactionHead such that the IndexReader and ChunkReader
// obtained from it only look at the m-map chunks within the given time ranges while not looking
// beyond the ch.lastMmapRef.
// Only the methods of the BlockReader interface are valid for the cloned OOOCompactionHead.
func (ch *OOOCompactionHead) CloneForTimeRange(mint, maxt int64) *OOOCompactionHead {
return &OOOCompactionHead{
oooIR: NewOOOHeadIndexReader(ch.oooIR.head, mint, maxt),
lastMmapRef: ch.lastMmapRef,
postings: ch.postings,
chunkRange: ch.chunkRange,
mint: ch.mint,
maxt: ch.maxt,
}
}
func (ch *OOOCompactionHead) Size() int64 { return 0 }
func (ch *OOOCompactionHead) MinTime() int64 { return ch.mint }
func (ch *OOOCompactionHead) MaxTime() int64 { return ch.maxt }
func (ch *OOOCompactionHead) ChunkRange() int64 { return ch.chunkRange }
func (ch *OOOCompactionHead) LastMmapRef() chunks.ChunkDiskMapperRef { return ch.lastMmapRef }
func (ch *OOOCompactionHead) LastWBLFile() int { return ch.lastWBLFile }
type OOOCompactionHeadIndexReader struct {
ch *OOOCompactionHead
}
func NewOOOCompactionHeadIndexReader(ch *OOOCompactionHead) IndexReader {
return &OOOCompactionHeadIndexReader{ch: ch}
}
func (ir *OOOCompactionHeadIndexReader) Symbols() index.StringIter {
return ir.ch.oooIR.Symbols()
}
func (ir *OOOCompactionHeadIndexReader) Postings(name string, values ...string) (index.Postings, error) {
n, v := index.AllPostingsKey()
if name != n || len(values) != 1 || values[0] != v {
return nil, errors.New("only AllPostingsKey is supported")
}
return index.NewListPostings(ir.ch.postings), nil
}
func (ir *OOOCompactionHeadIndexReader) SortedPostings(p index.Postings) index.Postings {
// This will already be sorted from the Postings() call above.
return p
}
func (ir *OOOCompactionHeadIndexReader) Series(ref storage.SeriesRef, lset *labels.Labels, chks *[]chunks.Meta) error {
return ir.ch.oooIR.series(ref, lset, chks, ir.ch.lastMmapRef)
}
func (ir *OOOCompactionHeadIndexReader) SortedLabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
return nil, errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) LabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
return nil, errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) PostingsForMatchers(concurrent bool, ms ...*labels.Matcher) (index.Postings, error) {
return nil, errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) LabelNames(matchers ...*labels.Matcher) ([]string, error) {
return nil, errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) LabelValueFor(id storage.SeriesRef, label string) (string, error) {
return "", errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) LabelNamesFor(ids ...storage.SeriesRef) ([]string, error) {
return nil, errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) Close() error {
return ir.ch.oooIR.Close()
}

tsdb/ooo_head_read_test.go (new file, 1207 lines; diff suppressed because it is too large)

tsdb/ooo_head_test.go (new file, 93 lines)
@@ -0,0 +1,93 @@
// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"testing"
"github.com/stretchr/testify/require"
)
const testMaxSize int = 32
// Formulas chosen to make testing easy:
func valEven(pos int) int { return pos*2 + 2 } // s[0]=2, s[1]=4, s[2]=6, ..., s[31]=64 - Predictable pre-existing values
func valOdd(pos int) int { return pos*2 + 1 } // s[0]=1, s[1]=3, s[2]=5, ..., s[31]=63 - New values will interject at chosen position because they sort before the pre-existing vals.
func samplify(v int) sample { return sample{int64(v), float64(v)} }
func makeEvenSampleSlice(n int) []sample {
s := make([]sample, n)
for i := 0; i < n; i++ {
s[i] = samplify(valEven(i))
}
return s
}
// TestOOOInsert tests the following cases:
// - Number of pre-existing samples anywhere from 0 to testMaxSize-1.
// - Insert new sample before first pre-existing samples, after the last, and anywhere in between.
// - With a chunk initial capacity of testMaxSize/8 and testMaxSize, which lets us test non-full and full chunks, and chunks that need to expand themselves.
// Note: In all samples used, t always equals v in numeric value. When we talk about 'value' we just refer to a value that will be used for both sample.t and sample.v.
func TestOOOInsert(t *testing.T) {
for numPreExisting := 0; numPreExisting <= testMaxSize; numPreExisting++ {
// For example, if we have numPreExisting 2, then:
// chunk.samples indexes filled 0 1
// chunk.samples with these values 2 4 // valEven
// we want to test inserting at index 0 1 2 // insertPos=0..numPreExisting
// we can do this by using values 1, 3 5 // valOdd(insertPos)
for insertPos := 0; insertPos <= numPreExisting; insertPos++ {
chunk := NewOOOChunk()
chunk.samples = makeEvenSampleSlice(numPreExisting)
newSample := samplify(valOdd(insertPos))
chunk.Insert(newSample.t, newSample.v)
var expSamples []sample
// Our expected new samples slice, will be first the original samples.
for i := 0; i < insertPos; i++ {
expSamples = append(expSamples, samplify(valEven(i)))
}
// Then the new sample.
expSamples = append(expSamples, newSample)
// Followed by any original samples that were pushed back by the new one.
for i := insertPos; i < numPreExisting; i++ {
expSamples = append(expSamples, samplify(valEven(i)))
}
require.Equal(t, expSamples, chunk.samples, "numPreExisting %d, insertPos %d", numPreExisting, insertPos)
}
}
}
// TestOOOInsertDuplicate tests the correct behavior when inserting a sample that is a duplicate of any
// pre-existing samples, with between 1 and testMaxSize pre-existing samples and
// with a chunk initial capacity of testMaxSize/8 and testMaxSize, which lets us test non-full and full chunks, and chunks that need to expand themselves.
func TestOOOInsertDuplicate(t *testing.T) {
for num := 1; num <= testMaxSize; num++ {
for dupPos := 0; dupPos < num; dupPos++ {
chunk := NewOOOChunk()
chunk.samples = makeEvenSampleSlice(num)
dupSample := chunk.samples[dupPos]
dupSample.v = 0.123
ok := chunk.Insert(dupSample.t, dupSample.v)
expSamples := makeEvenSampleSlice(num) // We expect no change.
require.False(t, ok)
require.Equal(t, expSamples, chunk.samples, "num %d, dupPos %d", num, dupPos)
}
}
}

tsdb/querier.go

@@ -569,7 +569,7 @@ func (p *populateWithDelGenericSeriesIterator) next() bool {
p.i++
p.currChkMeta = p.chks[p.i]
- p.currChkMeta.Chunk, p.err = p.chunks.Chunk(p.currChkMeta.Ref)
+ p.currChkMeta.Chunk, p.err = p.chunks.Chunk(p.currChkMeta)
if p.err != nil {
p.err = errors.Wrapf(p.err, "cannot populate chunk %d", p.currChkMeta.Ref)
return false
@@ -898,7 +898,7 @@ func newNopChunkReader() ChunkReader {
}
}
- func (cr nopChunkReader) Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error) {
+ func (cr nopChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
return cr.emptyChunk, nil
}

tsdb/querier_bench_test.go

@@ -34,7 +34,7 @@ func BenchmarkQuerier(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
- h, err := NewHead(nil, nil, nil, opts, nil)
+ h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
defer func() {
require.NoError(b, h.Close())
@@ -180,7 +180,7 @@ func BenchmarkQuerierSelect(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
- h, err := NewHead(nil, nil, nil, opts, nil)
+ h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
defer h.Close()
app := h.Appender(context.Background())

tsdb/querier_test.go

@@ -458,7 +458,7 @@ func TestBlockQuerier_AgainstHeadWithOpenChunks(t *testing.T) {
t.Run("", func(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 2 * time.Hour.Milliseconds()
- h, err := NewHead(nil, nil, nil, opts, nil)
+ h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(t, err)
defer h.Close()
@@ -627,10 +627,10 @@ func createFakeReaderAndNotPopulatedChunks(s ...[]tsdbutil.Sample) (*fakeChunksR
return f, chks
}
- func (r *fakeChunksReader) Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error) {
+ func (r *fakeChunksReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
- chk, ok := r.chks[ref]
+ chk, ok := r.chks[meta.Ref]
if !ok {
- return nil, errors.Errorf("chunk not found at ref %v", ref)
+ return nil, errors.Errorf("chunk not found at ref %v", meta.Ref)
}
return chk, nil
}
@@ -1016,8 +1016,8 @@ func BenchmarkMergedSeriesSet(b *testing.B) {
type mockChunkReader map[chunks.ChunkRef]chunkenc.Chunk
- func (cr mockChunkReader) Chunk(id chunks.ChunkRef) (chunkenc.Chunk, error) {
+ func (cr mockChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
- chk, ok := cr[id]
+ chk, ok := cr[meta.Ref]
if ok {
return chk, nil
}
@@ -1643,7 +1643,7 @@ func TestPostingsForMatchers(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
- h, err := NewHead(nil, nil, nil, opts, nil)
+ h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(t, err)
defer func() {
require.NoError(t, h.Close())
@@ -1944,13 +1944,17 @@ func BenchmarkQueries(b *testing.B) {
},
}
- queryTypes := make(map[string]storage.Querier)
+ type qt struct {
+ typ string
+ querier storage.Querier
+ }
+ var queryTypes []qt // We use a slice instead of map to keep the order of test cases consistent.
defer func() {
for _, q := range queryTypes {
// Can't run a check for error here as some of these will fail as
// queryTypes is using the same slice for the different block queriers
// and would have been closed in the previous iteration.
- q.Close()
+ q.querier.Close()
}
}()
@@ -1991,21 +1995,38 @@ func BenchmarkQueries(b *testing.B) {
qs = append(qs, q)
}
- queryTypes["_1-Block"] = storage.NewMergeQuerier(qs[:1], nil, storage.ChainedSeriesMerge)
- queryTypes["_3-Blocks"] = storage.NewMergeQuerier(qs[0:3], nil, storage.ChainedSeriesMerge)
- queryTypes["_10-Blocks"] = storage.NewMergeQuerier(qs, nil, storage.ChainedSeriesMerge)
+ queryTypes = append(queryTypes, qt{"_1-Block", storage.NewMergeQuerier(qs[:1], nil, storage.ChainedSeriesMerge)})
+ queryTypes = append(queryTypes, qt{"_3-Blocks", storage.NewMergeQuerier(qs[0:3], nil, storage.ChainedSeriesMerge)})
+ queryTypes = append(queryTypes, qt{"_10-Blocks", storage.NewMergeQuerier(qs, nil, storage.ChainedSeriesMerge)})
chunkDir := b.TempDir()
head := createHead(b, nil, series, chunkDir)
- qHead, err := NewBlockQuerier(head, 1, nSamples)
+ qHead, err := NewBlockQuerier(NewRangeHead(head, 1, nSamples), 1, nSamples)
require.NoError(b, err)
- queryTypes["_Head"] = qHead
+ queryTypes = append(queryTypes, qt{"_Head", qHead})
- for qtype, querier := range queryTypes {
- b.Run(title+qtype+"_nSeries:"+strconv.Itoa(nSeries)+"_nSamples:"+strconv.Itoa(int(nSamples)), func(b *testing.B) {
+ for _, oooPercentage := range []int{1, 3, 5, 10} {
+ chunkDir := b.TempDir()
+ totalOOOSamples := oooPercentage * int(nSamples) / 100
+ oooSampleFrequency := int(nSamples) / totalOOOSamples
+ head := createHeadWithOOOSamples(b, nil, series, chunkDir, oooSampleFrequency)
+ qHead, err := NewBlockQuerier(NewRangeHead(head, 1, nSamples), 1, nSamples)
+ require.NoError(b, err)
+ qOOOHead, err := NewBlockQuerier(NewOOORangeHead(head, 1, nSamples), 1, nSamples)
+ require.NoError(b, err)
+ queryTypes = append(queryTypes, qt{
+ fmt.Sprintf("_Head_oooPercent:%d", oooPercentage),
+ storage.NewMergeQuerier([]storage.Querier{qHead, qOOOHead}, nil, storage.ChainedSeriesMerge),
+ })
+ }
+ for _, q := range queryTypes {
+ b.Run(title+q.typ+"_nSeries:"+strconv.Itoa(nSeries)+"_nSamples:"+strconv.Itoa(int(nSamples)), func(b *testing.B) {
expExpansions, err := strconv.Atoi(string(title[len(title)-1]))
require.NoError(b, err)
- benchQuery(b, expExpansions, querier, selectors)
+ benchQuery(b, expExpansions, q.querier, selectors)
})
}
require.NoError(b, head.Close())
@@ -2025,6 +2046,7 @@ func benchQuery(b *testing.B, expExpansions int, q storage.Querier, selectors la
s.Labels()
it := s.Iterator()
for it.Next() {
+ _, _ = it.At()
}
actualExpansions++
}

tsdb/record/record.go

@@ -43,6 +43,8 @@ const (
Tombstones Type = 3
// Exemplars is used to match WAL records of type Exemplars.
Exemplars Type = 4
+ // MmapMarkers is used to match OOO WBL records of type MmapMarkers.
+ MmapMarkers Type = 5
// Metadata is used to match WAL records of type Metadata.
Metadata Type = 6
)
@@ -57,6 +59,8 @@ func (rt Type) String() string {
return "exemplars"
case Tombstones:
return "tombstones"
+ case MmapMarkers:
+ return "mmapmarkers"
case Metadata:
return "metadata"
default:
@@ -157,6 +161,12 @@ type RefExemplar struct {
Labels labels.Labels
}
+ // RefMmapMarker marks that all the samples of the given series until now have been m-mapped to disk.
+ type RefMmapMarker struct {
+ Ref chunks.HeadSeriesRef
+ MmapRef chunks.ChunkDiskMapperRef
+ }
// Decoder decodes series, sample, metadata and tombstone records.
// The zero value is ready to use.
type Decoder struct{}
@@ -168,7 +178,7 @@ func (d *Decoder) Type(rec []byte) Type {
return Unknown
}
switch t := Type(rec[0]); t {
- case Series, Samples, Tombstones, Exemplars, Metadata:
+ case Series, Samples, Tombstones, Exemplars, MmapMarkers, Metadata:
return t
}
return Unknown
@@ -354,6 +364,34 @@ func (d *Decoder) ExemplarsFromBuffer(dec *encoding.Decbuf, exemplars []RefExemp
return exemplars, nil
}
+ func (d *Decoder) MmapMarkers(rec []byte, markers []RefMmapMarker) ([]RefMmapMarker, error) {
+ dec := encoding.Decbuf{B: rec}
+ t := Type(dec.Byte())
+ if t != MmapMarkers {
+ return nil, errors.New("invalid record type")
+ }
+ if dec.Len() == 0 {
+ return markers, nil
+ }
+ for len(dec.B) > 0 && dec.Err() == nil {
+ ref := chunks.HeadSeriesRef(dec.Be64())
+ mmapRef := chunks.ChunkDiskMapperRef(dec.Be64())
+ markers = append(markers, RefMmapMarker{
+ Ref: ref,
+ MmapRef: mmapRef,
+ })
+ }
+ if dec.Err() != nil {
+ return nil, errors.Wrapf(dec.Err(), "decode error after %d mmap markers", len(markers))
+ }
+ if len(dec.B) > 0 {
+ return nil, errors.Errorf("unexpected %d bytes left in entry", len(dec.B))
+ }
+ return markers, nil
+ }
// Encoder encodes series, sample, and tombstones records.
// The zero value is ready to use.
type Encoder struct{}
@@ -467,3 +505,15 @@ func (e *Encoder) EncodeExemplarsIntoBuffer(exemplars []RefExemplar, buf *encodi
EncodeLabels(buf, ex.Labels)
}
}
+ func (e *Encoder) MmapMarkers(markers []RefMmapMarker, b []byte) []byte {
+ buf := encoding.Encbuf{B: b}
+ buf.PutByte(byte(MmapMarkers))
+ for _, s := range markers {
+ buf.PutBE64(uint64(s.Ref))
+ buf.PutBE64(uint64(s.MmapRef))
+ }
+ return buf.Get()
+ }

tsdb/wal/wal.go

@ -40,6 +40,7 @@ const (
DefaultSegmentSize = 128 * 1024 * 1024 // 128 MB DefaultSegmentSize = 128 * 1024 * 1024 // 128 MB
pageSize = 32 * 1024 // 32KB pageSize = 32 * 1024 // 32KB
recordHeaderSize = 7 recordHeaderSize = 7
WblDirName = "wbl"
) )
// The table gets initialized with sync.Once but may still cause a race // The table gets initialized with sync.Once but may still cause a race
@@ -204,32 +205,32 @@ func newWALMetrics(r prometheus.Registerer) *walMetrics {
 	m := &walMetrics{}
 
 	m.fsyncDuration = prometheus.NewSummary(prometheus.SummaryOpts{
-		Name:       "prometheus_tsdb_wal_fsync_duration_seconds",
+		Name:       "fsync_duration_seconds",
 		Help:       "Duration of WAL fsync.",
 		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
 	})
 	m.pageFlushes = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_wal_page_flushes_total",
+		Name: "page_flushes_total",
 		Help: "Total number of page flushes.",
 	})
 	m.pageCompletions = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_wal_completed_pages_total",
+		Name: "completed_pages_total",
 		Help: "Total number of completed pages.",
 	})
 	m.truncateFail = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_wal_truncations_failed_total",
+		Name: "truncations_failed_total",
 		Help: "Total number of WAL truncations that failed.",
 	})
 	m.truncateTotal = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_wal_truncations_total",
+		Name: "truncations_total",
 		Help: "Total number of WAL truncations attempted.",
 	})
 	m.currentSegment = prometheus.NewGauge(prometheus.GaugeOpts{
-		Name: "prometheus_tsdb_wal_segment_current",
+		Name: "segment_current",
 		Help: "WAL segment index that TSDB is currently writing to.",
 	})
 	m.writesFailed = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_wal_writes_failed_total",
+		Name: "writes_failed_total",
 		Help: "Total number of WAL writes that failed.",
 	})
@@ -274,7 +275,11 @@ func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSi
 		stopc:    make(chan chan struct{}),
 		compress: compress,
 	}
-	w.metrics = newWALMetrics(reg)
+	prefix := "prometheus_tsdb_wal_"
+	if filepath.Base(dir) == WblDirName {
+		prefix = "prometheus_tsdb_out_of_order_wal_"
+	}
+	w.metrics = newWALMetrics(prometheus.WrapRegistererWithPrefix(prefix, reg))
 
 	_, last, err := Segments(w.Dir())
 	if err != nil {
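The metric renames above are not a user-facing change: `NewSize` now wraps the registerer with a prefix, so a regular WAL keeps exposing the existing `prometheus_tsdb_wal_*` names, while a WAL living in the `wbl` directory reports under `prometheus_tsdb_out_of_order_wal_*`. A short sketch of how `WrapRegistererWithPrefix` behaves — the registry and counter here are hypothetical, not part of this patch:

	reg := prometheus.NewRegistry()

	// Same pattern as NewSize: the prefix is attached at registration time.
	wrapped := prometheus.WrapRegistererWithPrefix("prometheus_tsdb_out_of_order_wal_", reg)
	wrapped.MustRegister(prometheus.NewCounter(prometheus.CounterOpts{
		Name: "truncations_total", // exposed as prometheus_tsdb_out_of_order_wal_truncations_total
		Help: "Total number of WAL truncations attempted.",
	}))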
@@ -459,36 +464,46 @@ func SegmentName(dir string, i int) string {
 	return filepath.Join(dir, fmt.Sprintf("%08d", i))
 }
 
-// NextSegment creates the next segment and closes the previous one.
-func (w *WAL) NextSegment() error {
+// NextSegment creates the next segment and closes the previous one asynchronously.
+// It returns the file number of the new file.
+func (w *WAL) NextSegment() (int, error) {
 	w.mtx.Lock()
 	defer w.mtx.Unlock()
-	return w.nextSegment()
+	return w.nextSegment(true)
+}
+
+// NextSegmentSync creates the next segment and closes the previous one in sync.
+// It returns the file number of the new file.
+func (w *WAL) NextSegmentSync() (int, error) {
+	w.mtx.Lock()
+	defer w.mtx.Unlock()
+	return w.nextSegment(false)
 }
 
 // nextSegment creates the next segment and closes the previous one.
-func (w *WAL) nextSegment() error {
+// It returns the file number of the new file.
+func (w *WAL) nextSegment(async bool) (int, error) {
 	if w.closed {
-		return errors.New("wal is closed")
+		return 0, errors.New("wal is closed")
 	}
 
 	// Only flush the current page if it actually holds data.
 	if w.page.alloc > 0 {
 		if err := w.flushPage(true); err != nil {
-			return err
+			return 0, err
 		}
 	}
 	next, err := CreateSegment(w.Dir(), w.segment.Index()+1)
 	if err != nil {
-		return errors.Wrap(err, "create new segment file")
+		return 0, errors.Wrap(err, "create new segment file")
 	}
 	prev := w.segment
 	if err := w.setSegment(next); err != nil {
-		return err
+		return 0, err
 	}
 
 	// Don't block further writes by fsyncing the last segment.
-	w.actorc <- func() {
+	f := func() {
 		if err := w.fsync(prev); err != nil {
 			level.Error(w.logger).Log("msg", "sync previous segment", "err", err)
 		}
@@ -496,7 +511,12 @@ func (w *WAL) nextSegment() error {
 			level.Error(w.logger).Log("msg", "close previous segment", "err", err)
 		}
 	}
-	return nil
+	if async {
+		w.actorc <- f
+	} else {
+		f()
+	}
+	return next.Index(), nil
 }
 
 func (w *WAL) setSegment(segment *Segment) error {
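The `async` flag only decides where the fsync-and-close of the previous segment runs: `NextSegment` hands it to the actor goroutine so appends are not blocked, while `NextSegmentSync` runs it inline before returning. A usage sketch, assuming an already-open `*WAL` named `w`:

	// Production path: the previous segment is synced in the background.
	idx, err := w.NextSegment()
	if err != nil {
		// handle error
	}

	// Test path: the previous segment is fully synced and closed on return,
	// which sidesteps platform-dependent timing (e.g. on Windows).
	idx, err = w.NextSegmentSync()
	_ = idx // index of the newly created segment file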
@@ -638,7 +658,7 @@ func (w *WAL) log(rec []byte, final bool) error {
 	left += (pageSize - recordHeaderSize) * (w.pagesPerSegment() - w.donePages - 1) // Free pages in the active segment.
 	if len(rec) > left {
-		if err := w.nextSegment(); err != nil {
+		if _, err := w.nextSegment(true); err != nil {
 			return err
 		}
 	}
@@ -745,6 +765,13 @@ func (w *WAL) fsync(f *Segment) error {
 	return err
 }
 
+// Sync forces a file sync on the current WAL segment. It is meant to be used
+// only in tests, because fsync behaviour differs across operating systems
+// such as Windows and Linux.
+func (w *WAL) Sync() error {
+	return w.fsync(w.segment)
+}
+
 // Close flushes all writes and closes active segment.
 func (w *WAL) Close() (err error) {
 	w.mtx.Lock()
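`Sync` gives tests a way to force the active segment onto disk before reading it back or truncating, instead of relying on implicit fsync timing that varies per OS. A hypothetical test fragment (`rec` stands in for any encoded record):

	require.NoError(t, w.Log(rec))
	// Force the active segment to disk so its bytes are visible to a
	// reader opened on the same file, regardless of OS buffering.
	require.NoError(t, w.Sync())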


@@ -364,14 +364,16 @@ func TestReadCheckpoint(t *testing.T) {
 	err := os.Mkdir(wdir, 0o777)
 	require.NoError(t, err)
 
-	os.Create(SegmentName(wdir, 30))
+	f, err := os.Create(SegmentName(wdir, 30))
+	require.NoError(t, err)
+	require.NoError(t, f.Close())
 
 	enc := record.Encoder{}
 	w, err := NewSize(nil, nil, wdir, 128*pageSize, compress)
 	require.NoError(t, err)
-	defer func() {
+	t.Cleanup(func() {
 		require.NoError(t, w.Close())
-	}()
+	})
 
 	// Write to the initial segment then checkpoint.
 	for i := 0; i < seriesCount; i++ {
@@ -396,8 +398,11 @@
 			require.NoError(t, w.Log(sample))
 		}
 	}
-	Checkpoint(log.NewNopLogger(), w, 30, 31, func(x chunks.HeadSeriesRef) bool { return true }, 0)
-	w.Truncate(32)
+	_, err = w.NextSegmentSync()
+	require.NoError(t, err)
+	_, err = Checkpoint(log.NewNopLogger(), w, 30, 31, func(x chunks.HeadSeriesRef) bool { return true }, 0)
+	require.NoError(t, err)
+	require.NoError(t, w.Truncate(32))
 
 	// Start read after checkpoint, no more data written.
 	_, _, err = Segments(w.Dir())


@@ -2314,7 +2314,7 @@ func (f *fakeDB) Stats(statsByLabelName string) (_ *tsdb.Stats, retErr error) {
 	}()
 	opts := tsdb.DefaultHeadOptions()
 	opts.ChunkRange = 1000
-	h, _ := tsdb.NewHead(nil, nil, nil, opts, nil)
+	h, _ := tsdb.NewHead(nil, nil, nil, nil, opts, nil)
 	return h.Stats(statsByLabelName), nil
 }
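The extra `nil` here is the new write-behind log (WBL) argument that `NewHead` now takes for out-of-order samples; callers that do not enable OOO ingestion simply pass `nil`. A sketch, assuming the updated signature shown in the hunk above:

	opts := tsdb.DefaultHeadOptions()
	opts.ChunkRange = 1000
	// Arguments: registerer, logger, wal, wbl, opts, stats.
	// wbl is nil when out-of-order ingestion is disabled.
	h, err := tsdb.NewHead(nil, nil, nil, nil, opts, nil)
	if err != nil {
		// handle error
	}
	defer h.Close()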