Add out-of-order sample support to the TSDB (#11075)

* Introduce out-of-order TSDB support

This implementation is based on this design doc:
https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing

This commit adds support to accept out-of-order ("OOO") sample into the TSDB
up to a configurable time allowance. If OOO is enabled, overlapping querying
are automatically enabled.

Most of the additions have been borrowed from
https://github.com/grafana/mimir-prometheus/
Here is the list ist of the original commits cherry picked
from mimir-prometheus into this branch:
- 4b2198d7ec
- 2836e5513f
- 00b379c3a5
- ff0dc75758
- a632c73352
- c6f3d4ab33
- 5e8406a1d4
- abde1e0ba1
- e70e769889
- df59320886

Co-authored-by: Jesus Vazquez <jesus.vazquez@grafana.com>
Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com>
Co-authored-by: Dieter Plaetinck <dieter@grafana.com>
Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* gofumpt files

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Add license header to missing files

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix OOO tests due to existing chunk disk mapper implementation

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix truncate int overflow

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Add Sync method to the WAL and update tests

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* remove useless sync

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Update minOOOTime after truncating Head

* Update minOOOTime after truncating Head

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Add a unit test

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Load OutOfOrderTimeWindow only once per appender

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix OOO Head LabelValues and PostingsForMatchers

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix replay of OOO mmap chunks

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unnecessary err check

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Prevent panic with ApplyConfig

Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com
Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Run OOO compaction after restart if there is OOO data from WBL

Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com
Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Apply Bartek's suggestions

Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Refactor OOO compaction

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Address comments and TODOs

- Added a comment explaining why we need the allow overlapping
  compaction toggle
- Clarified TSDBConfig OutOfOrderTimeWindow doc
- Added an owner to all the TODOs in the code

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Run go format

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix remaining review comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix tests

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Change wbl reference when truncating ooo in TestHeadMinOOOTimeUpdate

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix TestWBLAndMmapReplay test failure on windows

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Address most of the feedback

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Refactor the block meta for out of order

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix windows error

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>
Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com
Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com>
Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com>
Co-authored-by: Dieter Plaetinck <dieter@grafana.com>
Co-authored-by: Oleg Zaytsev <mail@olegzaytsev.com>
Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
This commit is contained in:
Jesus Vazquez 2022-09-20 19:05:50 +02:00 committed by GitHub
parent af6167df58
commit c1b669bf9b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
38 changed files with 6655 additions and 380 deletions

View file

@ -463,6 +463,9 @@ func main() {
}
cfg.tsdb.MaxExemplars = int64(cfgFile.StorageConfig.ExemplarsConfig.MaxExemplars)
}
if cfgFile.StorageConfig.TSDBConfig != nil {
cfg.tsdb.OutOfOrderTimeWindow = cfgFile.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
}
// Now that the validity of the config is established, set the config
// success metrics accordingly, although the config isn't really loaded
@ -1537,6 +1540,7 @@ type tsdbOptions struct {
StripeSize int
MinBlockDuration model.Duration
MaxBlockDuration model.Duration
OutOfOrderTimeWindow int64
EnableExemplarStorage bool
MaxExemplars int64
EnableMemorySnapshotOnShutdown bool
@ -1549,7 +1553,8 @@ func (opts tsdbOptions) ToTSDBOptions() tsdb.Options {
RetentionDuration: int64(time.Duration(opts.RetentionDuration) / time.Millisecond),
MaxBytes: int64(opts.MaxBytes),
NoLockfile: opts.NoLockfile,
AllowOverlappingBlocks: opts.AllowOverlappingBlocks,
AllowOverlappingCompaction: opts.AllowOverlappingBlocks,
AllowOverlappingQueries: opts.AllowOverlappingBlocks,
WALCompression: opts.WALCompression,
HeadChunksWriteQueueSize: opts.HeadChunksWriteQueueSize,
StripeSize: opts.StripeSize,
@ -1558,6 +1563,7 @@ func (opts tsdbOptions) ToTSDBOptions() tsdb.Options {
EnableExemplarStorage: opts.EnableExemplarStorage,
MaxExemplars: opts.MaxExemplars,
EnableMemorySnapshotOnShutdown: opts.EnableMemorySnapshotOnShutdown,
OutOfOrderTimeWindow: opts.OutOfOrderTimeWindow,
}
}

View file

@ -117,7 +117,8 @@ func TestBackfillRuleIntegration(t *testing.T) {
}
opts := tsdb.DefaultOptions()
opts.AllowOverlappingBlocks = true
opts.AllowOverlappingQueries = true
opts.AllowOverlappingCompaction = true
db, err := tsdb.Open(tmpDir, nil, nil, opts, nil)
require.NoError(t, err)
@ -245,7 +246,8 @@ func TestBackfillLabels(t *testing.T) {
}
opts := tsdb.DefaultOptions()
opts.AllowOverlappingBlocks = true
opts.AllowOverlappingQueries = true
opts.AllowOverlappingCompaction = true
db, err := tsdb.Open(tmpDir, nil, nil, opts, nil)
require.NoError(t, err)

View file

@ -597,7 +597,7 @@ func analyzeCompaction(block tsdb.BlockReader, indexr tsdb.IndexReader) (err err
for _, chk := range chks {
// Load the actual data of the chunk.
chk, err := chunkr.Chunk(chk.Ref)
chk, err := chunkr.Chunk(chk)
if err != nil {
return err
}

View file

@ -501,9 +501,37 @@ func (c *ScrapeConfig) MarshalYAML() (interface{}, error) {
// StorageConfig configures runtime reloadable configuration options.
type StorageConfig struct {
TSDBConfig *TSDBConfig `yaml:"tsdb,omitempty"`
ExemplarsConfig *ExemplarsConfig `yaml:"exemplars,omitempty"`
}
// TSDBConfig configures runtime reloadable configuration options.
type TSDBConfig struct {
// OutOfOrderTimeWindow sets how long back in time an out-of-order sample can be inserted
// into the TSDB. This flag is typically set while unmarshaling the configuration file and translating
// OutOfOrderTimeWindowFlag's duration. The unit of this flag is expected to be the same as any
// other timestamp in the TSDB.
OutOfOrderTimeWindow int64
// OutOfOrderTimeWindowFlag holds the parsed duration from the config file.
// During unmarshall, this is converted into milliseconds and stored in OutOfOrderTimeWindow.
// This should not be used directly and must be converted into OutOfOrderTimeWindow.
OutOfOrderTimeWindowFlag model.Duration `yaml:"out_of_order_time_window,omitempty"`
}
// UnmarshalYAML implements the yaml.Unmarshaler interface.
func (t *TSDBConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
*t = TSDBConfig{}
type plain TSDBConfig
if err := unmarshal((*plain)(t)); err != nil {
return err
}
t.OutOfOrderTimeWindow = time.Duration(t.OutOfOrderTimeWindowFlag).Milliseconds()
return nil
}
type TracingClientType string
const (

View file

@ -27,10 +27,15 @@ import (
// The errors exposed.
var (
ErrNotFound = errors.New("not found")
ErrOutOfOrderSample = errors.New("out of order sample")
ErrNotFound = errors.New("not found")
// ErrOutOfOrderSample is when out of order support is disabled and the sample is out of order.
ErrOutOfOrderSample = errors.New("out of order sample")
// ErrOutOfBounds is when out of order support is disabled and the sample is older than the min valid time for the append.
ErrOutOfBounds = errors.New("out of bounds")
// ErrTooOldSample is when out of order support is enabled but the sample is outside the time window allowed.
ErrTooOldSample = errors.New("too old sample")
// ErrDuplicateSampleForTimestamp is when the sample has same timestamp but different value.
ErrDuplicateSampleForTimestamp = errors.New("duplicate sample for timestamp")
ErrOutOfBounds = errors.New("out of bounds")
ErrOutOfOrderExemplar = errors.New("out of order exemplar")
ErrDuplicateExemplar = errors.New("duplicate exemplar")
ErrExemplarLabelLength = fmt.Errorf("label length for exemplar exceeds maximum of %d UTF-8 characters", exemplar.ExemplarMaxLabelSetLength)

View file

@ -717,3 +717,56 @@ func (h *chunkIteratorHeap) Pop() interface{} {
*h = old[0 : n-1]
return x
}
// NewConcatenatingChunkSeriesMerger returns a VerticalChunkSeriesMergeFunc that simply concatenates the
// chunks from the series. The resultant stream of chunks for a series might be overlapping and unsorted.
func NewConcatenatingChunkSeriesMerger() VerticalChunkSeriesMergeFunc {
return func(series ...ChunkSeries) ChunkSeries {
if len(series) == 0 {
return nil
}
return &ChunkSeriesEntry{
Lset: series[0].Labels(),
ChunkIteratorFn: func() chunks.Iterator {
iterators := make([]chunks.Iterator, 0, len(series))
for _, s := range series {
iterators = append(iterators, s.Iterator())
}
return &concatenatingChunkIterator{
iterators: iterators,
}
},
}
}
}
type concatenatingChunkIterator struct {
iterators []chunks.Iterator
idx int
curr chunks.Meta
}
func (c *concatenatingChunkIterator) At() chunks.Meta {
return c.curr
}
func (c *concatenatingChunkIterator) Next() bool {
if c.idx >= len(c.iterators) {
return false
}
if c.iterators[c.idx].Next() {
c.curr = c.iterators[c.idx].At()
return true
}
c.idx++
return c.Next()
}
func (c *concatenatingChunkIterator) Err() error {
errs := tsdb_errors.NewMulti()
for _, iter := range c.iterators {
errs.Add(iter.Err())
}
return errs.Err()
}

View file

@ -499,6 +499,140 @@ func TestCompactingChunkSeriesMerger(t *testing.T) {
}
}
func TestConcatenatingChunkSeriesMerger(t *testing.T) {
m := NewConcatenatingChunkSeriesMerger()
for _, tc := range []struct {
name string
input []ChunkSeries
expected ChunkSeries
}{
{
name: "single empty series",
input: []ChunkSeries{
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
},
expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
},
{
name: "single series",
input: []ChunkSeries{
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}}),
},
expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}}),
},
{
name: "two empty series",
input: []ChunkSeries{
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
},
expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil, nil),
},
{
name: "two non overlapping",
input: []ChunkSeries{
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{5, 5}}),
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}),
},
expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{5, 5}}, []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}),
},
{
name: "two overlapping",
input: []ChunkSeries{
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{8, 8}}),
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}),
},
expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{8, 8}},
[]tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}},
),
},
{
name: "two duplicated",
input: []ChunkSeries{
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}),
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{5, 5}}),
},
expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}},
[]tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{5, 5}},
),
},
{
name: "three overlapping",
input: []ChunkSeries{
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}),
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{6, 6}}),
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{0, 0}, sample{4, 4}}),
},
expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}},
[]tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{6, 6}},
[]tsdbutil.Sample{sample{0, 0}, sample{4, 4}},
),
},
{
name: "three in chained overlap",
input: []ChunkSeries{
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}),
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{4, 4}, sample{6, 66}}),
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{6, 6}, sample{10, 10}}),
},
expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}},
[]tsdbutil.Sample{sample{4, 4}, sample{6, 66}},
[]tsdbutil.Sample{sample{6, 6}, sample{10, 10}},
),
},
{
name: "three in chained overlap complex",
input: []ChunkSeries{
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{0, 0}, sample{5, 5}}, []tsdbutil.Sample{sample{10, 10}, sample{15, 15}}),
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{2, 2}, sample{20, 20}}, []tsdbutil.Sample{sample{25, 25}, sample{30, 30}}),
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{18, 18}, sample{26, 26}}, []tsdbutil.Sample{sample{31, 31}, sample{35, 35}}),
},
expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
[]tsdbutil.Sample{sample{0, 0}, sample{5, 5}}, []tsdbutil.Sample{sample{10, 10}, sample{15, 15}},
[]tsdbutil.Sample{sample{2, 2}, sample{20, 20}}, []tsdbutil.Sample{sample{25, 25}, sample{30, 30}},
[]tsdbutil.Sample{sample{18, 18}, sample{26, 26}}, []tsdbutil.Sample{sample{31, 31}, sample{35, 35}},
),
},
{
name: "110 overlapping",
input: []ChunkSeries{
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(0, 110)), // [0 - 110)
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(60, 50)), // [60 - 110)
},
expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
tsdbutil.GenerateSamples(0, 110),
tsdbutil.GenerateSamples(60, 50),
),
},
{
name: "150 overlapping samples, simply concatenated and no splits",
input: []ChunkSeries{
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(0, 90)), // [0 - 90)
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(60, 90)), // [90 - 150)
},
expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
tsdbutil.GenerateSamples(0, 90),
tsdbutil.GenerateSamples(60, 90),
),
},
} {
t.Run(tc.name, func(t *testing.T) {
merged := m(tc.input...)
require.Equal(t, tc.expected.Labels(), merged.Labels())
actChks, actErr := ExpandChunks(merged.Iterator())
expChks, expErr := ExpandChunks(tc.expected.Iterator())
require.Equal(t, expErr, actErr)
require.Equal(t, expChks, actChks)
})
}
}
type mockQuerier struct {
LabelQuerier

View file

@ -567,8 +567,7 @@ func (db *DB) truncate(mint int64) error {
// Start a new segment so low ingestion volume instances don't have more WAL
// than needed.
err = db.wal.NextSegment()
if err != nil {
if _, err := db.wal.NextSegment(); err != nil {
return errors.Wrap(err, "next segment")
}

View file

@ -116,7 +116,7 @@ type ChunkWriter interface {
// ChunkReader provides reading access of serialized time series data.
type ChunkReader interface {
// Chunk returns the series data chunk with the given reference.
Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error)
Chunk(meta chunks.Meta) (chunkenc.Chunk, error)
// Close releases all underlying resources of the reader.
Close() error
@ -189,12 +189,39 @@ type BlockMetaCompaction struct {
// this block.
Parents []BlockDesc `json:"parents,omitempty"`
Failed bool `json:"failed,omitempty"`
// Additional information about the compaction, for example, block created from out-of-order chunks.
Hints []string `json:"hints,omitempty"`
}
func (bm *BlockMetaCompaction) SetOutOfOrder() {
if bm.containsHint(CompactionHintFromOutOfOrder) {
return
}
bm.Hints = append(bm.Hints, CompactionHintFromOutOfOrder)
sort.Strings(bm.Hints)
}
func (bm *BlockMetaCompaction) FromOutOfOrder() bool {
return bm.containsHint(CompactionHintFromOutOfOrder)
}
func (bm *BlockMetaCompaction) containsHint(hint string) bool {
for _, h := range bm.Hints {
if h == hint {
return true
}
}
return false
}
const (
indexFilename = "index"
metaFilename = "meta.json"
metaVersion1 = 1
// CompactionHintFromOutOfOrder is a hint noting that the block
// was created from out-of-order chunks.
CompactionHintFromOutOfOrder = "from-out-of-order"
)
func chunkDir(dir string) string { return filepath.Join(dir, "chunks") }

View file

@ -27,6 +27,7 @@ import (
"testing"
"github.com/go-kit/log"
prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"
"github.com/prometheus/prometheus/model/labels"
@ -487,7 +488,7 @@ func createBlockFromHead(tb testing.TB, dir string, head *Head) string {
func createHead(tb testing.TB, w *wal.WAL, series []storage.Series, chunkDir string) *Head {
opts := DefaultHeadOptions()
opts.ChunkDirRoot = chunkDir
head, err := NewHead(nil, nil, w, opts, nil)
head, err := NewHead(nil, nil, w, nil, opts, nil)
require.NoError(tb, err)
app := head.Appender(context.Background())
@ -506,6 +507,66 @@ func createHead(tb testing.TB, w *wal.WAL, series []storage.Series, chunkDir str
return head
}
func createHeadWithOOOSamples(tb testing.TB, w *wal.WAL, series []storage.Series, chunkDir string, oooSampleFrequency int) *Head {
opts := DefaultHeadOptions()
opts.ChunkDirRoot = chunkDir
opts.OutOfOrderTimeWindow.Store(10000000000)
head, err := NewHead(nil, nil, w, nil, opts, nil)
require.NoError(tb, err)
oooSampleLabels := make([]labels.Labels, 0, len(series))
oooSamples := make([]tsdbutil.SampleSlice, 0, len(series))
totalSamples := 0
app := head.Appender(context.Background())
for _, s := range series {
ref := storage.SeriesRef(0)
it := s.Iterator()
lset := s.Labels()
os := tsdbutil.SampleSlice{}
count := 0
for it.Next() {
totalSamples++
count++
t, v := it.At()
if count%oooSampleFrequency == 0 {
os = append(os, sample{t: t, v: v})
continue
}
ref, err = app.Append(ref, lset, t, v)
require.NoError(tb, err)
}
require.NoError(tb, it.Err())
if len(os) > 0 {
oooSampleLabels = append(oooSampleLabels, lset)
oooSamples = append(oooSamples, os)
}
}
require.NoError(tb, app.Commit())
oooSamplesAppended := 0
require.Equal(tb, float64(0), prom_testutil.ToFloat64(head.metrics.outOfOrderSamplesAppended))
app = head.Appender(context.Background())
for i, lset := range oooSampleLabels {
ref := storage.SeriesRef(0)
for _, sample := range oooSamples[i] {
ref, err = app.Append(ref, lset, sample.T(), sample.V())
require.NoError(tb, err)
oooSamplesAppended++
}
}
require.NoError(tb, app.Commit())
actOOOAppended := prom_testutil.ToFloat64(head.metrics.outOfOrderSamplesAppended)
require.GreaterOrEqual(tb, actOOOAppended, float64(oooSamplesAppended-len(series)))
require.LessOrEqual(tb, actOOOAppended, float64(oooSamplesAppended))
require.Equal(tb, float64(totalSamples), prom_testutil.ToFloat64(head.metrics.samplesAppended))
return head
}
const (
defaultLabelName = "labelName"
defaultLabelValue = "labelValue"

View file

@ -39,7 +39,7 @@ type BlockWriter struct {
}
// ErrNoSeriesAppended is returned if the series count is zero while flushing blocks.
var ErrNoSeriesAppended error = errors.New("no series appended, aborting")
var ErrNoSeriesAppended = errors.New("no series appended, aborting")
// NewBlockWriter create a new block writer.
//
@ -71,7 +71,7 @@ func (w *BlockWriter) initHead() error {
opts := DefaultHeadOptions()
opts.ChunkRange = w.blockSize
opts.ChunkDirRoot = w.chunkDir
h, err := NewHead(nil, w.logger, nil, opts, NewHeadStats())
h, err := NewHead(nil, w.logger, nil, nil, opts, NewHeadStats())
if err != nil {
return errors.Wrap(err, "tsdb.NewHead")
}

View file

@ -39,6 +39,21 @@ const (
EncXOR
)
// Chunk encodings for out-of-order chunks.
// These encodings must be only used by the Head block for its internal bookkeeping.
const (
OutOfOrderMask = 0b10000000
EncOOOXOR = EncXOR | OutOfOrderMask
)
func IsOutOfOrderChunk(e Encoding) bool {
return (e & OutOfOrderMask) != 0
}
func IsValidEncoding(e Encoding) bool {
return e == EncXOR || e == EncOOOXOR
}
// Chunk holds a sequence of sample pairs that can be iterated over and appended to.
type Chunk interface {
// Bytes returns the underlying byte slice of the chunk.
@ -155,7 +170,7 @@ func NewPool() Pool {
func (p *pool) Get(e Encoding, b []byte) (Chunk, error) {
switch e {
case EncXOR:
case EncXOR, EncOOOXOR:
c := p.xor.Get().(*XORChunk)
c.b.stream = b
c.b.count = 0
@ -166,7 +181,7 @@ func (p *pool) Get(e Encoding, b []byte) (Chunk, error) {
func (p *pool) Put(c Chunk) error {
switch c.Encoding() {
case EncXOR:
case EncXOR, EncOOOXOR:
xc, ok := c.(*XORChunk)
// This may happen often with wrapped chunks. Nothing we can really do about
// it but returning an error would cause a lot of allocations again. Thus,
@ -188,7 +203,7 @@ func (p *pool) Put(c Chunk) error {
// bytes.
func FromData(e Encoding, d []byte) (Chunk, error) {
switch e {
case EncXOR:
case EncXOR, EncOOOXOR:
return &XORChunk{b: bstream{count: 0, stream: d}}, nil
}
return nil, errors.Errorf("invalid chunk encoding %q", e)

View file

@ -457,3 +457,12 @@ func (it *xorIterator) readValue() bool {
it.numRead++
return true
}
// OOOXORChunk holds a XORChunk and overrides the Encoding() method.
type OOOXORChunk struct {
*XORChunk
}
func (c *OOOXORChunk) Encoding() Encoding {
return EncOOOXOR
}

View file

@ -121,6 +121,15 @@ type Meta struct {
// Time range the data covers.
// When MaxTime == math.MaxInt64 the chunk is still open and being appended to.
MinTime, MaxTime int64
// OOOLastRef, OOOLastMinTime and OOOLastMaxTime are kept as markers for
// overlapping chunks.
// These fields point to the last created out of order Chunk (the head) that existed
// when Series() was called and was overlapping.
// Series() and Chunk() method responses should be consistent for the same
// query even if new data is added in between the calls.
OOOLastRef ChunkRef
OOOLastMinTime, OOOLastMaxTime int64
}
// Iterator iterates over the chunks of a single time series.
@ -556,8 +565,8 @@ func (s *Reader) Size() int64 {
}
// Chunk returns a chunk from a given reference.
func (s *Reader) Chunk(ref ChunkRef) (chunkenc.Chunk, error) {
sgmIndex, chkStart := BlockChunkRef(ref).Unpack()
func (s *Reader) Chunk(meta Meta) (chunkenc.Chunk, error) {
sgmIndex, chkStart := BlockChunkRef(meta.Ref).Unpack()
if sgmIndex >= len(s.bs) {
return nil, errors.Errorf("segment index %d out of range", sgmIndex)

View file

@ -23,6 +23,6 @@ func TestReaderWithInvalidBuffer(t *testing.T) {
b := realByteSlice([]byte{0x81, 0x81, 0x81, 0x81, 0x81, 0x81})
r := &Reader{bs: []ByteSlice{b}}
_, err := r.Chunk(0)
_, err := r.Chunk(Meta{Ref: 0})
require.Error(t, err)
}

View file

@ -87,6 +87,18 @@ func (ref ChunkDiskMapperRef) Unpack() (seq, offset int) {
return seq, offset
}
func (ref ChunkDiskMapperRef) GreaterThanOrEqualTo(r ChunkDiskMapperRef) bool {
s1, o1 := ref.Unpack()
s2, o2 := r.Unpack()
return s1 > s2 || (s1 == s2 && o1 >= o2)
}
func (ref ChunkDiskMapperRef) GreaterThan(r ChunkDiskMapperRef) bool {
s1, o1 := ref.Unpack()
s2, o2 := r.Unpack()
return s1 > s2 || (s1 == s2 && o1 > o2)
}
// CorruptionErr is an error that's returned when corruption is encountered.
type CorruptionErr struct {
Dir string
@ -736,7 +748,7 @@ func (cdm *ChunkDiskMapper) Chunk(ref ChunkDiskMapperRef) (chunkenc.Chunk, error
// and runs the provided function with information about each chunk. It returns on the first error encountered.
// NOTE: This method needs to be called at least once after creating ChunkDiskMapper
// to set the maxt of all the file.
func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16) error) (err error) {
func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding) error) (err error) {
cdm.writePathMtx.Lock()
defer cdm.writePathMtx.Unlock()
@ -799,7 +811,8 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
break
}
idx += ChunkEncodingSize // Skip encoding.
chkEnc := chunkenc.Encoding(mmapFile.byteSlice.Range(idx, idx+ChunkEncodingSize)[0])
idx += ChunkEncodingSize
dataLen, n := binary.Uvarint(mmapFile.byteSlice.Range(idx, idx+MaxChunkLengthFieldSize))
idx += n
@ -834,7 +847,7 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
mmapFile.maxt = maxt
}
if err := f(seriesRef, chunkRef, mint, maxt, numSamples); err != nil {
if err := f(seriesRef, chunkRef, mint, maxt, numSamples, chkEnc); err != nil {
if cerr, ok := err.(*CorruptionErr); ok {
cerr.Dir = cdm.dir.Name()
cerr.FileIndex = segID
@ -857,12 +870,8 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
return nil
}
// Truncate deletes the head chunk files which are strictly below the mint.
// mint should be in milliseconds.
func (cdm *ChunkDiskMapper) Truncate(mint int64) error {
if !cdm.fileMaxtSet {
return errors.New("maxt of the files are not set")
}
// Truncate deletes the head chunk files whose file number is less than given fileNo.
func (cdm *ChunkDiskMapper) Truncate(fileNo uint32) error {
cdm.readPathMtx.RLock()
// Sort the file indices, else if files deletion fails in between,
@ -875,12 +884,10 @@ func (cdm *ChunkDiskMapper) Truncate(mint int64) error {
var removedFiles []int
for _, seq := range chkFileIndices {
if seq == cdm.curFileSequence || cdm.mmappedChunkFiles[seq].maxt >= mint {
if seq == cdm.curFileSequence || uint32(seq) >= fileNo {
break
}
if cdm.mmappedChunkFiles[seq].maxt < mint {
removedFiles = append(removedFiles, seq)
}
removedFiles = append(removedFiles, seq)
}
cdm.readPathMtx.RUnlock()

View file

@ -58,6 +58,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) {
mint, maxt int64
numSamples uint16
chunk chunkenc.Chunk
isOOO bool
}
expectedData := []expectedDataType{}
@ -67,7 +68,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) {
for hrw.curFileSequence < 3 || hrw.chkWriter.Buffered() == 0 {
addChunks := func(numChunks int) {
for i := 0; i < numChunks; i++ {
seriesRef, chkRef, mint, maxt, chunk := createChunk(t, totalChunks, hrw)
seriesRef, chkRef, mint, maxt, chunk, isOOO := createChunk(t, totalChunks, hrw)
totalChunks++
expectedData = append(expectedData, expectedDataType{
seriesRef: seriesRef,
@ -76,6 +77,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) {
chunkRef: chkRef,
chunk: chunk,
numSamples: uint16(chunk.NumSamples()),
isOOO: isOOO,
})
if hrw.curFileSequence != 1 {
@ -147,7 +149,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) {
hrw = createChunkDiskMapper(t, dir)
idx := 0
require.NoError(t, hrw.IterateAllChunks(func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16) error {
require.NoError(t, hrw.IterateAllChunks(func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding) error {
t.Helper()
expData := expectedData[idx]
@ -156,6 +158,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) {
require.Equal(t, expData.maxt, maxt)
require.Equal(t, expData.maxt, maxt)
require.Equal(t, expData.numSamples, numSamples)
require.Equal(t, expData.isOOO, chunkenc.IsOutOfOrderChunk(encoding))
actChunk, err := hrw.Chunk(expData.chunkRef)
require.NoError(t, err)
@ -178,9 +181,7 @@ func TestChunkDiskMapper_Truncate(t *testing.T) {
}()
timeRange := 0
fileTimeStep := 100
var thirdFileMinT, sixthFileMinT int64
addChunk := func() int {
addChunk := func() {
t.Helper()
step := 100
@ -194,8 +195,6 @@ func TestChunkDiskMapper_Truncate(t *testing.T) {
<-awaitCb
require.NoError(t, err)
timeRange += step
return mint
}
verifyFiles := func(remainingFiles []int) {
@ -216,17 +215,12 @@ func TestChunkDiskMapper_Truncate(t *testing.T) {
// Create segments 1 to 7.
for i := 1; i <= 7; i++ {
hrw.CutNewFile()
mint := int64(addChunk())
if i == 3 {
thirdFileMinT = mint
} else if i == 6 {
sixthFileMinT = mint
}
addChunk()
}
verifyFiles([]int{1, 2, 3, 4, 5, 6, 7})
// Truncating files.
require.NoError(t, hrw.Truncate(thirdFileMinT))
require.NoError(t, hrw.Truncate(3))
// Add a chunk to trigger cutting of new file.
addChunk()
@ -245,11 +239,11 @@ func TestChunkDiskMapper_Truncate(t *testing.T) {
verifyFiles([]int{3, 4, 5, 6, 7, 8, 9})
// Truncating files after restart.
require.NoError(t, hrw.Truncate(sixthFileMinT))
require.NoError(t, hrw.Truncate(6))
verifyFiles([]int{6, 7, 8, 9})
// Truncating a second time without adding a chunk shouldn't create a new file.
require.NoError(t, hrw.Truncate(sixthFileMinT+1))
require.NoError(t, hrw.Truncate(6))
verifyFiles([]int{6, 7, 8, 9})
// Add a chunk to trigger cutting of new file.
@ -257,8 +251,12 @@ func TestChunkDiskMapper_Truncate(t *testing.T) {
verifyFiles([]int{6, 7, 8, 9, 10})
// Truncation by file number.
require.NoError(t, hrw.Truncate(8))
verifyFiles([]int{8, 9, 10})
// Truncating till current time should not delete the current active file.
require.NoError(t, hrw.Truncate(int64(timeRange+(2*fileTimeStep))))
require.NoError(t, hrw.Truncate(10))
// Add a chunk to trigger cutting of new file.
addChunk()
@ -335,8 +333,7 @@ func TestChunkDiskMapper_Truncate_PreservesFileSequence(t *testing.T) {
// Truncating files till 2. It should not delete anything after 3 (inclusive)
// though files 4 and 6 are empty.
file2Maxt := hrw.mmappedChunkFiles[2].maxt
require.NoError(t, hrw.Truncate(file2Maxt+1))
require.NoError(t, hrw.Truncate(3))
verifyFiles([]int{3, 4, 5, 6})
// Add chunk, so file 6 is not empty anymore.
@ -344,8 +341,7 @@ func TestChunkDiskMapper_Truncate_PreservesFileSequence(t *testing.T) {
verifyFiles([]int{3, 4, 5, 6})
// Truncating till file 3 should also delete file 4, because it is empty.
file3Maxt := hrw.mmappedChunkFiles[3].maxt
require.NoError(t, hrw.Truncate(file3Maxt+1))
require.NoError(t, hrw.Truncate(5))
addChunk()
verifyFiles([]int{5, 6, 7})
@ -381,7 +377,7 @@ func TestHeadReadWriter_TruncateAfterFailedIterateChunks(t *testing.T) {
hrw = createChunkDiskMapper(t, dir)
// Forcefully failing IterateAllChunks.
require.Error(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16) error {
require.Error(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16, _ chunkenc.Encoding) error {
return errors.New("random error")
}))
@ -471,7 +467,9 @@ func createChunkDiskMapper(t *testing.T, dir string) *ChunkDiskMapper {
hrw, err := NewChunkDiskMapper(nil, dir, chunkenc.NewPool(), DefaultWriteBufferSize, writeQueueSize)
require.NoError(t, err)
require.False(t, hrw.fileMaxtSet)
require.NoError(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16) error { return nil }))
require.NoError(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16, _ chunkenc.Encoding) error {
return nil
}))
require.True(t, hrw.fileMaxtSet)
return hrw
@ -488,13 +486,17 @@ func randomChunk(t *testing.T) chunkenc.Chunk {
return chunk
}
func createChunk(t *testing.T, idx int, hrw *ChunkDiskMapper) (seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, chunk chunkenc.Chunk) {
func createChunk(t *testing.T, idx int, hrw *ChunkDiskMapper) (seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, chunk chunkenc.Chunk, isOOO bool) {
var err error
seriesRef = HeadSeriesRef(rand.Int63())
mint = int64((idx)*1000 + 1)
maxt = int64((idx + 1) * 1000)
chunk = randomChunk(t)
awaitCb := make(chan struct{})
if rand.Intn(2) == 0 {
isOOO = true
chunk = &chunkenc.OOOXORChunk{XORChunk: chunk.(*chunkenc.XORChunk)}
}
chunkRef = hrw.WriteChunk(seriesRef, mint, maxt, chunk, func(cbErr error) {
require.NoError(t, err)
close(awaitCb)

View file

@ -1080,7 +1080,7 @@ func BenchmarkCompactionFromHead(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
h, err := NewHead(nil, nil, nil, opts, nil)
h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
for ln := 0; ln < labelNames; ln++ {
app := h.Appender(context.Background())

View file

@ -33,6 +33,7 @@ import (
"github.com/oklog/ulid"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/atomic"
"golang.org/x/sync/errgroup"
"github.com/prometheus/prometheus/config"
@ -69,18 +70,19 @@ var ErrNotReady = errors.New("TSDB not ready")
// millisecond precision timestamps.
func DefaultOptions() *Options {
return &Options{
WALSegmentSize: wal.DefaultSegmentSize,
MaxBlockChunkSegmentSize: chunks.DefaultChunkSegmentSize,
RetentionDuration: int64(15 * 24 * time.Hour / time.Millisecond),
MinBlockDuration: DefaultBlockDuration,
MaxBlockDuration: DefaultBlockDuration,
NoLockfile: false,
AllowOverlappingBlocks: false,
WALCompression: false,
StripeSize: DefaultStripeSize,
HeadChunksWriteBufferSize: chunks.DefaultWriteBufferSize,
IsolationDisabled: defaultIsolationDisabled,
HeadChunksWriteQueueSize: chunks.DefaultWriteQueueSize,
WALSegmentSize: wal.DefaultSegmentSize,
MaxBlockChunkSegmentSize: chunks.DefaultChunkSegmentSize,
RetentionDuration: int64(15 * 24 * time.Hour / time.Millisecond),
MinBlockDuration: DefaultBlockDuration,
MaxBlockDuration: DefaultBlockDuration,
NoLockfile: false,
AllowOverlappingCompaction: false,
AllowOverlappingQueries: false,
WALCompression: false,
StripeSize: DefaultStripeSize,
HeadChunksWriteBufferSize: chunks.DefaultWriteBufferSize,
IsolationDisabled: defaultIsolationDisabled,
OutOfOrderCapMax: DefaultOutOfOrderCapMax,
}
}
@ -112,9 +114,19 @@ type Options struct {
// NoLockfile disables creation and consideration of a lock file.
NoLockfile bool
// Overlapping blocks are allowed if AllowOverlappingBlocks is true.
// This in-turn enables vertical compaction and vertical query merge.
AllowOverlappingBlocks bool
// Querying on overlapping blocks are allowed if AllowOverlappingQueries is true.
// Since querying is a required operation for TSDB, if there are going to be
// overlapping blocks, then this should be set to true.
// NOTE: Do not use this directly in DB. Use it via DB.AllowOverlappingQueries().
AllowOverlappingQueries bool
// Compaction of overlapping blocks are allowed if AllowOverlappingCompaction is true.
// This is an optional flag for overlapping blocks.
// The reason why this flag exists is because there are various users of the TSDB
// that do not want vertical compaction happening on ingest time. Instead,
// they'd rather keep overlapping blocks and let another component do the overlapping compaction later.
// For Prometheus, this will always be enabled if overlapping queries is enabled.
AllowOverlappingCompaction bool
// WALCompression will turn on Snappy compression for records on the WAL.
WALCompression bool
@ -160,6 +172,15 @@ type Options struct {
// Disables isolation between reads and in-flight appends.
IsolationDisabled bool
// OutOfOrderTimeWindow specifies how much out of order is allowed, if any.
// This can change during run-time, so this value from here should only be used
// while initialising.
OutOfOrderTimeWindow int64
// OutOfOrderCapMax is maximum capacity for OOO chunks (in samples).
// If it is <=0, the default value is assumed.
OutOfOrderCapMax int64
}
type BlocksToDeleteFunc func(blocks []*Block) map[ulid.ULID]struct{}
@ -197,6 +218,13 @@ type DB struct {
// Cancel a running compaction when a shutdown is initiated.
compactCancel context.CancelFunc
// oooWasEnabled is true if out of order support was enabled at least one time
// during the time TSDB was up. In which case we need to keep supporting
// out-of-order compaction and vertical queries.
oooWasEnabled atomic.Bool
registerer prometheus.Registerer
}
type dbMetrics struct {
@ -372,9 +400,17 @@ func (db *DBReadOnly) FlushWAL(dir string) (returnErr error) {
if err != nil {
return err
}
var wbl *wal.WAL
wblDir := filepath.Join(db.dir, wal.WblDirName)
if _, err := os.Stat(wblDir); !os.IsNotExist(err) {
wbl, err = wal.Open(db.logger, wblDir)
if err != nil {
return err
}
}
opts := DefaultHeadOptions()
opts.ChunkDirRoot = db.dir
head, err := NewHead(nil, db.logger, w, opts, NewHeadStats())
head, err := NewHead(nil, db.logger, w, wbl, opts, NewHeadStats())
if err != nil {
return err
}
@ -430,7 +466,7 @@ func (db *DBReadOnly) loadDataAsQueryable(maxt int64) (storage.SampleAndChunkQue
opts := DefaultHeadOptions()
opts.ChunkDirRoot = db.dir
head, err := NewHead(nil, db.logger, nil, opts, NewHeadStats())
head, err := NewHead(nil, db.logger, nil, nil, opts, NewHeadStats())
if err != nil {
return nil, err
}
@ -448,9 +484,17 @@ func (db *DBReadOnly) loadDataAsQueryable(maxt int64) (storage.SampleAndChunkQue
if err != nil {
return nil, err
}
var wbl *wal.WAL
wblDir := filepath.Join(db.dir, wal.WblDirName)
if _, err := os.Stat(wblDir); !os.IsNotExist(err) {
wbl, err = wal.Open(db.logger, wblDir)
if err != nil {
return nil, err
}
}
opts := DefaultHeadOptions()
opts.ChunkDirRoot = db.dir
head, err = NewHead(nil, db.logger, w, opts, NewHeadStats())
head, err = NewHead(nil, db.logger, w, wbl, opts, NewHeadStats())
if err != nil {
return nil, err
}
@ -598,6 +642,15 @@ func validateOpts(opts *Options, rngs []int64) (*Options, []int64) {
if opts.MinBlockDuration > opts.MaxBlockDuration {
opts.MaxBlockDuration = opts.MinBlockDuration
}
if opts.OutOfOrderTimeWindow > 0 {
opts.AllowOverlappingQueries = true
}
if opts.OutOfOrderCapMax <= 0 {
opts.OutOfOrderCapMax = DefaultOutOfOrderCapMax
}
if opts.OutOfOrderTimeWindow < 0 {
opts.OutOfOrderTimeWindow = 0
}
if len(rngs) == 0 {
// Start with smallest block duration and create exponential buckets until the exceed the
@ -634,6 +687,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
}
walDir := filepath.Join(dir, "wal")
wblDir := filepath.Join(dir, wal.WblDirName)
// Migrate old WAL if one exists.
if err := MigrateWAL(l, walDir); err != nil {
@ -656,6 +710,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
autoCompact: true,
chunkPool: chunkenc.NewPool(),
blocksToDelete: opts.BlocksToDelete,
registerer: r,
}
defer func() {
// Close files if startup fails somewhere.
@ -694,7 +749,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
}
db.compactCancel = cancel
var wlog *wal.WAL
var wlog, wblog *wal.WAL
segmentSize := wal.DefaultSegmentSize
// Wal is enabled.
if opts.WALSegmentSize >= 0 {
@ -706,8 +761,19 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
if err != nil {
return nil, err
}
// Check if there is a WBL on disk, in which case we should replay that data.
wblSize, err := fileutil.DirSize(wblDir)
if err != nil && !os.IsNotExist(err) {
return nil, err
}
if opts.OutOfOrderTimeWindow > 0 || wblSize > 0 {
wblog, err = wal.NewSize(l, r, wblDir, segmentSize, opts.WALCompression)
if err != nil {
return nil, err
}
}
}
db.oooWasEnabled.Store(opts.OutOfOrderTimeWindow > 0)
headOpts := DefaultHeadOptions()
headOpts.ChunkRange = rngs[0]
headOpts.ChunkDirRoot = dir
@ -719,11 +785,13 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
headOpts.EnableExemplarStorage = opts.EnableExemplarStorage
headOpts.MaxExemplars.Store(opts.MaxExemplars)
headOpts.EnableMemorySnapshotOnShutdown = opts.EnableMemorySnapshotOnShutdown
headOpts.OutOfOrderTimeWindow.Store(opts.OutOfOrderTimeWindow)
headOpts.OutOfOrderCapMax.Store(opts.OutOfOrderCapMax)
if opts.IsolationDisabled {
// We only override this flag if isolation is disabled at DB level. We use the default otherwise.
headOpts.IsolationDisabled = opts.IsolationDisabled
}
db.head, err = NewHead(r, l, wlog, headOpts, stats.Head)
db.head, err = NewHead(r, l, wlog, wblog, headOpts, stats.Head)
if err != nil {
return nil, err
}
@ -741,20 +809,36 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
}
// Set the min valid time for the ingested samples
// to be no lower than the maxt of the last block.
blocks := db.Blocks()
minValidTime := int64(math.MinInt64)
if len(blocks) > 0 {
minValidTime = blocks[len(blocks)-1].Meta().MaxTime
// We do not consider blocks created from out-of-order samples for Head's minValidTime
// since minValidTime is only for the in-order data and we do not want to discard unnecessary
// samples from the Head.
inOrderMaxTime, ok := db.inOrderBlocksMaxTime()
if ok {
minValidTime = inOrderMaxTime
}
if initErr := db.head.Init(minValidTime); initErr != nil {
db.head.metrics.walCorruptionsTotal.Inc()
level.Warn(db.logger).Log("msg", "Encountered WAL read error, attempting repair", "err", initErr)
if err := wlog.Repair(initErr); err != nil {
return nil, errors.Wrap(err, "repair corrupted WAL")
isOOOErr := isErrLoadOOOWal(initErr)
if isOOOErr {
level.Warn(db.logger).Log("msg", "Encountered OOO WAL read error, attempting repair", "err", initErr)
if err := wblog.Repair(initErr); err != nil {
return nil, errors.Wrap(err, "repair corrupted OOO WAL")
}
} else {
level.Warn(db.logger).Log("msg", "Encountered WAL read error, attempting repair", "err", initErr)
if err := wlog.Repair(initErr); err != nil {
return nil, errors.Wrap(err, "repair corrupted WAL")
}
}
}
if db.head.MinOOOTime() != int64(math.MaxInt64) {
// Some OOO data was replayed from the disk that needs compaction and cleanup.
db.oooWasEnabled.Store(true)
}
go db.run()
return db, nil
@ -846,8 +930,58 @@ func (db *DB) Appender(ctx context.Context) storage.Appender {
return dbAppender{db: db, Appender: db.head.Appender(ctx)}
}
// ApplyConfig applies a new config to the DB.
// Behaviour of 'OutOfOrderTimeWindow' is as follows:
// OOO enabled = oooTimeWindow > 0. OOO disabled = oooTimeWindow is 0.
// 1) Before: OOO disabled, Now: OOO enabled =>
// - A new WBL is created for the head block.
// - OOO compaction is enabled.
// - Overlapping queries are enabled.
//
// 2) Before: OOO enabled, Now: OOO enabled =>
// - Only the time window is updated.
//
// 3) Before: OOO enabled, Now: OOO disabled =>
// - Time Window set to 0. So no new OOO samples will be allowed.
// - OOO WBL will stay and will be eventually cleaned up.
// - OOO Compaction and overlapping queries will remain enabled until a restart or until all OOO samples are compacted.
//
// 4) Before: OOO disabled, Now: OOO disabled => no-op.
func (db *DB) ApplyConfig(conf *config.Config) error {
return db.head.ApplyConfig(conf)
oooTimeWindow := int64(0)
if conf.StorageConfig.TSDBConfig != nil {
oooTimeWindow = conf.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
}
if oooTimeWindow < 0 {
oooTimeWindow = 0
}
// Create WBL if it was not present and if OOO is enabled with WAL enabled.
var wblog *wal.WAL
var err error
if db.head.wbl != nil {
// The existing WBL from the disk might have been replayed while OOO was disabled.
wblog = db.head.wbl
} else if !db.oooWasEnabled.Load() && oooTimeWindow > 0 && db.opts.WALSegmentSize >= 0 {
segmentSize := wal.DefaultSegmentSize
// Wal is set to a custom size.
if db.opts.WALSegmentSize > 0 {
segmentSize = db.opts.WALSegmentSize
}
oooWalDir := filepath.Join(db.dir, wal.WblDirName)
wblog, err = wal.NewSize(db.logger, db.registerer, oooWalDir, segmentSize, db.opts.WALCompression)
if err != nil {
return err
}
}
db.opts.OutOfOrderTimeWindow = oooTimeWindow
db.head.ApplyConfig(conf, wblog)
if !db.oooWasEnabled.Load() {
db.oooWasEnabled.Store(oooTimeWindow > 0)
}
return nil
}
// dbAppender wraps the DB's head appender and triggers compactions on commit
@ -946,6 +1080,14 @@ func (db *DB) Compact() (returnErr error) {
"block_range", db.head.chunkRange.Load(),
)
}
if lastBlockMaxt != math.MinInt64 {
// The head was compacted, so we compact OOO head as well.
if err := db.compactOOOHead(); err != nil {
return errors.Wrap(err, "compact ooo head")
}
}
return db.compactBlocks()
}
@ -964,6 +1106,102 @@ func (db *DB) CompactHead(head *RangeHead) error {
return nil
}
// CompactOOOHead compacts the OOO Head.
func (db *DB) CompactOOOHead() error {
db.cmtx.Lock()
defer db.cmtx.Unlock()
return db.compactOOOHead()
}
func (db *DB) compactOOOHead() error {
if !db.oooWasEnabled.Load() {
return nil
}
oooHead, err := NewOOOCompactionHead(db.head)
if err != nil {
return errors.Wrap(err, "get ooo compaction head")
}
ulids, err := db.compactOOO(db.dir, oooHead)
if err != nil {
return errors.Wrap(err, "compact ooo head")
}
if err := db.reloadBlocks(); err != nil {
errs := tsdb_errors.NewMulti(err)
for _, uid := range ulids {
if errRemoveAll := os.RemoveAll(filepath.Join(db.dir, uid.String())); errRemoveAll != nil {
errs.Add(errRemoveAll)
}
}
return errors.Wrap(errs.Err(), "reloadBlocks blocks after failed compact ooo head")
}
lastWBLFile, minOOOMmapRef := oooHead.LastWBLFile(), oooHead.LastMmapRef()
if lastWBLFile != 0 || minOOOMmapRef != 0 {
if err := db.head.truncateOOO(lastWBLFile, minOOOMmapRef); err != nil {
return errors.Wrap(err, "truncate ooo wbl")
}
}
return nil
}
// compactOOO creates a new block per possible block range in the compactor's directory from the OOO Head given.
// Each ULID in the result corresponds to a block in a unique time range.
func (db *DB) compactOOO(dest string, oooHead *OOOCompactionHead) (_ []ulid.ULID, err error) {
start := time.Now()
blockSize := oooHead.ChunkRange()
oooHeadMint, oooHeadMaxt := oooHead.MinTime(), oooHead.MaxTime()
ulids := make([]ulid.ULID, 0)
defer func() {
if err != nil {
// Best effort removal of created block on any error.
for _, uid := range ulids {
_ = os.RemoveAll(filepath.Join(db.dir, uid.String()))
}
}
}()
for t := blockSize * (oooHeadMint / blockSize); t <= oooHeadMaxt; t = t + blockSize {
mint, maxt := t, t+blockSize
// Block intervals are half-open: [b.MinTime, b.MaxTime). Block intervals are always +1 than the total samples it includes.
uid, err := db.compactor.Write(dest, oooHead.CloneForTimeRange(mint, maxt-1), mint, maxt, nil)
if err != nil {
return nil, err
}
if uid.Compare(ulid.ULID{}) != 0 {
ulids = append(ulids, uid)
blockDir := filepath.Join(dest, uid.String())
meta, _, err := readMetaFile(blockDir)
if err != nil {
return ulids, errors.Wrap(err, "read meta")
}
meta.Compaction.SetOutOfOrder()
_, err = writeMetaFile(db.logger, blockDir, meta)
if err != nil {
return ulids, errors.Wrap(err, "write meta")
}
}
}
if len(ulids) == 0 {
level.Info(db.logger).Log(
"msg", "compact ooo head resulted in no blocks",
"duration", time.Since(start),
)
return nil, nil
}
level.Info(db.logger).Log(
"msg", "out-of-order compaction completed",
"duration", time.Since(start),
"ulids", fmt.Sprintf("%v", ulids),
)
return ulids, nil
}
// compactHead compacts the given RangeHead.
// The compaction mutex should be held before calling this method.
func (db *DB) compactHead(head *RangeHead) error {
@ -1038,10 +1276,11 @@ func (db *DB) reload() error {
if err := db.reloadBlocks(); err != nil {
return errors.Wrap(err, "reloadBlocks")
}
if len(db.blocks) == 0 {
maxt, ok := db.inOrderBlocksMaxTime()
if !ok {
return nil
}
if err := db.head.Truncate(db.blocks[len(db.blocks)-1].MaxTime()); err != nil {
if err := db.head.Truncate(maxt); err != nil {
return errors.Wrap(err, "head truncate")
}
return nil
@ -1121,7 +1360,7 @@ func (db *DB) reloadBlocks() (err error) {
sort.Slice(toLoad, func(i, j int) bool {
return toLoad[i].Meta().MinTime < toLoad[j].Meta().MinTime
})
if !db.opts.AllowOverlappingBlocks {
if !db.AllowOverlappingQueries() {
if err := validateBlockSequence(toLoad); err != nil {
return errors.Wrap(err, "invalid block sequence")
}
@ -1151,6 +1390,10 @@ func (db *DB) reloadBlocks() (err error) {
return nil
}
func (db *DB) AllowOverlappingQueries() bool {
return db.opts.AllowOverlappingQueries || db.oooWasEnabled.Load()
}
func openBlocks(l log.Logger, dir string, loaded []*Block, chunkPool chunkenc.Pool) (blocks []*Block, corrupted map[ulid.ULID]error, err error) {
bDirs, err := blockDirs(dir)
if err != nil {
@ -1428,6 +1671,21 @@ func (db *DB) Blocks() []*Block {
return db.blocks
}
// inOrderBlocksMaxTime returns the max time among the blocks that were not totally created
// out of out-of-order data. If the returned boolean is true, it means there is at least
// one such block.
func (db *DB) inOrderBlocksMaxTime() (maxt int64, ok bool) {
maxt, ok = int64(math.MinInt64), false
// If blocks are overlapping, last block might not have the max time. So check all blocks.
for _, b := range db.Blocks() {
if !b.meta.Compaction.FromOutOfOrder() && b.meta.MaxTime > maxt {
ok = true
maxt = b.meta.MaxTime
}
}
return maxt, ok
}
// Head returns the databases's head.
func (db *DB) Head() *Head {
return db.head
@ -1526,13 +1784,13 @@ func (db *DB) Querier(_ context.Context, mint, maxt int64) (storage.Querier, err
blocks = append(blocks, b)
}
}
var headQuerier storage.Querier
var inOrderHeadQuerier storage.Querier
if maxt >= db.head.MinTime() {
rh := NewRangeHead(db.head, mint, maxt)
var err error
headQuerier, err = NewBlockQuerier(rh, mint, maxt)
inOrderHeadQuerier, err = NewBlockQuerier(rh, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open querier for head %s", rh)
return nil, errors.Wrapf(err, "open block querier for head %s", rh)
}
// Getting the querier above registers itself in the queue that the truncation waits on.
@ -1540,20 +1798,30 @@ func (db *DB) Querier(_ context.Context, mint, maxt int64) (storage.Querier, err
// won't run into a race later since any truncation that comes after will wait on this querier if it overlaps.
shouldClose, getNew, newMint := db.head.IsQuerierCollidingWithTruncation(mint, maxt)
if shouldClose {
if err := headQuerier.Close(); err != nil {
return nil, errors.Wrapf(err, "closing head querier %s", rh)
if err := inOrderHeadQuerier.Close(); err != nil {
return nil, errors.Wrapf(err, "closing head block querier %s", rh)
}
headQuerier = nil
inOrderHeadQuerier = nil
}
if getNew {
rh := NewRangeHead(db.head, newMint, maxt)
headQuerier, err = NewBlockQuerier(rh, newMint, maxt)
inOrderHeadQuerier, err = NewBlockQuerier(rh, newMint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open querier for head while getting new querier %s", rh)
return nil, errors.Wrapf(err, "open block querier for head while getting new querier %s", rh)
}
}
}
var outOfOrderHeadQuerier storage.Querier
if overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime()) {
rh := NewOOORangeHead(db.head, mint, maxt)
var err error
outOfOrderHeadQuerier, err = NewBlockQuerier(rh, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open block querier for ooo head %s", rh)
}
}
blockQueriers := make([]storage.Querier, 0, len(blocks))
for _, b := range blocks {
q, err := NewBlockQuerier(b, mint, maxt)
@ -1568,14 +1836,18 @@ func (db *DB) Querier(_ context.Context, mint, maxt int64) (storage.Querier, err
}
return nil, errors.Wrapf(err, "open querier for block %s", b)
}
if headQuerier != nil {
blockQueriers = append(blockQueriers, headQuerier)
if inOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, inOrderHeadQuerier)
}
if outOfOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, outOfOrderHeadQuerier)
}
return storage.NewMergeQuerier(blockQueriers, nil, storage.ChainedSeriesMerge), nil
}
// ChunkQuerier returns a new chunk querier over the data partition for the given time range.
func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
// blockQueriersForRange returns individual block chunk queriers from the persistent blocks, in-order head block, and the
// out-of-order head block, overlapping with the given time range.
func (db *DB) blockChunkQuerierForRange(mint, maxt int64) ([]storage.ChunkQuerier, error) {
var blocks []BlockReader
db.mtx.RLock()
@ -1586,11 +1858,11 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu
blocks = append(blocks, b)
}
}
var headQuerier storage.ChunkQuerier
var inOrderHeadQuerier storage.ChunkQuerier
if maxt >= db.head.MinTime() {
rh := NewRangeHead(db.head, mint, maxt)
var err error
headQuerier, err = NewBlockChunkQuerier(rh, mint, maxt)
inOrderHeadQuerier, err = NewBlockChunkQuerier(rh, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open querier for head %s", rh)
}
@ -1600,20 +1872,30 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu
// won't run into a race later since any truncation that comes after will wait on this querier if it overlaps.
shouldClose, getNew, newMint := db.head.IsQuerierCollidingWithTruncation(mint, maxt)
if shouldClose {
if err := headQuerier.Close(); err != nil {
if err := inOrderHeadQuerier.Close(); err != nil {
return nil, errors.Wrapf(err, "closing head querier %s", rh)
}
headQuerier = nil
inOrderHeadQuerier = nil
}
if getNew {
rh := NewRangeHead(db.head, newMint, maxt)
headQuerier, err = NewBlockChunkQuerier(rh, newMint, maxt)
inOrderHeadQuerier, err = NewBlockChunkQuerier(rh, newMint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open querier for head while getting new querier %s", rh)
}
}
}
var outOfOrderHeadQuerier storage.ChunkQuerier
if overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime()) {
rh := NewOOORangeHead(db.head, mint, maxt)
var err error
outOfOrderHeadQuerier, err = NewBlockChunkQuerier(rh, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open block chunk querier for ooo head %s", rh)
}
}
blockQueriers := make([]storage.ChunkQuerier, 0, len(blocks))
for _, b := range blocks {
q, err := NewBlockChunkQuerier(b, mint, maxt)
@ -1628,10 +1910,22 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu
}
return nil, errors.Wrapf(err, "open querier for block %s", b)
}
if headQuerier != nil {
blockQueriers = append(blockQueriers, headQuerier)
if inOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, inOrderHeadQuerier)
}
if outOfOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, outOfOrderHeadQuerier)
}
return blockQueriers, nil
}
// ChunkQuerier returns a new chunk querier over the data partition for the given time range.
func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
blockQueriers, err := db.blockChunkQuerierForRange(mint, maxt)
if err != nil {
return nil, err
}
return storage.NewMergeChunkQuerier(blockQueriers, nil, storage.NewCompactingChunkSeriesMerger(storage.ChainedSeriesMerge)), nil
}

File diff suppressed because it is too large Load diff

View file

@ -25,9 +25,10 @@ import (
"github.com/go-kit/log/level"
"github.com/oklog/ulid"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/atomic"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/model/exemplar"
"github.com/prometheus/prometheus/model/labels"
@ -62,15 +63,19 @@ var (
type Head struct {
chunkRange atomic.Int64
numSeries atomic.Uint64
minTime, maxTime atomic.Int64 // Current min and max of the samples included in the head.
minOOOTime, maxOOOTime atomic.Int64 // TODO(jesusvazquez) These should be updated after garbage collection.
minTime, maxTime atomic.Int64 // Current min and max of the samples included in the head. TODO(jesusvazquez) Ensure these are properly tracked.
minValidTime atomic.Int64 // Mint allowed to be added to the head. It shouldn't be lower than the maxt of the last persisted block.
lastWALTruncationTime atomic.Int64
lastMemoryTruncationTime atomic.Int64
lastSeriesID atomic.Uint64
// All the ooo m-map chunks should be after this. This is used to truncate old ooo m-map chunks.
// This should be typecasted to chunks.ChunkDiskMapperRef after loading.
minOOOMmapRef atomic.Uint64
metrics *headMetrics
opts *HeadOptions
wal *wal.WAL
wal, wbl *wal.WAL
exemplarMetrics *ExemplarMetrics
exemplars ExemplarStorage
logger log.Logger
@ -87,6 +92,7 @@ type Head struct {
deletedMtx sync.Mutex
deleted map[chunks.HeadSeriesRef]int // Deleted series, and what WAL segment they must be kept until.
// TODO(codesome): Extend MemPostings to return only OOOPostings, Set OOOStatus, ... Like an additional map of ooo postings.
postings *index.MemPostings // Postings lists for terms.
tombstones *tombstones.MemTombstones
@ -130,6 +136,8 @@ type HeadOptions struct {
ChunkPool chunkenc.Pool
ChunkWriteBufferSize int
ChunkWriteQueueSize int
OutOfOrderTimeWindow atomic.Int64
OutOfOrderCapMax atomic.Int64
// StripeSize sets the number of entries in the hash map, it must be a power of 2.
// A larger StripeSize will allocate more memory up-front, but will increase performance when handling a large number of series.
@ -142,8 +150,13 @@ type HeadOptions struct {
IsolationDisabled bool
}
const (
// DefaultOutOfOrderCapMax is the default maximum size of an in-memory out-of-order chunk.
DefaultOutOfOrderCapMax int64 = 32
)
func DefaultHeadOptions() *HeadOptions {
return &HeadOptions{
ho := &HeadOptions{
ChunkRange: DefaultBlockDuration,
ChunkDirRoot: "",
ChunkPool: chunkenc.NewPool(),
@ -153,6 +166,8 @@ func DefaultHeadOptions() *HeadOptions {
SeriesCallback: &noopSeriesLifecycleCallback{},
IsolationDisabled: defaultIsolationDisabled,
}
ho.OutOfOrderCapMax.Store(DefaultOutOfOrderCapMax)
return ho
}
// SeriesLifecycleCallback specifies a list of callbacks that will be called during a lifecycle of a series.
@ -171,11 +186,23 @@ type SeriesLifecycleCallback interface {
}
// NewHead opens the head block in dir.
func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, opts *HeadOptions, stats *HeadStats) (*Head, error) {
func NewHead(r prometheus.Registerer, l log.Logger, wal, wbl *wal.WAL, opts *HeadOptions, stats *HeadStats) (*Head, error) {
var err error
if l == nil {
l = log.NewNopLogger()
}
if opts.OutOfOrderTimeWindow.Load() < 0 {
opts.OutOfOrderTimeWindow.Store(0)
}
// Time window can be set on runtime. So the capMin and capMax should be valid
// even if ooo is not enabled yet.
capMax := opts.OutOfOrderCapMax.Load()
if capMax <= 0 || capMax > 255 {
return nil, errors.Errorf("OOOCapMax of %d is invalid. must be > 0 and <= 255", capMax)
}
if opts.ChunkRange < 1 {
return nil, errors.Errorf("invalid chunk range %d", opts.ChunkRange)
}
@ -193,6 +220,7 @@ func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, opts *HeadOpti
h := &Head{
wal: wal,
wbl: wbl,
logger: l,
opts: opts,
memChunkPool: sync.Pool{
@ -254,35 +282,40 @@ func (h *Head) resetInMemoryState() error {
h.chunkRange.Store(h.opts.ChunkRange)
h.minTime.Store(math.MaxInt64)
h.maxTime.Store(math.MinInt64)
h.minOOOTime.Store(math.MaxInt64)
h.maxOOOTime.Store(math.MinInt64)
h.lastWALTruncationTime.Store(math.MinInt64)
h.lastMemoryTruncationTime.Store(math.MinInt64)
return nil
}
type headMetrics struct {
activeAppenders prometheus.Gauge
series prometheus.GaugeFunc
seriesCreated prometheus.Counter
seriesRemoved prometheus.Counter
seriesNotFound prometheus.Counter
chunks prometheus.Gauge
chunksCreated prometheus.Counter
chunksRemoved prometheus.Counter
gcDuration prometheus.Summary
samplesAppended prometheus.Counter
outOfBoundSamples prometheus.Counter
outOfOrderSamples prometheus.Counter
walTruncateDuration prometheus.Summary
walCorruptionsTotal prometheus.Counter
walTotalReplayDuration prometheus.Gauge
headTruncateFail prometheus.Counter
headTruncateTotal prometheus.Counter
checkpointDeleteFail prometheus.Counter
checkpointDeleteTotal prometheus.Counter
checkpointCreationFail prometheus.Counter
checkpointCreationTotal prometheus.Counter
mmapChunkCorruptionTotal prometheus.Counter
snapshotReplayErrorTotal prometheus.Counter // Will be either 0 or 1.
activeAppenders prometheus.Gauge
series prometheus.GaugeFunc
seriesCreated prometheus.Counter
seriesRemoved prometheus.Counter
seriesNotFound prometheus.Counter
chunks prometheus.Gauge
chunksCreated prometheus.Counter
chunksRemoved prometheus.Counter
gcDuration prometheus.Summary
samplesAppended prometheus.Counter
outOfOrderSamplesAppended prometheus.Counter
outOfBoundSamples prometheus.Counter
outOfOrderSamples prometheus.Counter
tooOldSamples prometheus.Counter
walTruncateDuration prometheus.Summary
walCorruptionsTotal prometheus.Counter
dataTotalReplayDuration prometheus.Gauge
headTruncateFail prometheus.Counter
headTruncateTotal prometheus.Counter
checkpointDeleteFail prometheus.Counter
checkpointDeleteTotal prometheus.Counter
checkpointCreationFail prometheus.Counter
checkpointCreationTotal prometheus.Counter
mmapChunkCorruptionTotal prometheus.Counter
snapshotReplayErrorTotal prometheus.Counter // Will be either 0 or 1.
oooHistogram prometheus.Histogram
}
func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
@ -333,7 +366,7 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
Name: "prometheus_tsdb_wal_corruptions_total",
Help: "Total number of WAL corruptions.",
}),
walTotalReplayDuration: prometheus.NewGauge(prometheus.GaugeOpts{
dataTotalReplayDuration: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "prometheus_tsdb_data_replay_duration_seconds",
Help: "Time taken to replay the data on disk.",
}),
@ -341,13 +374,21 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
Name: "prometheus_tsdb_head_samples_appended_total",
Help: "Total number of appended samples.",
}),
outOfOrderSamplesAppended: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_head_out_of_order_samples_appended_total",
Help: "Total number of appended out of order samples.",
}),
outOfBoundSamples: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_out_of_bound_samples_total",
Help: "Total number of out of bound samples ingestion failed attempts.",
Help: "Total number of out of bound samples ingestion failed attempts with out of order support disabled.",
}),
outOfOrderSamples: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_out_of_order_samples_total",
Help: "Total number of out of order samples ingestion failed attempts.",
Help: "Total number of out of order samples ingestion failed attempts due to out of order being disabled.",
}),
tooOldSamples: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_too_old_samples_total",
Help: "Total number of out of order samples ingestion failed attempts with out of support enabled, but sample outside of time window.",
}),
headTruncateFail: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_head_truncations_failed_total",
@ -381,6 +422,19 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
Name: "prometheus_tsdb_snapshot_replay_error_total",
Help: "Total number snapshot replays that failed.",
}),
oooHistogram: prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "prometheus_tsdb_sample_ooo_delta",
Help: "Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).",
Buckets: []float64{
60 * 10, // 10 min
60 * 30, // 30 min
60 * 60, // 60 min
60 * 60 * 2, // 2h
60 * 60 * 3, // 3h
60 * 60 * 6, // 6h
60 * 60 * 12, // 12h
},
}),
}
if r != nil {
@ -396,10 +450,12 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
m.gcDuration,
m.walTruncateDuration,
m.walCorruptionsTotal,
m.walTotalReplayDuration,
m.dataTotalReplayDuration,
m.samplesAppended,
m.outOfOrderSamplesAppended,
m.outOfBoundSamples,
m.outOfOrderSamples,
m.tooOldSamples,
m.headTruncateFail,
m.headTruncateTotal,
m.checkpointDeleteFail,
@ -517,8 +573,9 @@ func (h *Head) Init(minValidTime int64) error {
}
mmapChunkReplayStart := time.Now()
mmappedChunks, err := h.loadMmappedChunks(refSeries)
mmappedChunks, oooMmappedChunks, lastMmapRef, err := h.loadMmappedChunks(refSeries)
if err != nil {
// TODO(codesome): clear out all m-map chunks here for refSeries.
level.Error(h.logger).Log("msg", "Loading on-disk chunks failed", "err", err)
if _, ok := errors.Cause(err).(*chunks.CorruptionErr); ok {
h.metrics.mmapChunkCorruptionTotal.Inc()
@ -529,7 +586,7 @@ func (h *Head) Init(minValidTime int64) error {
// If this fails, data will be recovered from WAL.
// Hence we wont lose any data (given WAL is not corrupt).
mmappedChunks, err = h.removeCorruptedMmappedChunks(err)
mmappedChunks, oooMmappedChunks, lastMmapRef, err = h.removeCorruptedMmappedChunks(err)
if err != nil {
return err
}
@ -572,7 +629,7 @@ func (h *Head) Init(minValidTime int64) error {
// A corrupted checkpoint is a hard error for now and requires user
// intervention. There's likely little data that can be recovered anyway.
if err := h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks); err != nil {
if err := h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks, oooMmappedChunks); err != nil {
return errors.Wrap(err, "backfill checkpoint")
}
h.updateWALReplayStatusRead(startFrom)
@ -605,7 +662,7 @@ func (h *Head) Init(minValidTime int64) error {
if err != nil {
return errors.Wrapf(err, "segment reader (offset=%d)", offset)
}
err = h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks)
err = h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks, oooMmappedChunks)
if err := sr.Close(); err != nil {
level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
}
@ -615,26 +672,94 @@ func (h *Head) Init(minValidTime int64) error {
level.Info(h.logger).Log("msg", "WAL segment loaded", "segment", i, "maxSegment", endAt)
h.updateWALReplayStatusRead(i)
}
walReplayDuration := time.Since(walReplayStart)
walReplayDuration := time.Since(start)
h.metrics.walTotalReplayDuration.Set(walReplayDuration.Seconds())
wblReplayStart := time.Now()
if h.wbl != nil {
// Replay OOO WAL.
startFrom, endAt, e = wal.Segments(h.wbl.Dir())
if e != nil {
return errors.Wrap(e, "finding OOO WAL segments")
}
h.startWALReplayStatus(startFrom, endAt)
for i := startFrom; i <= endAt; i++ {
s, err := wal.OpenReadSegment(wal.SegmentName(h.wbl.Dir(), i))
if err != nil {
return errors.Wrap(err, fmt.Sprintf("open WBL segment: %d", i))
}
sr := wal.NewSegmentBufReader(s)
err = h.loadWBL(wal.NewReader(sr), multiRef, lastMmapRef)
if err := sr.Close(); err != nil {
level.Warn(h.logger).Log("msg", "Error while closing the wbl segments reader", "err", err)
}
if err != nil {
return err
}
level.Info(h.logger).Log("msg", "WBL segment loaded", "segment", i, "maxSegment", endAt)
h.updateWALReplayStatusRead(i)
}
}
wblReplayDuration := time.Since(wblReplayStart)
totalReplayDuration := time.Since(start)
h.metrics.dataTotalReplayDuration.Set(totalReplayDuration.Seconds())
level.Info(h.logger).Log(
"msg", "WAL replay completed",
"checkpoint_replay_duration", checkpointReplayDuration.String(),
"wal_replay_duration", time.Since(walReplayStart).String(),
"total_replay_duration", walReplayDuration.String(),
"wal_replay_duration", walReplayDuration.String(),
"wbl_replay_duration", wblReplayDuration.String(),
"total_replay_duration", totalReplayDuration.String(),
)
return nil
}
func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries) (map[chunks.HeadSeriesRef][]*mmappedChunk, error) {
func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries) (map[chunks.HeadSeriesRef][]*mmappedChunk, map[chunks.HeadSeriesRef][]*mmappedChunk, chunks.ChunkDiskMapperRef, error) {
mmappedChunks := map[chunks.HeadSeriesRef][]*mmappedChunk{}
if err := h.chunkDiskMapper.IterateAllChunks(func(seriesRef chunks.HeadSeriesRef, chunkRef chunks.ChunkDiskMapperRef, mint, maxt int64, numSamples uint16) error {
if maxt < h.minValidTime.Load() {
oooMmappedChunks := map[chunks.HeadSeriesRef][]*mmappedChunk{}
var lastRef, secondLastRef chunks.ChunkDiskMapperRef
if err := h.chunkDiskMapper.IterateAllChunks(func(seriesRef chunks.HeadSeriesRef, chunkRef chunks.ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding) error {
secondLastRef = lastRef
lastRef = chunkRef
isOOO := chunkenc.IsOutOfOrderChunk(encoding)
if !isOOO && maxt < h.minValidTime.Load() {
return nil
}
// We ignore any chunk that doesn't have a valid encoding
if !chunkenc.IsValidEncoding(encoding) {
return nil
}
ms, ok := refSeries[seriesRef]
if isOOO {
if !ok {
oooMmappedChunks[seriesRef] = append(oooMmappedChunks[seriesRef], &mmappedChunk{
ref: chunkRef,
minTime: mint,
maxTime: maxt,
numSamples: numSamples,
})
return nil
}
h.metrics.chunks.Inc()
h.metrics.chunksCreated.Inc()
ms.oooMmappedChunks = append(ms.oooMmappedChunks, &mmappedChunk{
ref: chunkRef,
minTime: mint,
maxTime: maxt,
numSamples: numSamples,
})
return nil
}
if !ok {
slice := mmappedChunks[seriesRef]
if len(slice) > 0 && slice[len(slice)-1].maxTime >= mint {
@ -677,45 +802,57 @@ func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries)
}
return nil
}); err != nil {
return nil, errors.Wrap(err, "iterate on on-disk chunks")
// secondLastRef because the lastRef caused an error.
return nil, nil, secondLastRef, errors.Wrap(err, "iterate on on-disk chunks")
}
return mmappedChunks, nil
return mmappedChunks, oooMmappedChunks, lastRef, nil
}
// removeCorruptedMmappedChunks attempts to delete the corrupted mmapped chunks and if it fails, it clears all the previously
// loaded mmapped chunks.
func (h *Head) removeCorruptedMmappedChunks(err error) (map[chunks.HeadSeriesRef][]*mmappedChunk, error) {
func (h *Head) removeCorruptedMmappedChunks(err error) (map[chunks.HeadSeriesRef][]*mmappedChunk, map[chunks.HeadSeriesRef][]*mmappedChunk, chunks.ChunkDiskMapperRef, error) {
level.Info(h.logger).Log("msg", "Deleting mmapped chunk files")
// We never want to preserve the in-memory series from snapshots if we are repairing m-map chunks.
if err := h.resetInMemoryState(); err != nil {
return nil, err
return map[chunks.HeadSeriesRef][]*mmappedChunk{}, map[chunks.HeadSeriesRef][]*mmappedChunk{}, 0, err
}
level.Info(h.logger).Log("msg", "Deleting mmapped chunk files")
if err := h.chunkDiskMapper.DeleteCorrupted(err); err != nil {
level.Info(h.logger).Log("msg", "Deletion of corrupted mmap chunk files failed, discarding chunk files completely", "err", err)
if err := h.chunkDiskMapper.Truncate(math.MaxInt64); err != nil {
if err := h.chunkDiskMapper.Truncate(math.MaxUint32); err != nil {
level.Error(h.logger).Log("msg", "Deletion of all mmap chunk files failed", "err", err)
}
return map[chunks.HeadSeriesRef][]*mmappedChunk{}, nil
return map[chunks.HeadSeriesRef][]*mmappedChunk{}, map[chunks.HeadSeriesRef][]*mmappedChunk{}, 0, nil
}
level.Info(h.logger).Log("msg", "Deletion of mmap chunk files successful, reattempting m-mapping the on-disk chunks")
mmappedChunks, err := h.loadMmappedChunks(make(map[chunks.HeadSeriesRef]*memSeries))
mmappedChunks, oooMmappedChunks, lastRef, err := h.loadMmappedChunks(make(map[chunks.HeadSeriesRef]*memSeries))
if err != nil {
level.Error(h.logger).Log("msg", "Loading on-disk chunks failed, discarding chunk files completely", "err", err)
if err := h.chunkDiskMapper.Truncate(math.MaxInt64); err != nil {
if err := h.chunkDiskMapper.Truncate(math.MaxUint32); err != nil {
level.Error(h.logger).Log("msg", "Deletion of all mmap chunk files failed after failed loading", "err", err)
}
mmappedChunks = map[chunks.HeadSeriesRef][]*mmappedChunk{}
}
return mmappedChunks, nil
return mmappedChunks, oooMmappedChunks, lastRef, nil
}
func (h *Head) ApplyConfig(cfg *config.Config) error {
func (h *Head) ApplyConfig(cfg *config.Config, wbl *wal.WAL) {
oooTimeWindow := int64(0)
if cfg.StorageConfig.TSDBConfig != nil {
oooTimeWindow = cfg.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
}
if oooTimeWindow < 0 {
oooTimeWindow = 0
}
h.SetOutOfOrderTimeWindow(oooTimeWindow, wbl)
if !h.opts.EnableExemplarStorage {
return nil
return
}
// Head uses opts.MaxExemplars in combination with opts.EnableExemplarStorage
@ -726,12 +863,21 @@ func (h *Head) ApplyConfig(cfg *config.Config) error {
newSize := h.opts.MaxExemplars.Load()
if prevSize == newSize {
return nil
return
}
migrated := h.exemplars.(*CircularExemplarStorage).Resize(newSize)
level.Info(h.logger).Log("msg", "Exemplar storage resized", "from", prevSize, "to", newSize, "migrated", migrated)
return nil
}
// SetOutOfOrderTimeWindow updates the out of order related parameters.
// If the Head already has a WBL set, then the wbl will be ignored.
func (h *Head) SetOutOfOrderTimeWindow(oooTimeWindow int64, wbl *wal.WAL) {
if oooTimeWindow > 0 && h.wbl == nil {
h.wbl = wbl
}
h.opts.OutOfOrderTimeWindow.Store(oooTimeWindow)
}
// PostingsCardinalityStats returns top 10 highest cardinality stats By label and value names.
@ -773,6 +919,27 @@ func (h *Head) updateMinMaxTime(mint, maxt int64) {
}
}
func (h *Head) updateMinOOOMaxOOOTime(mint, maxt int64) {
for {
lt := h.MinOOOTime()
if mint >= lt {
break
}
if h.minOOOTime.CompareAndSwap(lt, mint) {
break
}
}
for {
ht := h.MaxOOOTime()
if maxt <= ht {
break
}
if h.maxOOOTime.CompareAndSwap(ht, maxt) {
break
}
}
}
// SetMinValidTime sets the minimum timestamp the head can ingest.
func (h *Head) SetMinValidTime(minValidTime int64) {
h.minValidTime.Store(minValidTime)
@ -838,30 +1005,7 @@ func (h *Head) truncateMemory(mint int64) (err error) {
}
h.metrics.headTruncateTotal.Inc()
start := time.Now()
actualMint := h.gc()
level.Info(h.logger).Log("msg", "Head GC completed", "duration", time.Since(start))
h.metrics.gcDuration.Observe(time.Since(start).Seconds())
if actualMint > h.minTime.Load() {
// The actual mint of the Head is higher than the one asked to truncate.
appendableMinValidTime := h.appendableMinValidTime()
if actualMint < appendableMinValidTime {
h.minTime.Store(actualMint)
h.minValidTime.Store(actualMint)
} else {
// The actual min time is in the appendable window.
// So we set the mint to the appendableMinValidTime.
h.minTime.Store(appendableMinValidTime)
h.minValidTime.Store(appendableMinValidTime)
}
}
// Truncate the chunk m-mapper.
if err := h.chunkDiskMapper.Truncate(mint); err != nil {
return errors.Wrap(err, "truncate chunks.HeadReadWriter")
}
return nil
return h.truncateSeriesAndChunkDiskMapper("truncateMemory")
}
// WaitForPendingReadersInTimeRange waits for queries overlapping with given range to finish querying.
@ -950,7 +1094,7 @@ func (h *Head) truncateWAL(mint int64) error {
}
// Start a new segment, so low ingestion volume TSDB don't have more WAL than
// needed.
if err := h.wal.NextSegment(); err != nil {
if _, err := h.wal.NextSegment(); err != nil {
return errors.Wrap(err, "next segment")
}
last-- // Never consider last segment for checkpoint.
@ -1016,6 +1160,59 @@ func (h *Head) truncateWAL(mint int64) error {
return nil
}
// truncateOOO
// - truncates the OOO WBL files whose index is strictly less than lastWBLFile.
// - garbage collects all the m-map chunks from the memory that are less than or equal to minOOOMmapRef
// and then deletes the series that do not have any data anymore.
func (h *Head) truncateOOO(lastWBLFile int, minOOOMmapRef chunks.ChunkDiskMapperRef) error {
curMinOOOMmapRef := chunks.ChunkDiskMapperRef(h.minOOOMmapRef.Load())
if minOOOMmapRef.GreaterThan(curMinOOOMmapRef) {
h.minOOOMmapRef.Store(uint64(minOOOMmapRef))
if err := h.truncateSeriesAndChunkDiskMapper("truncateOOO"); err != nil {
return err
}
}
return h.wbl.Truncate(lastWBLFile)
}
// truncateSeriesAndChunkDiskMapper is a helper function for truncateMemory and truncateOOO.
// It runs GC on the Head and truncates the ChunkDiskMapper accordingly.
func (h *Head) truncateSeriesAndChunkDiskMapper(caller string) error {
start := time.Now()
headMaxt := h.MaxTime()
actualMint, minOOOTime, minMmapFile := h.gc()
level.Info(h.logger).Log("msg", "Head GC completed", "caller", caller, "duration", time.Since(start))
h.metrics.gcDuration.Observe(time.Since(start).Seconds())
if actualMint > h.minTime.Load() {
// The actual mint of the head is higher than the one asked to truncate.
appendableMinValidTime := h.appendableMinValidTime()
if actualMint < appendableMinValidTime {
h.minTime.Store(actualMint)
h.minValidTime.Store(actualMint)
} else {
// The actual min time is in the appendable window.
// So we set the mint to the appendableMinValidTime.
h.minTime.Store(appendableMinValidTime)
h.minValidTime.Store(appendableMinValidTime)
}
}
if headMaxt-h.opts.OutOfOrderTimeWindow.Load() < minOOOTime {
// The allowed OOO window is lower than the min OOO time seen during GC.
// So it is possible that some OOO sample was inserted that was less that minOOOTime.
// So we play safe and set it to the min that was possible.
minOOOTime = headMaxt - h.opts.OutOfOrderTimeWindow.Load()
}
h.minOOOTime.Store(minOOOTime)
// Truncate the chunk m-mapper.
if err := h.chunkDiskMapper.Truncate(uint32(minMmapFile)); err != nil {
return errors.Wrap(err, "truncate chunks.HeadReadWriter by file number")
}
return nil
}
type Stats struct {
NumSeries uint64
MinTime, MaxTime int64
@ -1149,14 +1346,20 @@ func (h *Head) Delete(mint, maxt int64, ms ...*labels.Matcher) error {
}
// gc removes data before the minimum timestamp from the head.
// It returns the actual min times of the chunks present in the Head.
func (h *Head) gc() int64 {
// It returns
// * The actual min times of the chunks present in the Head.
// * The min OOO time seen during the GC.
// * Min mmap file number seen in the series (in-order and out-of-order) after gc'ing the series.
func (h *Head) gc() (actualInOrderMint, minOOOTime int64, minMmapFile int) {
// Only data strictly lower than this timestamp must be deleted.
mint := h.MinTime()
// Only ooo m-map chunks strictly lower than or equal to this ref
// must be deleted.
minOOOMmapRef := chunks.ChunkDiskMapperRef(h.minOOOMmapRef.Load())
// Drop old chunks and remember series IDs and hashes if they can be
// deleted entirely.
deleted, chunksRemoved, actualMint := h.series.gc(mint)
deleted, chunksRemoved, actualInOrderMint, minOOOTime, minMmapFile := h.series.gc(mint, minOOOMmapRef)
seriesRemoved := len(deleted)
h.metrics.seriesRemoved.Add(float64(seriesRemoved))
@ -1186,7 +1389,7 @@ func (h *Head) gc() int64 {
h.deletedMtx.Unlock()
}
return actualMint
return actualInOrderMint, minOOOTime, minMmapFile
}
// Tombstones returns a new reader over the head's tombstones
@ -1224,6 +1427,18 @@ func (h *Head) MaxTime() int64 {
return h.maxTime.Load()
}
// MinOOOTime returns the lowest time bound on visible data in the out of order
// head.
func (h *Head) MinOOOTime() int64 {
return h.minOOOTime.Load()
}
// MaxOOOTime returns the highest timestamp on visible data in the out of order
// head.
func (h *Head) MaxOOOTime() int64 {
return h.maxOOOTime.Load()
}
// compactable returns whether the head has a compactable range.
// The head has a compactable range when the head time range is 1.5 times the chunk range.
// The 0.5 acts as a buffer of the appendable window.
@ -1241,6 +1456,9 @@ func (h *Head) Close() error {
if h.wal != nil {
errs.Add(h.wal.Close())
}
if h.wbl != nil {
errs.Add(h.wbl.Close())
}
if errs.Err() == nil && h.opts.EnableMemorySnapshotOnShutdown {
errs.Add(h.performChunkSnapshot())
}
@ -1271,7 +1489,7 @@ func (h *Head) getOrCreate(hash uint64, lset labels.Labels) (*memSeries, bool, e
func (h *Head) getOrCreateWithID(id chunks.HeadSeriesRef, hash uint64, lset labels.Labels) (*memSeries, bool, error) {
s, created, err := h.series.getOrSet(hash, lset, func() *memSeries {
return newMemSeries(lset, id, h.chunkRange.Load(), h.opts.IsolationDisabled)
return newMemSeries(lset, id, h.chunkRange.Load(), h.opts.OutOfOrderCapMax.Load(), h.opts.IsolationDisabled)
})
if err != nil {
return nil, false, err
@ -1333,7 +1551,7 @@ const (
)
// stripeSeries holds series by HeadSeriesRef ("ID") and also by hash of their labels.
// ID-based lookups via (getByID()) are preferred over getByHash() for performance reasons.
// ID-based lookups via getByID() are preferred over getByHash() for performance reasons.
// It locks modulo ranges of IDs and hashes to reduce lock contention.
// The locks are padded to not be on the same cache line. Filling the padded space
// with the maps was profiled to be slower likely due to the additional pointer
@ -1375,13 +1593,16 @@ func newStripeSeries(stripeSize int, seriesCallback SeriesLifecycleCallback) *st
// note: returning map[chunks.HeadSeriesRef]struct{} would be more accurate,
// but the returned map goes into postings.Delete() which expects a map[storage.SeriesRef]struct
// and there's no easy way to cast maps.
func (s *stripeSeries) gc(mint int64) (map[storage.SeriesRef]struct{}, int, int64) {
// minMmapFile is the min mmap file number seen in the series (in-order and out-of-order) after gc'ing the series.
func (s *stripeSeries) gc(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) (_ map[storage.SeriesRef]struct{}, _ int, _, _ int64, minMmapFile int) {
var (
deleted = map[storage.SeriesRef]struct{}{}
deletedForCallback = []labels.Labels{}
rmChunks = 0
actualMint int64 = math.MaxInt64
minOOOTime int64 = math.MaxInt64
)
minMmapFile = math.MaxInt32
// Run through all series and truncate old chunks. Mark those with no
// chunks left as deleted and store their ID.
for i := 0; i < s.size; i++ {
@ -1390,9 +1611,32 @@ func (s *stripeSeries) gc(mint int64) (map[storage.SeriesRef]struct{}, int, int6
for hash, all := range s.hashes[i] {
for _, series := range all {
series.Lock()
rmChunks += series.truncateChunksBefore(mint)
rmChunks += series.truncateChunksBefore(mint, minOOOMmapRef)
if len(series.mmappedChunks) > 0 || series.headChunk != nil || series.pendingCommit {
if len(series.mmappedChunks) > 0 {
seq, _ := series.mmappedChunks[0].ref.Unpack()
if seq < minMmapFile {
minMmapFile = seq
}
}
if len(series.oooMmappedChunks) > 0 {
seq, _ := series.oooMmappedChunks[0].ref.Unpack()
if seq < minMmapFile {
minMmapFile = seq
}
for _, ch := range series.oooMmappedChunks {
if ch.minTime < minOOOTime {
minOOOTime = ch.minTime
}
}
}
if series.oooHeadChunk != nil {
if series.oooHeadChunk.minTime < minOOOTime {
minOOOTime = series.oooHeadChunk.minTime
}
}
if len(series.mmappedChunks) > 0 || len(series.oooMmappedChunks) > 0 ||
series.headChunk != nil || series.oooHeadChunk != nil || series.pendingCommit {
seriesMint := series.minTime()
if seriesMint < actualMint {
actualMint = seriesMint
@ -1435,7 +1679,7 @@ func (s *stripeSeries) gc(mint int64) (map[storage.SeriesRef]struct{}, int, int6
actualMint = mint
}
return deleted, rmChunks, actualMint
return deleted, rmChunks, actualMint, minOOOTime, minMmapFile
}
func (s *stripeSeries) getByID(id chunks.HeadSeriesRef) *memSeries {
@ -1528,11 +1772,16 @@ type memSeries struct {
//
// pN is the pointer to the mmappedChunk referered to by HeadChunkID=N
mmappedChunks []*mmappedChunk
headChunk *memChunk // Most recent chunk in memory that's still being built.
firstChunkID chunks.HeadChunkID // HeadChunkID for mmappedChunks[0]
mmMaxTime int64 // Max time of any mmapped chunk, only used during WAL replay.
headChunk *memChunk // Most recent chunk in memory that's still being built.
chunkRange int64
firstChunkID chunks.HeadChunkID // HeadChunkID for mmappedChunks[0]
oooMmappedChunks []*mmappedChunk // Immutable chunks on disk containing OOO samples.
oooHeadChunk *oooHeadChunk // Most recent chunk for ooo samples in memory that's still being built.
firstOOOChunkID chunks.HeadChunkID // HeadOOOChunkID for oooMmappedChunks[0]
mmMaxTime int64 // Max time of any mmapped chunk, only used during WAL replay.
chunkRange int64
oooCapMax uint8
nextAt int64 // Timestamp at which to cut the next chunk.
@ -1551,12 +1800,13 @@ type memSeries struct {
pendingCommit bool // Whether there are samples waiting to be committed to this series.
}
func newMemSeries(lset labels.Labels, id chunks.HeadSeriesRef, chunkRange int64, isolationDisabled bool) *memSeries {
func newMemSeries(lset labels.Labels, id chunks.HeadSeriesRef, chunkRange, oooCapMax int64, isolationDisabled bool) *memSeries {
s := &memSeries{
lset: lset,
ref: id,
chunkRange: chunkRange,
nextAt: math.MinInt64,
oooCapMax: uint8(oooCapMax),
}
if !isolationDisabled {
s.txs = newTxRing(4)
@ -1575,6 +1825,7 @@ func (s *memSeries) minTime() int64 {
}
func (s *memSeries) maxTime() int64 {
// The highest timestamps will always be in the regular (non-OOO) chunks, even if OOO is enabled.
c := s.head()
if c != nil {
return c.maxTime
@ -1588,26 +1839,39 @@ func (s *memSeries) maxTime() int64 {
// truncateChunksBefore removes all chunks from the series that
// have no timestamp at or after mint.
// Chunk IDs remain unchanged.
func (s *memSeries) truncateChunksBefore(mint int64) (removed int) {
func (s *memSeries) truncateChunksBefore(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) int {
var removedInOrder int
if s.headChunk != nil && s.headChunk.maxTime < mint {
// If head chunk is truncated, we can truncate all mmapped chunks.
removed = 1 + len(s.mmappedChunks)
s.firstChunkID += chunks.HeadChunkID(removed)
removedInOrder = 1 + len(s.mmappedChunks)
s.firstChunkID += chunks.HeadChunkID(removedInOrder)
s.headChunk = nil
s.mmappedChunks = nil
return removed
}
if len(s.mmappedChunks) > 0 {
for i, c := range s.mmappedChunks {
if c.maxTime >= mint {
break
}
removed = i + 1
removedInOrder = i + 1
}
s.mmappedChunks = append(s.mmappedChunks[:0], s.mmappedChunks[removed:]...)
s.firstChunkID += chunks.HeadChunkID(removed)
s.mmappedChunks = append(s.mmappedChunks[:0], s.mmappedChunks[removedInOrder:]...)
s.firstChunkID += chunks.HeadChunkID(removedInOrder)
}
return removed
var removedOOO int
if len(s.oooMmappedChunks) > 0 {
for i, c := range s.oooMmappedChunks {
if c.ref.GreaterThan(minOOOMmapRef) {
break
}
removedOOO = i + 1
}
s.oooMmappedChunks = append(s.oooMmappedChunks[:0], s.oooMmappedChunks[removedOOO:]...)
s.firstOOOChunkID += chunks.HeadChunkID(removedOOO)
}
return removedInOrder + removedOOO
}
// cleanupAppendIDsBelow cleans up older appendIDs. Has to be called after
@ -1627,6 +1891,16 @@ type memChunk struct {
minTime, maxTime int64
}
type oooHeadChunk struct {
chunk *OOOChunk
minTime, maxTime int64 // can probably be removed and pulled out of the chunk instead
}
// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
func (mc *oooHeadChunk) OverlapsClosedInterval(mint, maxt int64) bool {
return overlapsClosedInterval(mc.minTime, mc.maxTime, mint, maxt)
}
// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
func (mc *memChunk) OverlapsClosedInterval(mint, maxt int64) bool {
return overlapsClosedInterval(mc.minTime, mc.maxTime, mint, maxt)
@ -1655,12 +1929,15 @@ func (noopSeriesLifecycleCallback) PostCreation(labels.Labels) {}
func (noopSeriesLifecycleCallback) PostDeletion(...labels.Labels) {}
func (h *Head) Size() int64 {
var walSize int64
var walSize, wblSize int64
if h.wal != nil {
walSize, _ = h.wal.Size()
}
if h.wbl != nil {
wblSize, _ = h.wbl.Size()
}
cdmSize, _ := h.chunkDiskMapper.Size()
return walSize + cdmSize
return walSize + wblSize + cdmSize
}
func (h *RangeHead) Size() int64 {

View file

@ -137,6 +137,8 @@ func (h *Head) appender() *headAppender {
minValidTime: h.appendableMinValidTime(),
mint: math.MaxInt64,
maxt: math.MinInt64,
headMaxt: h.MaxTime(),
oooTimeWindow: h.opts.OutOfOrderTimeWindow.Load(),
samples: h.getAppendBuffer(),
sampleSeries: h.getSeriesBuffer(),
exemplars: exemplarsBuf,
@ -252,9 +254,11 @@ type exemplarWithSeriesRef struct {
}
type headAppender struct {
head *Head
minValidTime int64 // No samples below this timestamp are allowed.
mint, maxt int64
head *Head
minValidTime int64 // No samples below this timestamp are allowed.
mint, maxt int64
headMaxt int64 // We track it here to not take the lock for every sample appended.
oooTimeWindow int64 // Use the same for the entire append, and don't load the atomic for each sample.
series []record.RefSeries // New series held by this appender.
metadata []record.RefMetadata // New metadata held by this appender.
@ -268,7 +272,9 @@ type headAppender struct {
}
func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64, v float64) (storage.SeriesRef, error) {
if t < a.minValidTime {
// For OOO inserts, this restriction is irrelevant and will be checked later once we confirm the sample is an in-order append.
// If OOO inserts are disabled, we may as well as check this as early as we can and avoid more work.
if a.oooTimeWindow == 0 && t < a.minValidTime {
a.head.metrics.outOfBoundSamples.Inc()
return 0, storage.ErrOutOfBounds
}
@ -300,15 +306,25 @@ func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64
}
s.Lock()
if err := s.appendable(t, v); err != nil {
s.Unlock()
if err == storage.ErrOutOfOrderSample {
// TODO(codesome): If we definitely know at this point that the sample is ooo, then optimise
// to skip that sample from the WAL and write only in the WBL.
_, delta, err := s.appendable(t, v, a.headMaxt, a.minValidTime, a.oooTimeWindow)
if err == nil {
s.pendingCommit = true
}
s.Unlock()
if delta > 0 {
a.head.metrics.oooHistogram.Observe(float64(delta))
}
if err != nil {
switch err {
case storage.ErrOutOfOrderSample:
a.head.metrics.outOfOrderSamples.Inc()
case storage.ErrTooOldSample:
a.head.metrics.tooOldSamples.Inc()
}
return 0, err
}
s.pendingCommit = true
s.Unlock()
if t < a.mint {
a.mint = t
@ -326,25 +342,46 @@ func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64
return storage.SeriesRef(s.ref), nil
}
// appendable checks whether the given sample is valid for appending to the series.
func (s *memSeries) appendable(t int64, v float64) error {
c := s.head()
if c == nil {
return nil
// appendable checks whether the given sample is valid for appending to the series. (if we return false and no error)
// The sample belongs to the out of order chunk if we return true and no error.
// An error signifies the sample cannot be handled.
func (s *memSeries) appendable(t int64, v float64, headMaxt, minValidTime, oooTimeWindow int64) (isOOO bool, oooDelta int64, err error) {
// Check if we can append in the in-order chunk.
if t >= minValidTime {
if s.head() == nil {
// The series has no sample and was freshly created.
return false, 0, nil
}
msMaxt := s.maxTime()
if t > msMaxt {
return false, 0, nil
}
if t == msMaxt {
// We are allowing exact duplicates as we can encounter them in valid cases
// like federation and erroring out at that time would be extremely noisy.
// This only checks against the latest in-order sample.
// The OOO headchunk has its own method to detect these duplicates.
if math.Float64bits(s.sampleBuf[3].v) != math.Float64bits(v) {
return false, 0, storage.ErrDuplicateSampleForTimestamp
}
// Sample is identical (ts + value) with most current (highest ts) sample in sampleBuf.
return false, 0, nil
}
}
if t > c.maxTime {
return nil
// The sample cannot go in the in-order chunk. Check if it can go in the out-of-order chunk.
if oooTimeWindow > 0 && t >= headMaxt-oooTimeWindow {
return true, headMaxt - t, nil
}
if t < c.maxTime {
return storage.ErrOutOfOrderSample
// The sample cannot go in both in-order and out-of-order chunk.
if oooTimeWindow > 0 {
return true, headMaxt - t, storage.ErrTooOldSample
}
// We are allowing exact duplicates as we can encounter them in valid cases
// like federation and erroring out at that time would be extremely noisy.
if math.Float64bits(s.sampleBuf[3].v) != math.Float64bits(v) {
return storage.ErrDuplicateSampleForTimestamp
if t < minValidTime {
return false, headMaxt - t, storage.ErrOutOfBounds
}
return nil
return false, headMaxt - t, storage.ErrOutOfOrderSample
}
// AppendExemplar for headAppender assumes the series ref already exists, and so it doesn't
@ -487,6 +524,7 @@ func exemplarsForEncoding(es []exemplarWithSeriesRef) []record.RefExemplar {
}
// Commit writes to the WAL and adds the data to the Head.
// TODO(codesome): Refactor this method to reduce indentation and make it more readable.
func (a *headAppender) Commit() (err error) {
if a.closed {
return ErrAppenderClosed
@ -517,24 +555,143 @@ func (a *headAppender) Commit() (err error) {
defer a.head.putMetadataBuffer(a.metadata)
defer a.head.iso.closeAppend(a.appendID)
total := len(a.samples)
var series *memSeries
var (
samplesAppended = len(a.samples)
oooAccepted int // number of samples out of order but accepted: with ooo enabled and within time window
oooRejected int // number of samples rejected due to: out of order but OOO support disabled.
tooOldRejected int // number of samples rejected due to: that are out of order but too old (OOO support enabled, but outside time window)
oobRejected int // number of samples rejected due to: out of bounds: with t < minValidTime (OOO support disabled)
inOrderMint int64 = math.MaxInt64
inOrderMaxt int64 = math.MinInt64
ooomint int64 = math.MaxInt64
ooomaxt int64 = math.MinInt64
wblSamples []record.RefSample
oooMmapMarkers map[chunks.HeadSeriesRef]chunks.ChunkDiskMapperRef
oooRecords [][]byte
series *memSeries
enc record.Encoder
)
defer func() {
for i := range oooRecords {
a.head.putBytesBuffer(oooRecords[i][:0])
}
}()
collectOOORecords := func() {
if a.head.wbl == nil {
// WBL is not enabled. So no need to collect.
wblSamples = nil
oooMmapMarkers = nil
return
}
// The m-map happens before adding a new sample. So we collect
// the m-map markers first, and then samples.
// WBL Graphically:
// WBL Before this Commit(): [old samples before this commit for chunk 1]
// WBL After this Commit(): [old samples before this commit for chunk 1][new samples in this commit for chunk 1]mmapmarker1[samples for chunk 2]mmapmarker2[samples for chunk 3]
if oooMmapMarkers != nil {
markers := make([]record.RefMmapMarker, 0, len(oooMmapMarkers))
for ref, mmapRef := range oooMmapMarkers {
markers = append(markers, record.RefMmapMarker{
Ref: ref,
MmapRef: mmapRef,
})
}
r := enc.MmapMarkers(markers, a.head.getBytesBuffer())
oooRecords = append(oooRecords, r)
}
if len(wblSamples) > 0 {
r := enc.Samples(wblSamples, a.head.getBytesBuffer())
oooRecords = append(oooRecords, r)
}
wblSamples = nil
oooMmapMarkers = nil
}
for i, s := range a.samples {
series = a.sampleSeries[i]
series.Lock()
ok, chunkCreated := series.append(s.T, s.V, a.appendID, a.head.chunkDiskMapper)
series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
series.pendingCommit = false
series.Unlock()
if !ok {
total--
a.head.metrics.outOfOrderSamples.Inc()
oooSample, _, err := series.appendable(s.T, s.V, a.headMaxt, a.minValidTime, a.oooTimeWindow)
switch err {
case storage.ErrOutOfOrderSample:
samplesAppended--
oooRejected++
case storage.ErrOutOfBounds:
samplesAppended--
oobRejected++
case storage.ErrTooOldSample:
samplesAppended--
tooOldRejected++
case nil:
// Do nothing.
default:
samplesAppended--
}
var ok, chunkCreated bool
if err == nil && oooSample {
// Sample is OOO and OOO handling is enabled
// and the delta is within the OOO tolerance.
var mmapRef chunks.ChunkDiskMapperRef
ok, chunkCreated, mmapRef = series.insert(s.T, s.V, a.head.chunkDiskMapper)
if chunkCreated {
r, ok := oooMmapMarkers[series.ref]
if !ok || r != 0 {
// !ok means there are no markers collected for these samples yet. So we first flush the samples
// before setting this m-map marker.
// r != 0 means we have already m-mapped a chunk for this series in the same Commit().
// Hence, before we m-map again, we should add the samples and m-map markers
// seen till now to the WBL records.
collectOOORecords()
}
if oooMmapMarkers == nil {
oooMmapMarkers = make(map[chunks.HeadSeriesRef]chunks.ChunkDiskMapperRef)
}
oooMmapMarkers[series.ref] = mmapRef
}
if ok {
wblSamples = append(wblSamples, s)
if s.T < ooomint {
ooomint = s.T
}
if s.T > ooomaxt {
ooomaxt = s.T
}
oooAccepted++
} else {
// Sample is an exact duplicate of the last sample.
// NOTE: We can only detect updates if they clash with a sample in the OOOHeadChunk,
// not with samples in already flushed OOO chunks.
// TODO(codesome): Add error reporting? It depends on addressing https://github.com/prometheus/prometheus/discussions/10305.
samplesAppended--
}
} else if err == nil {
ok, chunkCreated = series.append(s.T, s.V, a.appendID, a.head.chunkDiskMapper)
if ok {
if s.T < inOrderMint {
inOrderMint = s.T
}
if s.T > inOrderMaxt {
inOrderMaxt = s.T
}
} else {
// The sample is an exact duplicate, and should be silently dropped.
samplesAppended--
}
}
if chunkCreated {
a.head.metrics.chunks.Inc()
a.head.metrics.chunksCreated.Inc()
}
series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
series.pendingCommit = false
series.Unlock()
}
for i, m := range a.metadata {
@ -544,12 +701,48 @@ func (a *headAppender) Commit() (err error) {
series.Unlock()
}
a.head.metrics.samplesAppended.Add(float64(total))
a.head.updateMinMaxTime(a.mint, a.maxt)
a.head.metrics.outOfOrderSamples.Add(float64(oooRejected))
a.head.metrics.outOfBoundSamples.Add(float64(oobRejected))
a.head.metrics.tooOldSamples.Add(float64(tooOldRejected))
a.head.metrics.samplesAppended.Add(float64(samplesAppended))
a.head.metrics.outOfOrderSamplesAppended.Add(float64(oooAccepted))
a.head.updateMinMaxTime(inOrderMint, inOrderMaxt)
a.head.updateMinOOOMaxOOOTime(ooomint, ooomaxt)
collectOOORecords()
if a.head.wbl != nil {
if err := a.head.wbl.Log(oooRecords...); err != nil {
// TODO(codesome): Currently WBL logging of ooo samples is best effort here since we cannot try logging
// until we have found what samples become OOO. We can try having a metric for this failure.
// Returning the error here is not correct because we have already put the samples into the memory,
// hence the append/insert was a success.
level.Error(a.head.logger).Log("msg", "Failed to log out of order samples into the WAL", "err", err)
}
}
return nil
}
// insert is like append, except it inserts. Used for OOO samples.
func (s *memSeries) insert(t int64, v float64, chunkDiskMapper *chunks.ChunkDiskMapper) (inserted, chunkCreated bool, mmapRef chunks.ChunkDiskMapperRef) {
c := s.oooHeadChunk
if c == nil || c.chunk.NumSamples() == int(s.oooCapMax) {
// Note: If no new samples come in then we rely on compaction to clean up stale in-memory OOO chunks.
c, mmapRef = s.cutNewOOOHeadChunk(t, chunkDiskMapper)
chunkCreated = true
}
ok := c.chunk.Insert(t, v)
if ok {
if chunkCreated || t < c.minTime {
c.minTime = t
}
if chunkCreated || t > c.maxTime {
c.maxTime = t
}
}
return ok, chunkCreated, mmapRef
}
// append adds the sample (t, v) to the series. The caller also has to provide
// the appendID for isolation. (The appendID can be zero, which results in no
// isolation for this append.)
@ -567,7 +760,7 @@ func (s *memSeries) append(t int64, v float64, appendID uint64, chunkDiskMapper
// Out of order sample. Sample timestamp is already in the mmapped chunks, so ignore it.
return false, false
}
// There is no chunk in this series yet, create the first chunk for the sample.
// There is no head chunk in this series yet, create the first chunk for the sample.
c = s.cutNewHeadChunk(t, chunkDiskMapper)
chunkCreated = true
}
@ -651,6 +844,36 @@ func (s *memSeries) cutNewHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDis
return s.headChunk
}
func (s *memSeries) cutNewOOOHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDiskMapper) (*oooHeadChunk, chunks.ChunkDiskMapperRef) {
ref := s.mmapCurrentOOOHeadChunk(chunkDiskMapper)
s.oooHeadChunk = &oooHeadChunk{
chunk: NewOOOChunk(),
minTime: mint,
maxTime: math.MinInt64,
}
return s.oooHeadChunk, ref
}
func (s *memSeries) mmapCurrentOOOHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper) chunks.ChunkDiskMapperRef {
if s.oooHeadChunk == nil {
// There is no head chunk, so nothing to m-map here.
return 0
}
xor, _ := s.oooHeadChunk.chunk.ToXOR() // Encode to XorChunk which is more compact and implements all of the needed functionality.
oooXor := &chunkenc.OOOXORChunk{XORChunk: xor}
chunkRef := chunkDiskMapper.WriteChunk(s.ref, s.oooHeadChunk.minTime, s.oooHeadChunk.maxTime, oooXor, handleChunkWriteError)
s.oooMmappedChunks = append(s.oooMmappedChunks, &mmappedChunk{
ref: chunkRef,
numSamples: uint16(xor.NumSamples()),
minTime: s.oooHeadChunk.minTime,
maxTime: s.oooHeadChunk.maxTime,
})
s.oooHeadChunk = nil
return chunkRef
}
func (s *memSeries) mmapCurrentHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper) {
if s.headChunk == nil {
// There is no head chunk, so nothing to m-map here.

View file

@ -30,7 +30,7 @@ func BenchmarkHeadStripeSeriesCreate(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
h, err := NewHead(nil, nil, nil, opts, nil)
h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
defer h.Close()
@ -45,7 +45,7 @@ func BenchmarkHeadStripeSeriesCreateParallel(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
h, err := NewHead(nil, nil, nil, opts, nil)
h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
defer h.Close()
@ -69,7 +69,7 @@ func BenchmarkHeadStripeSeriesCreate_PreCreationFailure(b *testing.B) {
// Mock the PreCreation() callback to fail on each series.
opts.SeriesCallback = failingSeriesLifecycleCallback{}
h, err := NewHead(nil, nil, nil, opts, nil)
h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
defer h.Close()

View file

@ -183,11 +183,20 @@ func (h *headIndexReader) Series(ref storage.SeriesRef, lbls *labels.Labels, chk
return nil
}
// headChunkID returns the HeadChunkID corresponding to .mmappedChunks[pos]
// headChunkID returns the HeadChunkID referred to by the given position.
// * 0 <= pos < len(s.mmappedChunks) refer to s.mmappedChunks[pos]
// * pos == len(s.mmappedChunks) refers to s.headChunk
func (s *memSeries) headChunkID(pos int) chunks.HeadChunkID {
return chunks.HeadChunkID(pos) + s.firstChunkID
}
// oooHeadChunkID returns the HeadChunkID referred to by the given position.
// * 0 <= pos < len(s.oooMmappedChunks) refer to s.oooMmappedChunks[pos]
// * pos == len(s.oooMmappedChunks) refers to s.oooHeadChunk
func (s *memSeries) oooHeadChunkID(pos int) chunks.HeadChunkID {
return chunks.HeadChunkID(pos) + s.firstOOOChunkID
}
// LabelValueFor returns label value for the given label name in the series referred to by ID.
func (h *headIndexReader) LabelValueFor(id storage.SeriesRef, label string) (string, error) {
memSeries := h.head.series.getByID(chunks.HeadSeriesRef(id))
@ -258,8 +267,8 @@ func (h *headChunkReader) Close() error {
}
// Chunk returns the chunk for the reference number.
func (h *headChunkReader) Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error) {
sid, cid := chunks.HeadChunkRef(ref).Unpack()
func (h *headChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
sid, cid := chunks.HeadChunkRef(meta.Ref).Unpack()
s := h.head.series.getByID(sid)
// This means that the series has been garbage collected.
@ -330,6 +339,260 @@ func (s *memSeries) chunk(id chunks.HeadChunkID, chunkDiskMapper *chunks.ChunkDi
return mc, true, nil
}
// oooMergedChunk returns the requested chunk based on the given chunks.Meta
// reference from memory or by m-mapping it from the disk. The returned chunk
// might be a merge of all the overlapping chunks, if any, amongst all the
// chunks in the OOOHead.
// This function is not thread safe unless the caller holds a lock.
func (s *memSeries) oooMergedChunk(meta chunks.Meta, cdm *chunks.ChunkDiskMapper, mint, maxt int64) (chunk *mergedOOOChunks, err error) {
_, cid := chunks.HeadChunkRef(meta.Ref).Unpack()
// ix represents the index of chunk in the s.mmappedChunks slice. The chunk meta's are
// incremented by 1 when new chunk is created, hence (meta - firstChunkID) gives the slice index.
// The max index for the s.mmappedChunks slice can be len(s.mmappedChunks)-1, hence if the ix
// is len(s.mmappedChunks), it represents the next chunk, which is the head chunk.
ix := int(cid) - int(s.firstOOOChunkID)
if ix < 0 || ix > len(s.oooMmappedChunks) {
return nil, storage.ErrNotFound
}
if ix == len(s.oooMmappedChunks) {
if s.oooHeadChunk == nil {
return nil, errors.New("invalid ooo head chunk")
}
}
// We create a temporary slice of chunk metas to hold the information of all
// possible chunks that may overlap with the requested chunk.
tmpChks := make([]chunkMetaAndChunkDiskMapperRef, 0, len(s.oooMmappedChunks))
oooHeadRef := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.oooMmappedChunks))))
if s.oooHeadChunk != nil && s.oooHeadChunk.OverlapsClosedInterval(mint, maxt) {
// We only want to append the head chunk if this chunk existed when
// Series() was called. This brings consistency in case new data
// is added in between Series() and Chunk() calls.
if oooHeadRef == meta.OOOLastRef {
tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{
meta: chunks.Meta{
// Ignoring samples added before and after the last known min and max time for this chunk.
MinTime: meta.OOOLastMinTime,
MaxTime: meta.OOOLastMaxTime,
Ref: oooHeadRef,
},
})
}
}
for i, c := range s.oooMmappedChunks {
chunkRef := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(i)))
// We can skip chunks that came in later than the last known OOOLastRef.
if chunkRef > meta.OOOLastRef {
break
}
if chunkRef == meta.OOOLastRef {
tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{
meta: chunks.Meta{
MinTime: meta.OOOLastMinTime,
MaxTime: meta.OOOLastMaxTime,
Ref: chunkRef,
},
ref: c.ref,
origMinT: c.minTime,
origMaxT: c.maxTime,
})
} else if c.OverlapsClosedInterval(mint, maxt) {
tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{
meta: chunks.Meta{
MinTime: c.minTime,
MaxTime: c.maxTime,
Ref: chunkRef,
},
ref: c.ref,
})
}
}
// Next we want to sort all the collected chunks by min time so we can find
// those that overlap and stop when we know the rest don't.
sort.Sort(byMinTimeAndMinRef(tmpChks))
mc := &mergedOOOChunks{}
absoluteMax := int64(math.MinInt64)
for _, c := range tmpChks {
if c.meta.Ref != meta.Ref && (len(mc.chunks) == 0 || c.meta.MinTime > absoluteMax) {
continue
}
if c.meta.Ref == oooHeadRef {
var xor *chunkenc.XORChunk
// If head chunk min and max time match the meta OOO markers
// that means that the chunk has not expanded so we can append
// it as it is.
if s.oooHeadChunk.minTime == meta.OOOLastMinTime && s.oooHeadChunk.maxTime == meta.OOOLastMaxTime {
xor, err = s.oooHeadChunk.chunk.ToXOR() // TODO(jesus.vazquez) (This is an optimization idea that has no priority and might not be that useful) See if we could use a copy of the underlying slice. That would leave the more expensive ToXOR() function only for the usecase where Bytes() is called.
} else {
// We need to remove samples that are outside of the markers
xor, err = s.oooHeadChunk.chunk.ToXORBetweenTimestamps(meta.OOOLastMinTime, meta.OOOLastMaxTime)
}
if err != nil {
return nil, errors.Wrap(err, "failed to convert ooo head chunk to xor chunk")
}
c.meta.Chunk = xor
} else {
chk, err := cdm.Chunk(c.ref)
if err != nil {
if _, ok := err.(*chunks.CorruptionErr); ok {
return nil, errors.Wrap(err, "invalid ooo mmapped chunk")
}
return nil, err
}
if c.meta.Ref == meta.OOOLastRef &&
(c.origMinT != meta.OOOLastMinTime || c.origMaxT != meta.OOOLastMaxTime) {
// The head expanded and was memory mapped so now we need to
// wrap the chunk within a chunk that doesnt allows us to iterate
// through samples out of the OOOLastMinT and OOOLastMaxT
// markers.
c.meta.Chunk = boundedChunk{chk, meta.OOOLastMinTime, meta.OOOLastMaxTime}
} else {
c.meta.Chunk = chk
}
}
mc.chunks = append(mc.chunks, c.meta)
if c.meta.MaxTime > absoluteMax {
absoluteMax = c.meta.MaxTime
}
}
return mc, nil
}
var _ chunkenc.Chunk = &mergedOOOChunks{}
// mergedOOOChunks holds the list of overlapping chunks. This struct satisfies
// chunkenc.Chunk.
type mergedOOOChunks struct {
chunks []chunks.Meta
}
// Bytes is a very expensive method because its calling the iterator of all the
// chunks in the mergedOOOChunk and building a new chunk with the samples.
func (o mergedOOOChunks) Bytes() []byte {
xc := chunkenc.NewXORChunk()
app, err := xc.Appender()
if err != nil {
panic(err)
}
it := o.Iterator(nil)
for it.Next() {
t, v := it.At()
app.Append(t, v)
}
return xc.Bytes()
}
func (o mergedOOOChunks) Encoding() chunkenc.Encoding {
return chunkenc.EncXOR
}
func (o mergedOOOChunks) Appender() (chunkenc.Appender, error) {
return nil, errors.New("can't append to mergedOOOChunks")
}
func (o mergedOOOChunks) Iterator(iterator chunkenc.Iterator) chunkenc.Iterator {
iterators := make([]chunkenc.Iterator, 0, len(o.chunks))
for _, c := range o.chunks {
iterators = append(iterators, c.Chunk.Iterator(nil))
}
return storage.NewChainSampleIterator(iterators)
}
func (o mergedOOOChunks) NumSamples() int {
samples := 0
for _, c := range o.chunks {
samples += c.Chunk.NumSamples()
}
return samples
}
func (o mergedOOOChunks) Compact() {}
var _ chunkenc.Chunk = &boundedChunk{}
// boundedChunk is an implementation of chunkenc.Chunk that uses a
// boundedIterator that only iterates through samples which timestamps are
// >= minT and <= maxT
type boundedChunk struct {
chunkenc.Chunk
minT int64
maxT int64
}
func (b boundedChunk) Bytes() []byte {
xor := chunkenc.NewXORChunk()
a, _ := xor.Appender()
it := b.Iterator(nil)
for it.Next() {
t, v := it.At()
a.Append(t, v)
}
return xor.Bytes()
}
func (b boundedChunk) Iterator(iterator chunkenc.Iterator) chunkenc.Iterator {
it := b.Chunk.Iterator(iterator)
if it == nil {
panic("iterator shouldn't be nil")
}
return boundedIterator{it, b.minT, b.maxT}
}
var _ chunkenc.Iterator = &boundedIterator{}
// boundedIterator is an implementation of Iterator that only iterates through
// samples which timestamps are >= minT and <= maxT
type boundedIterator struct {
chunkenc.Iterator
minT int64
maxT int64
}
// Next the first time its called it will advance as many positions as necessary
// until its able to find a sample within the bounds minT and maxT.
// If there are samples within bounds it will advance one by one amongst them.
// If there are no samples within bounds it will return false.
func (b boundedIterator) Next() bool {
for b.Iterator.Next() {
t, _ := b.Iterator.At()
if t < b.minT {
continue
} else if t > b.maxT {
return false
}
return true
}
return false
}
func (b boundedIterator) Seek(t int64) bool {
if t < b.minT {
// We must seek at least up to b.minT if it is asked for something before that.
ok := b.Iterator.Seek(b.minT)
if !ok {
return false
}
t, _ := b.Iterator.At()
return t <= b.maxT
}
if t > b.maxT {
// We seek anyway so that the subsequent Next() calls will also return false.
b.Iterator.Seek(t)
return false
}
return b.Iterator.Seek(t)
}
// safeChunk makes sure that the chunk can be accessed without a race condition
type safeChunk struct {
chunkenc.Chunk
s *memSeries

178
tsdb/head_read_test.go Normal file
View file

@ -0,0 +1,178 @@
// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"fmt"
"testing"
"github.com/stretchr/testify/require"
"github.com/prometheus/prometheus/tsdb/chunkenc"
)
func TestBoundedChunk(t *testing.T) {
tests := []struct {
name string
inputChunk chunkenc.Chunk
inputMinT int64
inputMaxT int64
initialSeek int64
seekIsASuccess bool
expSamples []sample
}{
{
name: "if there are no samples it returns nothing",
inputChunk: newTestChunk(0),
expSamples: nil,
},
{
name: "bounds represent a single sample",
inputChunk: newTestChunk(10),
expSamples: []sample{
{0, 0},
},
},
{
name: "if there are bounds set only samples within them are returned",
inputChunk: newTestChunk(10),
inputMinT: 1,
inputMaxT: 8,
expSamples: []sample{
{1, 1},
{2, 2},
{3, 3},
{4, 4},
{5, 5},
{6, 6},
{7, 7},
{8, 8},
},
},
{
name: "if bounds set and only maxt is less than actual maxt",
inputChunk: newTestChunk(10),
inputMinT: 0,
inputMaxT: 5,
expSamples: []sample{
{0, 0},
{1, 1},
{2, 2},
{3, 3},
{4, 4},
{5, 5},
},
},
{
name: "if bounds set and only mint is more than actual mint",
inputChunk: newTestChunk(10),
inputMinT: 5,
inputMaxT: 9,
expSamples: []sample{
{5, 5},
{6, 6},
{7, 7},
{8, 8},
{9, 9},
},
},
{
name: "if there are bounds set with seek before mint",
inputChunk: newTestChunk(10),
inputMinT: 3,
inputMaxT: 7,
initialSeek: 1,
seekIsASuccess: true,
expSamples: []sample{
{3, 3},
{4, 4},
{5, 5},
{6, 6},
{7, 7},
},
},
{
name: "if there are bounds set with seek between mint and maxt",
inputChunk: newTestChunk(10),
inputMinT: 3,
inputMaxT: 7,
initialSeek: 5,
seekIsASuccess: true,
expSamples: []sample{
{5, 5},
{6, 6},
{7, 7},
},
},
{
name: "if there are bounds set with seek after maxt",
inputChunk: newTestChunk(10),
inputMinT: 3,
inputMaxT: 7,
initialSeek: 8,
seekIsASuccess: false,
},
}
for _, tc := range tests {
t.Run(fmt.Sprintf("name=%s", tc.name), func(t *testing.T) {
chunk := boundedChunk{tc.inputChunk, tc.inputMinT, tc.inputMaxT}
// Testing Bytes()
expChunk := chunkenc.NewXORChunk()
if tc.inputChunk.NumSamples() > 0 {
app, err := expChunk.Appender()
require.NoError(t, err)
for ts := tc.inputMinT; ts <= tc.inputMaxT; ts++ {
app.Append(ts, float64(ts))
}
}
require.Equal(t, expChunk.Bytes(), chunk.Bytes())
var samples []sample
it := chunk.Iterator(nil)
if tc.initialSeek != 0 {
// Testing Seek()
ok := it.Seek(tc.initialSeek)
require.Equal(t, tc.seekIsASuccess, ok)
if ok {
t, v := it.At()
samples = append(samples, sample{t, v})
}
}
// Testing Next()
for it.Next() {
t, v := it.At()
samples = append(samples, sample{t, v})
}
// it.Next() should keep returning false.
for i := 0; i < 10; i++ {
require.False(t, it.Next())
}
require.Equal(t, tc.expSamples, samples)
})
}
}
func newTestChunk(numSamples int) chunkenc.Chunk {
xor := chunkenc.NewXORChunk()
a, _ := xor.Appender()
for i := 0; i < numSamples; i++ {
a.Append(int64(i), float64(i))
}
return xor
}

View file

@ -49,7 +49,7 @@ import (
"github.com/prometheus/prometheus/tsdb/wal"
)
func newTestHead(t testing.TB, chunkRange int64, compressWAL bool) (*Head, *wal.WAL) {
func newTestHead(t testing.TB, chunkRange int64, compressWAL, oooEnabled bool) (*Head, *wal.WAL) {
dir := t.TempDir()
wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compressWAL)
require.NoError(t, err)
@ -59,18 +59,23 @@ func newTestHead(t testing.TB, chunkRange int64, compressWAL bool) (*Head, *wal.
opts.ChunkDirRoot = dir
opts.EnableExemplarStorage = true
opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars)
if oooEnabled {
opts.OutOfOrderTimeWindow.Store(10 * time.Minute.Milliseconds())
}
h, err := NewHead(nil, nil, wlog, opts, nil)
h, err := NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, h.chunkDiskMapper.IterateAllChunks(func(_ chunks.HeadSeriesRef, _ chunks.ChunkDiskMapperRef, _, _ int64, _ uint16) error { return nil }))
require.NoError(t, h.chunkDiskMapper.IterateAllChunks(func(_ chunks.HeadSeriesRef, _ chunks.ChunkDiskMapperRef, _, _ int64, _ uint16, _ chunkenc.Encoding) error {
return nil
}))
return h, wlog
}
func BenchmarkCreateSeries(b *testing.B) {
series := genSeries(b.N, 10, 0, 0)
h, _ := newTestHead(b, 10000, false)
h, _ := newTestHead(b, 10000, false, false)
defer func() {
require.NoError(b, h.Close())
}()
@ -224,7 +229,7 @@ func BenchmarkLoadWAL(b *testing.B) {
require.NoError(b, err)
for k := 0; k < c.batches*c.seriesPerBatch; k++ {
// Create one mmapped chunk per series, with one sample at the given time.
s := newMemSeries(labels.Labels{}, chunks.HeadSeriesRef(k)*101, c.mmappedChunkT, defaultIsolationDisabled)
s := newMemSeries(labels.Labels{}, chunks.HeadSeriesRef(k)*101, c.mmappedChunkT, 1, defaultIsolationDisabled)
s.append(c.mmappedChunkT, 42, 0, chunkDiskMapper)
s.mmapCurrentHeadChunk(chunkDiskMapper)
}
@ -255,7 +260,7 @@ func BenchmarkLoadWAL(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = w.Dir()
h, err := NewHead(nil, nil, w, opts, nil)
h, err := NewHead(nil, nil, w, nil, opts, nil)
require.NoError(b, err)
h.Init(0)
}
@ -271,7 +276,7 @@ func BenchmarkLoadWAL(b *testing.B) {
// While appending the samples to the head it concurrently queries them from multiple go routines and verifies that the
// returned results are correct.
func TestHead_HighConcurrencyReadAndWrite(t *testing.T) {
head, _ := newTestHead(t, DefaultBlockDuration, false)
head, _ := newTestHead(t, DefaultBlockDuration, false, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -487,7 +492,7 @@ func TestHead_ReadWAL(t *testing.T) {
},
}
head, w := newTestHead(t, 1000, compress)
head, w := newTestHead(t, 1000, compress, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -531,7 +536,7 @@ func TestHead_ReadWAL(t *testing.T) {
}
func TestHead_WALMultiRef(t *testing.T) {
head, w := newTestHead(t, 1000, false)
head, w := newTestHead(t, 1000, false, false)
require.NoError(t, head.Init(0))
@ -572,7 +577,7 @@ func TestHead_WALMultiRef(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = w.Dir()
head, err = NewHead(nil, nil, w, opts, nil)
head, err = NewHead(nil, nil, w, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, head.Init(0))
defer func() {
@ -591,7 +596,7 @@ func TestHead_WALMultiRef(t *testing.T) {
}
func TestHead_ActiveAppenders(t *testing.T) {
head, _ := newTestHead(t, 1000, false)
head, _ := newTestHead(t, 1000, false, false)
defer head.Close()
require.NoError(t, head.Init(0))
@ -624,14 +629,14 @@ func TestHead_ActiveAppenders(t *testing.T) {
}
func TestHead_UnknownWALRecord(t *testing.T) {
head, w := newTestHead(t, 1000, false)
head, w := newTestHead(t, 1000, false, false)
w.Log([]byte{255, 42})
require.NoError(t, head.Init(0))
require.NoError(t, head.Close())
}
func TestHead_Truncate(t *testing.T) {
h, _ := newTestHead(t, 1000, false)
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -733,7 +738,7 @@ func TestMemSeries_truncateChunks(t *testing.T) {
},
}
s := newMemSeries(labels.FromStrings("a", "b"), 1, 2000, defaultIsolationDisabled)
s := newMemSeries(labels.FromStrings("a", "b"), 1, 2000, 1, defaultIsolationDisabled)
for i := 0; i < 4000; i += 5 {
ok, _ := s.append(int64(i), float64(i), 0, chunkDiskMapper)
@ -752,7 +757,7 @@ func TestMemSeries_truncateChunks(t *testing.T) {
require.NotNil(t, chk)
require.NoError(t, err)
s.truncateChunksBefore(2000)
s.truncateChunksBefore(2000, 0)
require.Equal(t, int64(2000), s.mmappedChunks[0].minTime)
_, _, err = s.chunk(0, chunkDiskMapper, &memChunkPool)
@ -789,7 +794,7 @@ func TestHeadDeleteSeriesWithoutSamples(t *testing.T) {
{Ref: 50, T: 90, V: 1},
},
}
head, w := newTestHead(t, 1000, compress)
head, w := newTestHead(t, 1000, compress, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -857,7 +862,8 @@ func TestHeadDeleteSimple(t *testing.T) {
for _, compress := range []bool{false, true} {
t.Run(fmt.Sprintf("compress=%t", compress), func(t *testing.T) {
for _, c := range cases {
head, w := newTestHead(t, 1000, compress)
head, w := newTestHead(t, 1000, compress, false)
require.NoError(t, head.Init(0))
app := head.Appender(context.Background())
for _, smpl := range smplsAll {
@ -887,7 +893,7 @@ func TestHeadDeleteSimple(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = reloadedW.Dir()
reloadedHead, err := NewHead(nil, nil, reloadedW, opts, nil)
reloadedHead, err := NewHead(nil, nil, reloadedW, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, reloadedHead.Init(0))
@ -937,7 +943,7 @@ func TestHeadDeleteSimple(t *testing.T) {
}
func TestDeleteUntilCurMax(t *testing.T) {
hb, _ := newTestHead(t, 1000000, false)
hb, _ := newTestHead(t, 1000000, false, false)
defer func() {
require.NoError(t, hb.Close())
}()
@ -990,7 +996,7 @@ func TestDeletedSamplesAndSeriesStillInWALAfterCheckpoint(t *testing.T) {
numSamples := 10000
// Enough samples to cause a checkpoint.
hb, w := newTestHead(t, int64(numSamples)*10, false)
hb, w := newTestHead(t, int64(numSamples)*10, false, false)
for i := 0; i < numSamples; i++ {
app := hb.Appender(context.Background())
@ -1082,7 +1088,7 @@ func TestDelete_e2e(t *testing.T) {
seriesMap[labels.New(l...).String()] = []tsdbutil.Sample{}
}
hb, _ := newTestHead(t, 100000, false)
hb, _ := newTestHead(t, 100000, false, false)
defer func() {
require.NoError(t, hb.Close())
}()
@ -1271,7 +1277,7 @@ func TestMemSeries_append(t *testing.T) {
require.NoError(t, chunkDiskMapper.Close())
}()
s := newMemSeries(labels.Labels{}, 1, 500, defaultIsolationDisabled)
s := newMemSeries(labels.Labels{}, 1, 500, 1, defaultIsolationDisabled)
// Add first two samples at the very end of a chunk range and the next two
// on and after it.
@ -1325,7 +1331,7 @@ func TestMemSeries_append_atVariableRate(t *testing.T) {
require.NoError(t, chunkDiskMapper.Close())
})
s := newMemSeries(labels.Labels{}, 1, DefaultBlockDuration, defaultIsolationDisabled)
s := newMemSeries(labels.Labels{}, 1, DefaultBlockDuration, 0, defaultIsolationDisabled)
// At this slow rate, we will fill the chunk in two block durations.
slowRate := (DefaultBlockDuration * 2) / samplesPerChunk
@ -1361,7 +1367,7 @@ func TestMemSeries_append_atVariableRate(t *testing.T) {
func TestGCChunkAccess(t *testing.T) {
// Put a chunk, select it. GC it and then access it.
h, _ := newTestHead(t, 1000, false)
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1398,22 +1404,22 @@ func TestGCChunkAccess(t *testing.T) {
cr, err := h.chunksRange(0, 1500, nil)
require.NoError(t, err)
_, err = cr.Chunk(chunks[0].Ref)
_, err = cr.Chunk(chunks[0])
require.NoError(t, err)
_, err = cr.Chunk(chunks[1].Ref)
_, err = cr.Chunk(chunks[1])
require.NoError(t, err)
require.NoError(t, h.Truncate(1500)) // Remove a chunk.
_, err = cr.Chunk(chunks[0].Ref)
_, err = cr.Chunk(chunks[0])
require.Equal(t, storage.ErrNotFound, err)
_, err = cr.Chunk(chunks[1].Ref)
_, err = cr.Chunk(chunks[1])
require.NoError(t, err)
}
func TestGCSeriesAccess(t *testing.T) {
// Put a series, select it. GC it and then access it.
h, _ := newTestHead(t, 1000, false)
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1450,23 +1456,23 @@ func TestGCSeriesAccess(t *testing.T) {
cr, err := h.chunksRange(0, 2000, nil)
require.NoError(t, err)
_, err = cr.Chunk(chunks[0].Ref)
_, err = cr.Chunk(chunks[0])
require.NoError(t, err)
_, err = cr.Chunk(chunks[1].Ref)
_, err = cr.Chunk(chunks[1])
require.NoError(t, err)
require.NoError(t, h.Truncate(2000)) // Remove the series.
require.Equal(t, (*memSeries)(nil), h.series.getByID(1))
_, err = cr.Chunk(chunks[0].Ref)
_, err = cr.Chunk(chunks[0])
require.Equal(t, storage.ErrNotFound, err)
_, err = cr.Chunk(chunks[1].Ref)
_, err = cr.Chunk(chunks[1])
require.Equal(t, storage.ErrNotFound, err)
}
func TestUncommittedSamplesNotLostOnTruncate(t *testing.T) {
h, _ := newTestHead(t, 1000, false)
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1496,7 +1502,7 @@ func TestUncommittedSamplesNotLostOnTruncate(t *testing.T) {
}
func TestRemoveSeriesAfterRollbackAndTruncate(t *testing.T) {
h, _ := newTestHead(t, 1000, false)
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1529,7 +1535,7 @@ func TestRemoveSeriesAfterRollbackAndTruncate(t *testing.T) {
func TestHead_LogRollback(t *testing.T) {
for _, compress := range []bool{false, true} {
t.Run(fmt.Sprintf("compress=%t", compress), func(t *testing.T) {
h, w := newTestHead(t, 1000, compress)
h, w := newTestHead(t, 1000, compress, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1606,7 +1612,7 @@ func TestWalRepair_DecodingError(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1
opts.ChunkDirRoot = w.Dir()
h, err := NewHead(nil, nil, w, opts, nil)
h, err := NewHead(nil, nil, w, nil, opts, nil)
require.NoError(t, err)
require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal))
initErr := h.Init(math.MinInt64)
@ -1660,7 +1666,8 @@ func TestHeadReadWriterRepair(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = chunkRange
opts.ChunkDirRoot = dir
h, err := NewHead(nil, nil, w, opts, nil)
opts.ChunkWriteQueueSize = 1 // We need to set this option so that we use the async queue. Upstream prometheus uses the queue directly.
h, err := NewHead(nil, nil, w, nil, opts, nil)
require.NoError(t, err)
require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.mmapChunkCorruptionTotal))
require.NoError(t, h.Init(math.MinInt64))
@ -1715,7 +1722,7 @@ func TestHeadReadWriterRepair(t *testing.T) {
}
func TestNewWalSegmentOnTruncate(t *testing.T) {
h, wlog := newTestHead(t, 1000, false)
h, wlog := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1745,7 +1752,7 @@ func TestNewWalSegmentOnTruncate(t *testing.T) {
}
func TestAddDuplicateLabelName(t *testing.T) {
h, _ := newTestHead(t, 1000, false)
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1828,7 +1835,7 @@ func TestMemSeriesIsolation(t *testing.T) {
}
// Test isolation without restart of Head.
hb, _ := newTestHead(t, 1000, false)
hb, _ := newTestHead(t, 1000, false, false)
i := addSamples(hb)
testIsolation(hb, i)
@ -1890,7 +1897,7 @@ func TestMemSeriesIsolation(t *testing.T) {
require.NoError(t, hb.Close())
// Test isolation with restart of Head. This is to verify the num samples of chunks after m-map chunk replay.
hb, w := newTestHead(t, 1000, false)
hb, w := newTestHead(t, 1000, false, false)
i = addSamples(hb)
require.NoError(t, hb.Close())
@ -1899,7 +1906,7 @@ func TestMemSeriesIsolation(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = wlog.Dir()
hb, err = NewHead(nil, nil, wlog, opts, nil)
hb, err = NewHead(nil, nil, wlog, nil, opts, nil)
defer func() { require.NoError(t, hb.Close()) }()
require.NoError(t, err)
require.NoError(t, hb.Init(0))
@ -1943,7 +1950,7 @@ func TestIsolationRollback(t *testing.T) {
}
// Rollback after a failed append and test if the low watermark has progressed anyway.
hb, _ := newTestHead(t, 1000, false)
hb, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, hb.Close())
}()
@ -1974,7 +1981,7 @@ func TestIsolationLowWatermarkMonotonous(t *testing.T) {
t.Skip("skipping test since tsdb isolation is disabled")
}
hb, _ := newTestHead(t, 1000, false)
hb, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, hb.Close())
}()
@ -2011,7 +2018,7 @@ func TestIsolationAppendIDZeroIsNoop(t *testing.T) {
t.Skip("skipping test since tsdb isolation is disabled")
}
h, _ := newTestHead(t, 1000, false)
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -2036,7 +2043,7 @@ func TestIsolationWithoutAdd(t *testing.T) {
t.Skip("skipping test since tsdb isolation is disabled")
}
hb, _ := newTestHead(t, 1000, false)
hb, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, hb.Close())
}()
@ -2131,7 +2138,7 @@ func TestOutOfOrderSamplesMetric(t *testing.T) {
}
func testHeadSeriesChunkRace(t *testing.T) {
h, _ := newTestHead(t, 1000, false)
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -2166,7 +2173,7 @@ func testHeadSeriesChunkRace(t *testing.T) {
}
func TestHeadLabelNamesValuesWithMinMaxRange(t *testing.T) {
head, _ := newTestHead(t, 1000, false)
head, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -2226,7 +2233,7 @@ func TestHeadLabelNamesValuesWithMinMaxRange(t *testing.T) {
}
func TestHeadLabelValuesWithMatchers(t *testing.T) {
head, _ := newTestHead(t, 1000, false)
head, _ := newTestHead(t, 1000, false, false)
t.Cleanup(func() { require.NoError(t, head.Close()) })
app := head.Appender(context.Background())
@ -2285,7 +2292,7 @@ func TestHeadLabelValuesWithMatchers(t *testing.T) {
}
func TestHeadLabelNamesWithMatchers(t *testing.T) {
head, _ := newTestHead(t, 1000, false)
head, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -2353,7 +2360,7 @@ func TestHeadLabelNamesWithMatchers(t *testing.T) {
}
func TestErrReuseAppender(t *testing.T) {
head, _ := newTestHead(t, 1000, false)
head, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -2389,7 +2396,7 @@ func TestErrReuseAppender(t *testing.T) {
func TestHeadMintAfterTruncation(t *testing.T) {
chunkRange := int64(2000)
head, _ := newTestHead(t, chunkRange, false)
head, _ := newTestHead(t, chunkRange, false, false)
app := head.Appender(context.Background())
_, err := app.Append(0, labels.FromStrings("a", "b"), 100, 100)
@ -2423,7 +2430,7 @@ func TestHeadMintAfterTruncation(t *testing.T) {
func TestHeadExemplars(t *testing.T) {
chunkRange := int64(2000)
head, _ := newTestHead(t, chunkRange, false)
head, _ := newTestHead(t, chunkRange, false, false)
app := head.Appender(context.Background())
l := labels.FromStrings("traceId", "123")
@ -2445,7 +2452,7 @@ func TestHeadExemplars(t *testing.T) {
func BenchmarkHeadLabelValuesWithMatchers(b *testing.B) {
chunkRange := int64(2000)
head, _ := newTestHead(b, chunkRange, false)
head, _ := newTestHead(b, chunkRange, false, false)
b.Cleanup(func() { require.NoError(b, head.Close()) })
app := head.Appender(context.Background())
@ -2483,7 +2490,7 @@ func TestMemSafeIteratorSeekIntoBuffer(t *testing.T) {
require.NoError(t, chunkDiskMapper.Close())
}()
s := newMemSeries(labels.Labels{}, 1, 500, defaultIsolationDisabled)
s := newMemSeries(labels.Labels{}, 1, 500, 1, defaultIsolationDisabled)
for i := 0; i < 7; i++ {
ok, _ := s.append(int64(i), float64(i), 0, chunkDiskMapper)
@ -2754,7 +2761,7 @@ func TestWaitForPendingReadersInTimeRange(t *testing.T) {
}
func TestChunkSnapshot(t *testing.T) {
head, _ := newTestHead(t, 120*4, false)
head, _ := newTestHead(t, 120*4, false, false)
defer func() {
head.opts.EnableMemorySnapshotOnShutdown = false
require.NoError(t, head.Close())
@ -2833,7 +2840,7 @@ func TestChunkSnapshot(t *testing.T) {
openHeadAndCheckReplay := func() {
w, err := wal.NewSize(nil, nil, head.wal.Dir(), 32768, false)
require.NoError(t, err)
head, err = NewHead(nil, nil, w, head.opts, nil)
head, err = NewHead(nil, nil, w, nil, head.opts, nil)
require.NoError(t, err)
require.NoError(t, head.Init(math.MinInt64))
@ -2996,7 +3003,7 @@ func TestChunkSnapshot(t *testing.T) {
}
func TestSnapshotError(t *testing.T) {
head, _ := newTestHead(t, 120*4, false)
head, _ := newTestHead(t, 120*4, false, false)
defer func() {
head.opts.EnableMemorySnapshotOnShutdown = false
require.NoError(t, head.Close())
@ -3043,7 +3050,7 @@ func TestSnapshotError(t *testing.T) {
w, err := wal.NewSize(nil, nil, head.wal.Dir(), 32768, false)
require.NoError(t, err)
// Testing https://github.com/prometheus/prometheus/issues/9437 with the registry.
head, err = NewHead(prometheus.NewRegistry(), nil, w, head.opts, nil)
head, err = NewHead(prometheus.NewRegistry(), nil, w, nil, head.opts, nil)
require.NoError(t, err)
require.NoError(t, head.Init(math.MinInt64))
@ -3102,7 +3109,7 @@ func TestChunkSnapshotReplayBug(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkDirRoot = dir
opts.EnableMemorySnapshotOnShutdown = true
head, err := NewHead(nil, nil, wlog, opts, nil)
head, err := NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, head.Init(math.MinInt64))
defer func() {
@ -3136,7 +3143,7 @@ func TestChunkSnapshotTakenAfterIncompleteSnapshot(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkDirRoot = dir
opts.EnableMemorySnapshotOnShutdown = true
head, err := NewHead(nil, nil, wlog, opts, nil)
head, err := NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, head.Init(math.MinInt64))
@ -3159,6 +3166,251 @@ func TestChunkSnapshotTakenAfterIncompleteSnapshot(t *testing.T) {
require.Greater(t, offset, 0)
}
// TestOOOWalReplay checks the replay at a low level.
// TODO(codesome): Needs test for ooo WAL repair.
func TestOOOWalReplay(t *testing.T) {
dir := t.TempDir()
wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = dir
opts.OutOfOrderTimeWindow.Store(30 * time.Minute.Milliseconds())
h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
var expOOOSamples []sample
l := labels.FromStrings("foo", "bar")
appendSample := func(mins int64, isOOO bool) {
app := h.Appender(context.Background())
ts, v := mins*time.Minute.Milliseconds(), float64(mins)
_, err := app.Append(0, l, ts, v)
require.NoError(t, err)
require.NoError(t, app.Commit())
if isOOO {
expOOOSamples = append(expOOOSamples, sample{t: ts, v: v})
}
}
// In-order sample.
appendSample(60, false)
// Out of order samples.
appendSample(40, true)
appendSample(35, true)
appendSample(50, true)
appendSample(55, true)
appendSample(59, true)
appendSample(31, true)
// Check that Head's time ranges are set properly.
require.Equal(t, 60*time.Minute.Milliseconds(), h.MinTime())
require.Equal(t, 60*time.Minute.Milliseconds(), h.MaxTime())
require.Equal(t, 31*time.Minute.Milliseconds(), h.MinOOOTime())
require.Equal(t, 59*time.Minute.Milliseconds(), h.MaxOOOTime())
// Restart head.
require.NoError(t, h.Close())
wlog, err = wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err = wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
h, err = NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0)) // Replay happens here.
// Get the ooo samples from the Head.
ms, ok, err := h.getOrCreate(l.Hash(), l)
require.NoError(t, err)
require.False(t, ok)
require.NotNil(t, ms)
xor, err := ms.oooHeadChunk.chunk.ToXOR()
require.NoError(t, err)
it := xor.Iterator(nil)
actOOOSamples := make([]sample, 0, len(expOOOSamples))
for it.Next() {
ts, v := it.At()
actOOOSamples = append(actOOOSamples, sample{t: ts, v: v})
}
// OOO chunk will be sorted. Hence sort the expected samples.
sort.Slice(expOOOSamples, func(i, j int) bool {
return expOOOSamples[i].t < expOOOSamples[j].t
})
require.Equal(t, expOOOSamples, actOOOSamples)
require.NoError(t, h.Close())
}
// TestOOOMmapReplay checks the replay at a low level.
func TestOOOMmapReplay(t *testing.T) {
dir := t.TempDir()
wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = dir
opts.OutOfOrderCapMax.Store(30)
opts.OutOfOrderTimeWindow.Store(1000 * time.Minute.Milliseconds())
h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
l := labels.FromStrings("foo", "bar")
appendSample := func(mins int64) {
app := h.Appender(context.Background())
ts, v := mins*time.Minute.Milliseconds(), float64(mins)
_, err := app.Append(0, l, ts, v)
require.NoError(t, err)
require.NoError(t, app.Commit())
}
// In-order sample.
appendSample(200)
// Out of order samples. 92 samples to create 3 m-map chunks.
for mins := int64(100); mins <= 191; mins++ {
appendSample(mins)
}
ms, ok, err := h.getOrCreate(l.Hash(), l)
require.NoError(t, err)
require.False(t, ok)
require.NotNil(t, ms)
require.Len(t, ms.oooMmappedChunks, 3)
// Verify that we can access the chunks without error.
for _, m := range ms.oooMmappedChunks {
chk, err := h.chunkDiskMapper.Chunk(m.ref)
require.NoError(t, err)
require.Equal(t, int(m.numSamples), chk.NumSamples())
}
expMmapChunks := make([]*mmappedChunk, 3)
copy(expMmapChunks, ms.oooMmappedChunks)
// Restart head.
require.NoError(t, h.Close())
wlog, err = wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err = wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
h, err = NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0)) // Replay happens here.
// Get the mmap chunks from the Head.
ms, ok, err = h.getOrCreate(l.Hash(), l)
require.NoError(t, err)
require.False(t, ok)
require.NotNil(t, ms)
require.Len(t, ms.oooMmappedChunks, len(expMmapChunks))
// Verify that we can access the chunks without error.
for _, m := range ms.oooMmappedChunks {
chk, err := h.chunkDiskMapper.Chunk(m.ref)
require.NoError(t, err)
require.Equal(t, int(m.numSamples), chk.NumSamples())
}
actMmapChunks := make([]*mmappedChunk, len(expMmapChunks))
copy(actMmapChunks, ms.oooMmappedChunks)
require.Equal(t, expMmapChunks, actMmapChunks)
require.NoError(t, h.Close())
}
func TestHeadInit_DiscardChunksWithUnsupportedEncoding(t *testing.T) {
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
require.NoError(t, h.Init(0))
ctx := context.Background()
app := h.Appender(ctx)
seriesLabels := labels.FromStrings("a", "1")
var seriesRef storage.SeriesRef
var err error
for i := 0; i < 400; i++ {
seriesRef, err = app.Append(0, seriesLabels, int64(i), float64(i))
require.NoError(t, err)
}
require.NoError(t, app.Commit())
require.Greater(t, prom_testutil.ToFloat64(h.metrics.chunksCreated), 1.0)
uc := newUnsupportedChunk()
// Make this chunk not overlap with the previous and the next
h.chunkDiskMapper.WriteChunk(chunks.HeadSeriesRef(seriesRef), 500, 600, uc, func(err error) { require.NoError(t, err) })
app = h.Appender(ctx)
for i := 700; i < 1200; i++ {
_, err := app.Append(0, seriesLabels, int64(i), float64(i))
require.NoError(t, err)
}
require.NoError(t, app.Commit())
require.Greater(t, prom_testutil.ToFloat64(h.metrics.chunksCreated), 4.0)
series, created, err := h.getOrCreate(seriesLabels.Hash(), seriesLabels)
require.NoError(t, err)
require.False(t, created, "should already exist")
require.NotNil(t, series, "should return the series we created above")
expChunks := make([]*mmappedChunk, len(series.mmappedChunks))
copy(expChunks, series.mmappedChunks)
require.NoError(t, h.Close())
wlog, err := wal.NewSize(nil, nil, filepath.Join(h.opts.ChunkDirRoot, "wal"), 32768, false)
require.NoError(t, err)
h, err = NewHead(nil, nil, wlog, nil, h.opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
series, created, err = h.getOrCreate(seriesLabels.Hash(), seriesLabels)
require.NoError(t, err)
require.False(t, created, "should already exist")
require.NotNil(t, series, "should return the series we created above")
require.Equal(t, expChunks, series.mmappedChunks)
}
const (
UnsupportedMask = 0b10000000
EncUnsupportedXOR = chunkenc.EncXOR | UnsupportedMask
)
// unsupportedChunk holds a XORChunk and overrides the Encoding() method.
type unsupportedChunk struct {
*chunkenc.XORChunk
}
func newUnsupportedChunk() *unsupportedChunk {
return &unsupportedChunk{chunkenc.NewXORChunk()}
}
func (c *unsupportedChunk) Encoding() chunkenc.Encoding {
return EncUnsupportedXOR
}
// Tests https://github.com/prometheus/prometheus/issues/10277.
func TestMmapPanicAfterMmapReplayCorruption(t *testing.T) {
dir := t.TempDir()
@ -3171,7 +3423,7 @@ func TestMmapPanicAfterMmapReplayCorruption(t *testing.T) {
opts.EnableExemplarStorage = true
opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars)
h, err := NewHead(nil, nil, wlog, opts, nil)
h, err := NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
@ -3205,7 +3457,7 @@ func TestMmapPanicAfterMmapReplayCorruption(t *testing.T) {
require.NoError(t, err)
require.NoError(t, f.Close())
h, err = NewHead(nil, nil, wlog, opts, nil)
h, err = NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
@ -3230,7 +3482,7 @@ func TestReplayAfterMmapReplayError(t *testing.T) {
opts.EnableMemorySnapshotOnShutdown = true
opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars)
h, err = NewHead(nil, nil, wlog, opts, nil)
h, err = NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
}
@ -3292,3 +3544,131 @@ func TestReplayAfterMmapReplayError(t *testing.T) {
require.NoError(t, h.Close())
}
func TestOOOAppendWithNoSeries(t *testing.T) {
dir := t.TempDir()
wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
opts := DefaultHeadOptions()
opts.ChunkDirRoot = dir
opts.OutOfOrderCapMax.Store(30)
opts.OutOfOrderTimeWindow.Store(120 * time.Minute.Milliseconds())
h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
t.Cleanup(func() {
require.NoError(t, h.Close())
})
require.NoError(t, h.Init(0))
appendSample := func(lbls labels.Labels, ts int64) {
app := h.Appender(context.Background())
_, err := app.Append(0, lbls, ts*time.Minute.Milliseconds(), float64(ts))
require.NoError(t, err)
require.NoError(t, app.Commit())
}
verifyOOOSamples := func(lbls labels.Labels, expSamples int) {
ms, created, err := h.getOrCreate(lbls.Hash(), lbls)
require.NoError(t, err)
require.False(t, created)
require.NotNil(t, ms)
require.Nil(t, ms.headChunk)
require.NotNil(t, ms.oooHeadChunk)
require.Equal(t, expSamples, ms.oooHeadChunk.chunk.NumSamples())
}
verifyInOrderSamples := func(lbls labels.Labels, expSamples int) {
ms, created, err := h.getOrCreate(lbls.Hash(), lbls)
require.NoError(t, err)
require.False(t, created)
require.NotNil(t, ms)
require.Nil(t, ms.oooHeadChunk)
require.NotNil(t, ms.headChunk)
require.Equal(t, expSamples, ms.headChunk.chunk.NumSamples())
}
newLabels := func(idx int) labels.Labels { return labels.FromStrings("foo", fmt.Sprintf("%d", idx)) }
s1 := newLabels(1)
appendSample(s1, 300) // At 300m.
verifyInOrderSamples(s1, 1)
// At 239m, the sample cannot be appended to in-order chunk since it is
// beyond the minValidTime. So it should go in OOO chunk.
// Series does not exist for s2 yet.
s2 := newLabels(2)
appendSample(s2, 239) // OOO sample.
verifyOOOSamples(s2, 1)
// Similar for 180m.
s3 := newLabels(3)
appendSample(s3, 180) // OOO sample.
verifyOOOSamples(s3, 1)
// Now 179m is too old.
s4 := newLabels(4)
app := h.Appender(context.Background())
_, err = app.Append(0, s4, 179*time.Minute.Milliseconds(), float64(179))
require.Equal(t, storage.ErrTooOldSample, err)
require.NoError(t, app.Rollback())
verifyOOOSamples(s3, 1)
// Samples still go into in-order chunk for samples within
// appendable minValidTime.
s5 := newLabels(5)
appendSample(s5, 240)
verifyInOrderSamples(s5, 1)
}
func TestHeadMinOOOTimeUpdate(t *testing.T) {
dir := t.TempDir()
wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
opts := DefaultHeadOptions()
opts.ChunkDirRoot = dir
opts.OutOfOrderTimeWindow.Store(10 * time.Minute.Milliseconds())
h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
t.Cleanup(func() {
require.NoError(t, h.Close())
})
require.NoError(t, h.Init(0))
appendSample := func(ts int64) {
lbls := labels.FromStrings("foo", "bar")
app := h.Appender(context.Background())
_, err := app.Append(0, lbls, ts*time.Minute.Milliseconds(), float64(ts))
require.NoError(t, err)
require.NoError(t, app.Commit())
}
appendSample(300) // In-order sample.
require.Equal(t, int64(math.MaxInt64), h.MinOOOTime())
appendSample(295) // OOO sample.
require.Equal(t, 295*time.Minute.Milliseconds(), h.MinOOOTime())
// Allowed window for OOO is >=290, which is before the earliest ooo sample 295, so it gets set to the lower value.
require.NoError(t, h.truncateOOO(0, 1))
require.Equal(t, 290*time.Minute.Milliseconds(), h.MinOOOTime())
appendSample(310) // In-order sample.
appendSample(305) // OOO sample.
require.Equal(t, 290*time.Minute.Milliseconds(), h.MinOOOTime())
// Now the OOO sample 295 was not gc'ed yet. And allowed window for OOO is now >=300.
// So the lowest among them, 295, is set as minOOOTime.
require.NoError(t, h.truncateOOO(0, 2))
require.Equal(t, 295*time.Minute.Milliseconds(), h.MinOOOTime())
}

View file

@ -42,7 +42,7 @@ import (
"github.com/prometheus/prometheus/tsdb/wal"
)
func (h *Head) loadWAL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, mmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (err error) {
func (h *Head) loadWAL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, mmappedChunks, oooMmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (err error) {
// Track number of samples that referenced a series we don't know about
// for error reporting.
var unknownRefs atomic.Uint64
@ -107,7 +107,7 @@ func (h *Head) loadWAL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.H
processors[i].setup()
go func(wp *walSubsetProcessor) {
unknown, overlapping := wp.processWALSamples(h, mmappedChunks)
unknown, overlapping := wp.processWALSamples(h, mmappedChunks, oooMmappedChunks)
unknownRefs.Add(unknown)
mmapOverlappingChunks.Add(overlapping)
wg.Done()
@ -343,7 +343,7 @@ Outer:
}
// resetSeriesWithMMappedChunks is only used during the WAL replay.
func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc []*mmappedChunk, walSeriesRef chunks.HeadSeriesRef) (overlapped bool) {
func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc, oooMmc []*mmappedChunk, walSeriesRef chunks.HeadSeriesRef) (overlapped bool) {
if mSeries.ref != walSeriesRef {
// Checking if the new m-mapped chunks overlap with the already existing ones.
if len(mSeries.mmappedChunks) > 0 && len(mmc) > 0 {
@ -368,10 +368,11 @@ func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc []*mmappedCh
}
}
h.metrics.chunksCreated.Add(float64(len(mmc)))
h.metrics.chunksCreated.Add(float64(len(mmc) + len(oooMmc)))
h.metrics.chunksRemoved.Add(float64(len(mSeries.mmappedChunks)))
h.metrics.chunks.Add(float64(len(mmc) - len(mSeries.mmappedChunks)))
h.metrics.chunks.Add(float64(len(mmc) + len(oooMmc) - len(mSeries.mmappedChunks)))
mSeries.mmappedChunks = mmc
mSeries.oooMmappedChunks = oooMmc
// Cache the last mmapped chunk time, so we can skip calling append() for samples it will reject.
if len(mmc) == 0 {
mSeries.mmMaxTime = math.MinInt64
@ -379,6 +380,19 @@ func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc []*mmappedCh
mSeries.mmMaxTime = mmc[len(mmc)-1].maxTime
h.updateMinMaxTime(mmc[0].minTime, mSeries.mmMaxTime)
}
if len(oooMmc) != 0 {
// Mint and maxt can be in any chunk, they are not sorted.
mint, maxt := int64(math.MaxInt64), int64(math.MinInt64)
for _, ch := range oooMmc {
if ch.minTime < mint {
mint = ch.minTime
}
if ch.maxTime > maxt {
maxt = ch.maxTime
}
}
h.updateMinOOOMaxOOOTime(mint, maxt)
}
// Any samples replayed till now would already be compacted. Resetting the head chunk.
mSeries.nextAt = 0
@ -421,7 +435,7 @@ func (wp *walSubsetProcessor) reuseBuf() []record.RefSample {
// processWALSamples adds the samples it receives to the head and passes
// the buffer received to an output channel for reuse.
func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (unknownRefs, mmapOverlappingChunks uint64) {
func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks, oooMmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (unknownRefs, mmapOverlappingChunks uint64) {
defer close(wp.output)
mint, maxt := int64(math.MaxInt64), int64(math.MinInt64)
@ -429,7 +443,8 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks map[chunk
for in := range wp.input {
if in.existingSeries != nil {
mmc := mmappedChunks[in.walSeriesRef]
if h.resetSeriesWithMMappedChunks(in.existingSeries, mmc, in.walSeriesRef) {
oooMmc := oooMmappedChunks[in.walSeriesRef]
if h.resetSeriesWithMMappedChunks(in.existingSeries, mmc, oooMmc, in.walSeriesRef) {
mmapOverlappingChunks++
}
continue
@ -465,6 +480,292 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks map[chunk
return unknownRefs, mmapOverlappingChunks
}
func (h *Head) loadWBL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, lastMmapRef chunks.ChunkDiskMapperRef) (err error) {
// Track number of samples, m-map markers, that referenced a series we don't know about
// for error reporting.
var unknownRefs, mmapMarkerUnknownRefs atomic.Uint64
lastSeq, lastOff := lastMmapRef.Unpack()
// Start workers that each process samples for a partition of the series ID space.
var (
wg sync.WaitGroup
n = runtime.GOMAXPROCS(0)
processors = make([]wblSubsetProcessor, n)
dec record.Decoder
shards = make([][]record.RefSample, n)
decodedCh = make(chan interface{}, 10)
decodeErr error
samplesPool = sync.Pool{
New: func() interface{} {
return []record.RefSample{}
},
}
markersPool = sync.Pool{
New: func() interface{} {
return []record.RefMmapMarker{}
},
}
)
defer func() {
// For CorruptionErr ensure to terminate all workers before exiting.
// We also wrap it to identify OOO WBL corruption.
_, ok := err.(*wal.CorruptionErr)
if ok {
err = &errLoadWbl{err: err}
for i := 0; i < n; i++ {
processors[i].closeAndDrain()
}
wg.Wait()
}
}()
wg.Add(n)
for i := 0; i < n; i++ {
processors[i].setup()
go func(wp *wblSubsetProcessor) {
unknown := wp.processWBLSamples(h)
unknownRefs.Add(unknown)
wg.Done()
}(&processors[i])
}
go func() {
defer close(decodedCh)
for r.Next() {
rec := r.Record()
switch dec.Type(rec) {
case record.Samples:
samples := samplesPool.Get().([]record.RefSample)[:0]
samples, err = dec.Samples(rec, samples)
if err != nil {
decodeErr = &wal.CorruptionErr{
Err: errors.Wrap(err, "decode samples"),
Segment: r.Segment(),
Offset: r.Offset(),
}
return
}
decodedCh <- samples
case record.MmapMarkers:
markers := markersPool.Get().([]record.RefMmapMarker)[:0]
markers, err = dec.MmapMarkers(rec, markers)
if err != nil {
decodeErr = &wal.CorruptionErr{
Err: errors.Wrap(err, "decode mmap markers"),
Segment: r.Segment(),
Offset: r.Offset(),
}
return
}
decodedCh <- markers
default:
// Noop.
}
}
}()
// The records are always replayed from the oldest to the newest.
for d := range decodedCh {
switch v := d.(type) {
case []record.RefSample:
samples := v
// We split up the samples into parts of 5000 samples or less.
// With O(300 * #cores) in-flight sample batches, large scrapes could otherwise
// cause thousands of very large in flight buffers occupying large amounts
// of unused memory.
for len(samples) > 0 {
m := 5000
if len(samples) < m {
m = len(samples)
}
for i := 0; i < n; i++ {
shards[i] = processors[i].reuseBuf()
}
for _, sam := range samples[:m] {
if r, ok := multiRef[sam.Ref]; ok {
sam.Ref = r
}
mod := uint64(sam.Ref) % uint64(n)
shards[mod] = append(shards[mod], sam)
}
for i := 0; i < n; i++ {
processors[i].input <- shards[i]
}
samples = samples[m:]
}
//nolint:staticcheck // Ignore SA6002 relax staticcheck verification.
samplesPool.Put(d)
case []record.RefMmapMarker:
markers := v
for _, rm := range markers {
seq, off := rm.MmapRef.Unpack()
if seq > lastSeq || (seq == lastSeq && off > lastOff) {
// This m-map chunk from markers was not present during
// the load of mmapped chunks that happened in the head
// initialization.
continue
}
if r, ok := multiRef[rm.Ref]; ok {
rm.Ref = r
}
ms := h.series.getByID(rm.Ref)
if ms == nil {
mmapMarkerUnknownRefs.Inc()
continue
}
idx := uint64(ms.ref) % uint64(n)
// It is possible that some old sample is being processed in processWALSamples that
// could cause race below. So we wait for the goroutine to empty input the buffer and finish
// processing all old samples after emptying the buffer.
processors[idx].waitUntilIdle()
// Lock the subset so we can modify the series object
processors[idx].mx.Lock()
// All samples till now have been m-mapped. Hence clear out the headChunk.
// In case some samples slipped through and went into m-map chunks because of changed
// chunk size parameters, we are not taking care of that here.
// TODO(codesome): see if there is a way to avoid duplicate m-map chunks if
// the size of ooo chunk was reduced between restart.
ms.oooHeadChunk = nil
processors[idx].mx.Unlock()
}
default:
panic(fmt.Errorf("unexpected decodedCh type: %T", d))
}
}
if decodeErr != nil {
return decodeErr
}
// Signal termination to each worker and wait for it to close its output channel.
for i := 0; i < n; i++ {
processors[i].closeAndDrain()
}
wg.Wait()
if r.Err() != nil {
return errors.Wrap(r.Err(), "read records")
}
if unknownRefs.Load() > 0 || mmapMarkerUnknownRefs.Load() > 0 {
level.Warn(h.logger).Log("msg", "Unknown series references for ooo WAL replay", "samples", unknownRefs.Load(), "mmap_markers", mmapMarkerUnknownRefs.Load())
}
return nil
}
type errLoadWbl struct {
err error
}
func (e errLoadWbl) Error() string {
return e.err.Error()
}
// To support errors.Cause().
func (e errLoadWbl) Cause() error {
return e.err
}
// To support errors.Unwrap().
func (e errLoadWbl) Unwrap() error {
return e.err
}
// isErrLoadOOOWal returns a boolean if the error is errLoadWbl.
func isErrLoadOOOWal(err error) bool {
_, ok := err.(*errLoadWbl)
return ok
}
type wblSubsetProcessor struct {
mx sync.Mutex // Take this lock while modifying series in the subset.
input chan []record.RefSample
output chan []record.RefSample
}
func (wp *wblSubsetProcessor) setup() {
wp.output = make(chan []record.RefSample, 300)
wp.input = make(chan []record.RefSample, 300)
}
func (wp *wblSubsetProcessor) closeAndDrain() {
close(wp.input)
for range wp.output {
}
}
// If there is a buffer in the output chan, return it for reuse, otherwise return nil.
func (wp *wblSubsetProcessor) reuseBuf() []record.RefSample {
select {
case buf := <-wp.output:
return buf[:0]
default:
}
return nil
}
// processWBLSamples adds the samples it receives to the head and passes
// the buffer received to an output channel for reuse.
// Samples before the minValidTime timestamp are discarded.
func (wp *wblSubsetProcessor) processWBLSamples(h *Head) (unknownRefs uint64) {
defer close(wp.output)
// We don't check for minValidTime for ooo samples.
mint, maxt := int64(math.MaxInt64), int64(math.MinInt64)
for samples := range wp.input {
wp.mx.Lock()
for _, s := range samples {
ms := h.series.getByID(s.Ref)
if ms == nil {
unknownRefs++
continue
}
ok, chunkCreated, _ := ms.insert(s.T, s.V, h.chunkDiskMapper)
if chunkCreated {
h.metrics.chunksCreated.Inc()
h.metrics.chunks.Inc()
}
if ok {
if s.T < mint {
mint = s.T
}
if s.T > maxt {
maxt = s.T
}
}
}
wp.mx.Unlock()
wp.output <- samples
}
h.updateMinOOOMaxOOOTime(mint, maxt)
return unknownRefs
}
func (wp *wblSubsetProcessor) waitUntilIdle() {
select {
case <-wp.output: // Allow output side to drain to avoid deadlock.
default:
}
wp.input <- []record.RefSample{}
for len(wp.input) != 0 {
time.Sleep(10 * time.Microsecond)
select {
case <-wp.output: // Allow output side to drain to avoid deadlock.
default:
}
}
}
const (
chunkSnapshotRecordTypeSeries uint8 = 1
chunkSnapshotRecordTypeTombstones uint8 = 2

159
tsdb/ooo_head.go Normal file
View file

@ -0,0 +1,159 @@
// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"fmt"
"sort"
"github.com/prometheus/prometheus/tsdb/chunkenc"
"github.com/prometheus/prometheus/tsdb/tombstones"
)
// OOOChunk maintains samples in time-ascending order.
// Inserts for timestamps already seen, are dropped.
// Samples are stored uncompressed to allow easy sorting.
// Perhaps we can be more efficient later.
type OOOChunk struct {
samples []sample
}
func NewOOOChunk() *OOOChunk {
return &OOOChunk{samples: make([]sample, 0, 4)}
}
// Insert inserts the sample such that order is maintained.
// Returns false if insert was not possible due to the same timestamp already existing.
func (o *OOOChunk) Insert(t int64, v float64) bool {
// Find index of sample we should replace.
i := sort.Search(len(o.samples), func(i int) bool { return o.samples[i].t >= t })
if i >= len(o.samples) {
// none found. append it at the end
o.samples = append(o.samples, sample{t, v})
return true
}
if o.samples[i].t == t {
return false
}
// Expand length by 1 to make room. use a zero sample, we will overwrite it anyway.
o.samples = append(o.samples, sample{})
copy(o.samples[i+1:], o.samples[i:])
o.samples[i] = sample{t, v}
return true
}
func (o *OOOChunk) NumSamples() int {
return len(o.samples)
}
func (o *OOOChunk) ToXOR() (*chunkenc.XORChunk, error) {
x := chunkenc.NewXORChunk()
app, err := x.Appender()
if err != nil {
return nil, err
}
for _, s := range o.samples {
app.Append(s.t, s.v)
}
return x, nil
}
func (o *OOOChunk) ToXORBetweenTimestamps(mint, maxt int64) (*chunkenc.XORChunk, error) {
x := chunkenc.NewXORChunk()
app, err := x.Appender()
if err != nil {
return nil, err
}
for _, s := range o.samples {
if s.t < mint {
continue
}
if s.t > maxt {
break
}
app.Append(s.t, s.v)
}
return x, nil
}
var _ BlockReader = &OOORangeHead{}
// OOORangeHead allows querying Head out of order samples via BlockReader
// interface implementation.
type OOORangeHead struct {
head *Head
// mint and maxt are tracked because when a query is handled we only want
// the timerange of the query and having preexisting pointers to the first
// and last timestamp help with that.
mint, maxt int64
}
func NewOOORangeHead(head *Head, mint, maxt int64) *OOORangeHead {
return &OOORangeHead{
head: head,
mint: mint,
maxt: maxt,
}
}
func (oh *OOORangeHead) Index() (IndexReader, error) {
return NewOOOHeadIndexReader(oh.head, oh.mint, oh.maxt), nil
}
func (oh *OOORangeHead) Chunks() (ChunkReader, error) {
return NewOOOHeadChunkReader(oh.head, oh.mint, oh.maxt), nil
}
func (oh *OOORangeHead) Tombstones() (tombstones.Reader, error) {
// As stated in the design doc https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing
// Tombstones are not supported for out of order metrics.
return tombstones.NewMemTombstones(), nil
}
func (oh *OOORangeHead) Meta() BlockMeta {
var id [16]byte
copy(id[:], "____ooo_head____")
return BlockMeta{
MinTime: oh.mint,
MaxTime: oh.maxt,
ULID: id,
Stats: BlockStats{
NumSeries: oh.head.NumSeries(),
},
}
}
// Size returns the size taken by the Head block.
func (oh *OOORangeHead) Size() int64 {
return oh.head.Size()
}
// String returns an human readable representation of the out of order range
// head. It's important to keep this function in order to avoid the struct dump
// when the head is stringified in errors or logs.
func (oh *OOORangeHead) String() string {
return fmt.Sprintf("ooo range head (mint: %d, maxt: %d)", oh.MinTime(), oh.MaxTime())
}
func (oh *OOORangeHead) MinTime() int64 {
return oh.mint
}
func (oh *OOORangeHead) MaxTime() int64 {
return oh.maxt
}

433
tsdb/ooo_head_read.go Normal file
View file

@ -0,0 +1,433 @@
// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"errors"
"math"
"sort"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/tsdb/chunkenc"
"github.com/prometheus/prometheus/tsdb/chunks"
"github.com/prometheus/prometheus/tsdb/index"
"github.com/prometheus/prometheus/tsdb/tombstones"
)
var _ IndexReader = &OOOHeadIndexReader{}
// OOOHeadIndexReader implements IndexReader so ooo samples in the head can be
// accessed.
// It also has a reference to headIndexReader so we can leverage on its
// IndexReader implementation for all the methods that remain the same. We
// decided to do this to avoid code duplication.
// The only methods that change are the ones about getting Series and Postings.
type OOOHeadIndexReader struct {
*headIndexReader // A reference to the headIndexReader so we can reuse as many interface implementation as possible.
}
func NewOOOHeadIndexReader(head *Head, mint, maxt int64) *OOOHeadIndexReader {
hr := &headIndexReader{
head: head,
mint: mint,
maxt: maxt,
}
return &OOOHeadIndexReader{hr}
}
func (oh *OOOHeadIndexReader) Series(ref storage.SeriesRef, lbls *labels.Labels, chks *[]chunks.Meta) error {
return oh.series(ref, lbls, chks, 0)
}
// The passed lastMmapRef tells upto what max m-map chunk that we can consider.
// If it is 0, it means all chunks need to be considered.
// If it is non-0, then the oooHeadChunk must not be considered.
func (oh *OOOHeadIndexReader) series(ref storage.SeriesRef, lbls *labels.Labels, chks *[]chunks.Meta, lastMmapRef chunks.ChunkDiskMapperRef) error {
s := oh.head.series.getByID(chunks.HeadSeriesRef(ref))
if s == nil {
oh.head.metrics.seriesNotFound.Inc()
return storage.ErrNotFound
}
*lbls = append((*lbls)[:0], s.lset...)
if chks == nil {
return nil
}
s.Lock()
defer s.Unlock()
*chks = (*chks)[:0]
tmpChks := make([]chunks.Meta, 0, len(s.oooMmappedChunks))
// We define these markers to track the last chunk reference while we
// fill the chunk meta.
// These markers are useful to give consistent responses to repeated queries
// even if new chunks that might be overlapping or not are added afterwards.
// Also, lastMinT and lastMaxT are initialized to the max int as a sentinel
// value to know they are unset.
var lastChunkRef chunks.ChunkRef
lastMinT, lastMaxT := int64(math.MaxInt64), int64(math.MaxInt64)
addChunk := func(minT, maxT int64, ref chunks.ChunkRef) {
// the first time we get called is for the last included chunk.
// set the markers accordingly
if lastMinT == int64(math.MaxInt64) {
lastChunkRef = ref
lastMinT = minT
lastMaxT = maxT
}
tmpChks = append(tmpChks, chunks.Meta{
MinTime: minT,
MaxTime: maxT,
Ref: ref,
OOOLastRef: lastChunkRef,
OOOLastMinTime: lastMinT,
OOOLastMaxTime: lastMaxT,
})
}
// Collect all chunks that overlap the query range, in order from most recent to most old,
// so we can set the correct markers.
if s.oooHeadChunk != nil {
c := s.oooHeadChunk
if c.OverlapsClosedInterval(oh.mint, oh.maxt) && lastMmapRef == 0 {
ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.oooMmappedChunks))))
addChunk(c.minTime, c.maxTime, ref)
}
}
for i := len(s.oooMmappedChunks) - 1; i >= 0; i-- {
c := s.oooMmappedChunks[i]
if c.OverlapsClosedInterval(oh.mint, oh.maxt) && (lastMmapRef == 0 || lastMmapRef.GreaterThanOrEqualTo(c.ref)) {
ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(i)))
addChunk(c.minTime, c.maxTime, ref)
}
}
// There is nothing to do if we did not collect any chunk
if len(tmpChks) == 0 {
return nil
}
// Next we want to sort all the collected chunks by min time so we can find
// those that overlap.
sort.Sort(metaByMinTimeAndMinRef(tmpChks))
// Next we want to iterate the sorted collected chunks and only return the
// chunks Meta the first chunk that overlaps with others.
// Example chunks of a series: 5:(100, 200) 6:(500, 600) 7:(150, 250) 8:(550, 650)
// In the example 5 overlaps with 7 and 6 overlaps with 8 so we only want to
// to return chunk Metas for chunk 5 and chunk 6
*chks = append(*chks, tmpChks[0])
maxTime := tmpChks[0].MaxTime // tracks the maxTime of the previous "to be merged chunk"
for _, c := range tmpChks[1:] {
if c.MinTime > maxTime {
*chks = append(*chks, c)
maxTime = c.MaxTime
} else if c.MaxTime > maxTime {
maxTime = c.MaxTime
(*chks)[len(*chks)-1].MaxTime = c.MaxTime
}
}
return nil
}
// LabelValues needs to be overridden from the headIndexReader implementation due
// to the check that happens at the beginning where we make sure that the query
// interval overlaps with the head minooot and maxooot.
func (oh *OOOHeadIndexReader) LabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
if oh.maxt < oh.head.MinOOOTime() || oh.mint > oh.head.MaxOOOTime() {
return []string{}, nil
}
if len(matchers) == 0 {
return oh.head.postings.LabelValues(name), nil
}
return labelValuesWithMatchers(oh, name, matchers...)
}
type chunkMetaAndChunkDiskMapperRef struct {
meta chunks.Meta
ref chunks.ChunkDiskMapperRef
origMinT int64
origMaxT int64
}
type byMinTimeAndMinRef []chunkMetaAndChunkDiskMapperRef
func (b byMinTimeAndMinRef) Len() int { return len(b) }
func (b byMinTimeAndMinRef) Less(i, j int) bool {
if b[i].meta.MinTime == b[j].meta.MinTime {
return b[i].meta.Ref < b[j].meta.Ref
}
return b[i].meta.MinTime < b[j].meta.MinTime
}
func (b byMinTimeAndMinRef) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
type metaByMinTimeAndMinRef []chunks.Meta
func (b metaByMinTimeAndMinRef) Len() int { return len(b) }
func (b metaByMinTimeAndMinRef) Less(i, j int) bool {
if b[i].MinTime == b[j].MinTime {
return b[i].Ref < b[j].Ref
}
return b[i].MinTime < b[j].MinTime
}
func (b metaByMinTimeAndMinRef) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
func (oh *OOOHeadIndexReader) Postings(name string, values ...string) (index.Postings, error) {
switch len(values) {
case 0:
return index.EmptyPostings(), nil
case 1:
return oh.head.postings.Get(name, values[0]), nil // TODO(ganesh) Also call GetOOOPostings
default:
// TODO(ganesh) We want to only return postings for out of order series.
res := make([]index.Postings, 0, len(values))
for _, value := range values {
res = append(res, oh.head.postings.Get(name, value)) // TODO(ganesh) Also call GetOOOPostings
}
return index.Merge(res...), nil
}
}
type OOOHeadChunkReader struct {
head *Head
mint, maxt int64
}
func NewOOOHeadChunkReader(head *Head, mint, maxt int64) *OOOHeadChunkReader {
return &OOOHeadChunkReader{
head: head,
mint: mint,
maxt: maxt,
}
}
func (cr OOOHeadChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
sid, _ := chunks.HeadChunkRef(meta.Ref).Unpack()
s := cr.head.series.getByID(sid)
// This means that the series has been garbage collected.
if s == nil {
return nil, storage.ErrNotFound
}
s.Lock()
c, err := s.oooMergedChunk(meta, cr.head.chunkDiskMapper, cr.mint, cr.maxt)
s.Unlock()
if err != nil {
return nil, err
}
// This means that the query range did not overlap with the requested chunk.
if len(c.chunks) == 0 {
return nil, storage.ErrNotFound
}
return c, nil
}
func (cr OOOHeadChunkReader) Close() error {
return nil
}
type OOOCompactionHead struct {
oooIR *OOOHeadIndexReader
lastMmapRef chunks.ChunkDiskMapperRef
lastWBLFile int
postings []storage.SeriesRef
chunkRange int64
mint, maxt int64 // Among all the compactable chunks.
}
// NewOOOCompactionHead does the following:
// 1. M-maps all the in-memory ooo chunks.
// 2. Compute the expected block ranges while iterating through all ooo series and store it.
// 3. Store the list of postings having ooo series.
// 4. Cuts a new WBL file for the OOO WBL.
// All the above together have a bit of CPU and memory overhead, and can have a bit of impact
// on the sample append latency. So call NewOOOCompactionHead only right before compaction.
func NewOOOCompactionHead(head *Head) (*OOOCompactionHead, error) {
newWBLFile, err := head.wbl.NextSegmentSync()
if err != nil {
return nil, err
}
ch := &OOOCompactionHead{
chunkRange: head.chunkRange.Load(),
mint: math.MaxInt64,
maxt: math.MinInt64,
lastWBLFile: newWBLFile,
}
ch.oooIR = NewOOOHeadIndexReader(head, math.MinInt64, math.MaxInt64)
n, v := index.AllPostingsKey()
// TODO: verify this gets only ooo samples.
p, err := ch.oooIR.Postings(n, v)
if err != nil {
return nil, err
}
p = ch.oooIR.SortedPostings(p)
var lastSeq, lastOff int
for p.Next() {
seriesRef := p.At()
ms := head.series.getByID(chunks.HeadSeriesRef(seriesRef))
if ms == nil {
continue
}
// M-map the in-memory chunk and keep track of the last one.
// Also build the block ranges -> series map.
// TODO: consider having a lock specifically for ooo data.
ms.Lock()
mmapRef := ms.mmapCurrentOOOHeadChunk(head.chunkDiskMapper)
if mmapRef == 0 && len(ms.oooMmappedChunks) > 0 {
// Nothing was m-mapped. So take the mmapRef from the existing slice if it exists.
mmapRef = ms.oooMmappedChunks[len(ms.oooMmappedChunks)-1].ref
}
seq, off := mmapRef.Unpack()
if seq > lastSeq || (seq == lastSeq && off > lastOff) {
ch.lastMmapRef, lastSeq, lastOff = mmapRef, seq, off
}
if len(ms.oooMmappedChunks) > 0 {
ch.postings = append(ch.postings, seriesRef)
for _, c := range ms.oooMmappedChunks {
if c.minTime < ch.mint {
ch.mint = c.minTime
}
if c.maxTime > ch.maxt {
ch.maxt = c.maxTime
}
}
}
ms.Unlock()
}
return ch, nil
}
func (ch *OOOCompactionHead) Index() (IndexReader, error) {
return NewOOOCompactionHeadIndexReader(ch), nil
}
func (ch *OOOCompactionHead) Chunks() (ChunkReader, error) {
return NewOOOHeadChunkReader(ch.oooIR.head, ch.oooIR.mint, ch.oooIR.maxt), nil
}
func (ch *OOOCompactionHead) Tombstones() (tombstones.Reader, error) {
return tombstones.NewMemTombstones(), nil
}
func (ch *OOOCompactionHead) Meta() BlockMeta {
var id [16]byte
copy(id[:], "copy(id[:], \"ooo_compact_head\")")
return BlockMeta{
MinTime: ch.mint,
MaxTime: ch.maxt,
ULID: id,
Stats: BlockStats{
NumSeries: uint64(len(ch.postings)),
},
}
}
// CloneForTimeRange clones the OOOCompactionHead such that the IndexReader and ChunkReader
// obtained from this only looks at the m-map chunks within the given time ranges while not looking
// beyond the ch.lastMmapRef.
// Only the method of BlockReader interface are valid for the cloned OOOCompactionHead.
func (ch *OOOCompactionHead) CloneForTimeRange(mint, maxt int64) *OOOCompactionHead {
return &OOOCompactionHead{
oooIR: NewOOOHeadIndexReader(ch.oooIR.head, mint, maxt),
lastMmapRef: ch.lastMmapRef,
postings: ch.postings,
chunkRange: ch.chunkRange,
mint: ch.mint,
maxt: ch.maxt,
}
}
func (ch *OOOCompactionHead) Size() int64 { return 0 }
func (ch *OOOCompactionHead) MinTime() int64 { return ch.mint }
func (ch *OOOCompactionHead) MaxTime() int64 { return ch.maxt }
func (ch *OOOCompactionHead) ChunkRange() int64 { return ch.chunkRange }
func (ch *OOOCompactionHead) LastMmapRef() chunks.ChunkDiskMapperRef { return ch.lastMmapRef }
func (ch *OOOCompactionHead) LastWBLFile() int { return ch.lastWBLFile }
type OOOCompactionHeadIndexReader struct {
ch *OOOCompactionHead
}
func NewOOOCompactionHeadIndexReader(ch *OOOCompactionHead) IndexReader {
return &OOOCompactionHeadIndexReader{ch: ch}
}
func (ir *OOOCompactionHeadIndexReader) Symbols() index.StringIter {
return ir.ch.oooIR.Symbols()
}
func (ir *OOOCompactionHeadIndexReader) Postings(name string, values ...string) (index.Postings, error) {
n, v := index.AllPostingsKey()
if name != n || len(values) != 1 || values[0] != v {
return nil, errors.New("only AllPostingsKey is supported")
}
return index.NewListPostings(ir.ch.postings), nil
}
func (ir *OOOCompactionHeadIndexReader) SortedPostings(p index.Postings) index.Postings {
// This will already be sorted from the Postings() call above.
return p
}
func (ir *OOOCompactionHeadIndexReader) Series(ref storage.SeriesRef, lset *labels.Labels, chks *[]chunks.Meta) error {
return ir.ch.oooIR.series(ref, lset, chks, ir.ch.lastMmapRef)
}
func (ir *OOOCompactionHeadIndexReader) SortedLabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
return nil, errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) LabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
return nil, errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) PostingsForMatchers(concurrent bool, ms ...*labels.Matcher) (index.Postings, error) {
return nil, errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) LabelNames(matchers ...*labels.Matcher) ([]string, error) {
return nil, errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) LabelValueFor(id storage.SeriesRef, label string) (string, error) {
return "", errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) LabelNamesFor(ids ...storage.SeriesRef) ([]string, error) {
return nil, errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) Close() error {
return ir.ch.oooIR.Close()
}

1207
tsdb/ooo_head_read_test.go Normal file

File diff suppressed because it is too large Load diff

93
tsdb/ooo_head_test.go Normal file
View file

@ -0,0 +1,93 @@
// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"testing"
"github.com/stretchr/testify/require"
)
const testMaxSize int = 32
// Formulas chosen to make testing easy:
func valEven(pos int) int { return pos*2 + 2 } // s[0]=2, s[1]=4, s[2]=6, ..., s[31]=64 - Predictable pre-existing values
func valOdd(pos int) int { return pos*2 + 1 } // s[0]=1, s[1]=3, s[2]=5, ..., s[31]=63 - New values will interject at chosen position because they sort before the pre-existing vals.
func samplify(v int) sample { return sample{int64(v), float64(v)} }
func makeEvenSampleSlice(n int) []sample {
s := make([]sample, n)
for i := 0; i < n; i++ {
s[i] = samplify(valEven(i))
}
return s
}
// TestOOOInsert tests the following cases:
// - Number of pre-existing samples anywhere from 0 to testMaxSize-1.
// - Insert new sample before first pre-existing samples, after the last, and anywhere in between.
// - With a chunk initial capacity of testMaxSize/8 and testMaxSize, which lets us test non-full and full chunks, and chunks that need to expand themselves.
// Note: In all samples used, t always equals v in numeric value. when we talk about 'value' we just refer to a value that will be used for both sample.t and sample.v.
func TestOOOInsert(t *testing.T) {
for numPreExisting := 0; numPreExisting <= testMaxSize; numPreExisting++ {
// For example, if we have numPreExisting 2, then:
// chunk.samples indexes filled 0 1
// chunk.samples with these values 2 4 // valEven
// we want to test inserting at index 0 1 2 // insertPos=0..numPreExisting
// we can do this by using values 1, 3 5 // valOdd(insertPos)
for insertPos := 0; insertPos <= numPreExisting; insertPos++ {
chunk := NewOOOChunk()
chunk.samples = makeEvenSampleSlice(numPreExisting)
newSample := samplify(valOdd(insertPos))
chunk.Insert(newSample.t, newSample.v)
var expSamples []sample
// Our expected new samples slice, will be first the original samples.
for i := 0; i < insertPos; i++ {
expSamples = append(expSamples, samplify(valEven(i)))
}
// Then the new sample.
expSamples = append(expSamples, newSample)
// Followed by any original samples that were pushed back by the new one.
for i := insertPos; i < numPreExisting; i++ {
expSamples = append(expSamples, samplify(valEven(i)))
}
require.Equal(t, expSamples, chunk.samples, "numPreExisting %d, insertPos %d", numPreExisting, insertPos)
}
}
}
// TestOOOInsertDuplicate tests the correct behavior when inserting a sample that is a duplicate of any
// pre-existing samples, with between 1 and testMaxSize pre-existing samples and
// with a chunk initial capacity of testMaxSize/8 and testMaxSize, which lets us test non-full and full chunks, and chunks that need to expand themselves.
func TestOOOInsertDuplicate(t *testing.T) {
for num := 1; num <= testMaxSize; num++ {
for dupPos := 0; dupPos < num; dupPos++ {
chunk := NewOOOChunk()
chunk.samples = makeEvenSampleSlice(num)
dupSample := chunk.samples[dupPos]
dupSample.v = 0.123
ok := chunk.Insert(dupSample.t, dupSample.v)
expSamples := makeEvenSampleSlice(num) // We expect no change.
require.False(t, ok)
require.Equal(t, expSamples, chunk.samples, "num %d, dupPos %d", num, dupPos)
}
}
}

View file

@ -569,7 +569,7 @@ func (p *populateWithDelGenericSeriesIterator) next() bool {
p.i++
p.currChkMeta = p.chks[p.i]
p.currChkMeta.Chunk, p.err = p.chunks.Chunk(p.currChkMeta.Ref)
p.currChkMeta.Chunk, p.err = p.chunks.Chunk(p.currChkMeta)
if p.err != nil {
p.err = errors.Wrapf(p.err, "cannot populate chunk %d", p.currChkMeta.Ref)
return false
@ -898,7 +898,7 @@ func newNopChunkReader() ChunkReader {
}
}
func (cr nopChunkReader) Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error) {
func (cr nopChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
return cr.emptyChunk, nil
}

View file

@ -34,7 +34,7 @@ func BenchmarkQuerier(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
h, err := NewHead(nil, nil, nil, opts, nil)
h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
defer func() {
require.NoError(b, h.Close())
@ -180,7 +180,7 @@ func BenchmarkQuerierSelect(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
h, err := NewHead(nil, nil, nil, opts, nil)
h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
defer h.Close()
app := h.Appender(context.Background())

View file

@ -458,7 +458,7 @@ func TestBlockQuerier_AgainstHeadWithOpenChunks(t *testing.T) {
t.Run("", func(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 2 * time.Hour.Milliseconds()
h, err := NewHead(nil, nil, nil, opts, nil)
h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(t, err)
defer h.Close()
@ -627,10 +627,10 @@ func createFakeReaderAndNotPopulatedChunks(s ...[]tsdbutil.Sample) (*fakeChunksR
return f, chks
}
func (r *fakeChunksReader) Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error) {
chk, ok := r.chks[ref]
func (r *fakeChunksReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
chk, ok := r.chks[meta.Ref]
if !ok {
return nil, errors.Errorf("chunk not found at ref %v", ref)
return nil, errors.Errorf("chunk not found at ref %v", meta.Ref)
}
return chk, nil
}
@ -1016,8 +1016,8 @@ func BenchmarkMergedSeriesSet(b *testing.B) {
type mockChunkReader map[chunks.ChunkRef]chunkenc.Chunk
func (cr mockChunkReader) Chunk(id chunks.ChunkRef) (chunkenc.Chunk, error) {
chk, ok := cr[id]
func (cr mockChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
chk, ok := cr[meta.Ref]
if ok {
return chk, nil
}
@ -1643,7 +1643,7 @@ func TestPostingsForMatchers(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
h, err := NewHead(nil, nil, nil, opts, nil)
h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(t, err)
defer func() {
require.NoError(t, h.Close())
@ -1944,13 +1944,17 @@ func BenchmarkQueries(b *testing.B) {
},
}
queryTypes := make(map[string]storage.Querier)
type qt struct {
typ string
querier storage.Querier
}
var queryTypes []qt // We use a slice instead of map to keep the order of test cases consistent.
defer func() {
for _, q := range queryTypes {
// Can't run a check for error here as some of these will fail as
// queryTypes is using the same slice for the different block queriers
// and would have been closed in the previous iteration.
q.Close()
q.querier.Close()
}
}()
@ -1991,21 +1995,38 @@ func BenchmarkQueries(b *testing.B) {
qs = append(qs, q)
}
queryTypes["_1-Block"] = storage.NewMergeQuerier(qs[:1], nil, storage.ChainedSeriesMerge)
queryTypes["_3-Blocks"] = storage.NewMergeQuerier(qs[0:3], nil, storage.ChainedSeriesMerge)
queryTypes["_10-Blocks"] = storage.NewMergeQuerier(qs, nil, storage.ChainedSeriesMerge)
queryTypes = append(queryTypes, qt{"_1-Block", storage.NewMergeQuerier(qs[:1], nil, storage.ChainedSeriesMerge)})
queryTypes = append(queryTypes, qt{"_3-Blocks", storage.NewMergeQuerier(qs[0:3], nil, storage.ChainedSeriesMerge)})
queryTypes = append(queryTypes, qt{"_10-Blocks", storage.NewMergeQuerier(qs, nil, storage.ChainedSeriesMerge)})
chunkDir := b.TempDir()
head := createHead(b, nil, series, chunkDir)
qHead, err := NewBlockQuerier(head, 1, nSamples)
qHead, err := NewBlockQuerier(NewRangeHead(head, 1, nSamples), 1, nSamples)
require.NoError(b, err)
queryTypes["_Head"] = qHead
queryTypes = append(queryTypes, qt{"_Head", qHead})
for qtype, querier := range queryTypes {
b.Run(title+qtype+"_nSeries:"+strconv.Itoa(nSeries)+"_nSamples:"+strconv.Itoa(int(nSamples)), func(b *testing.B) {
for _, oooPercentage := range []int{1, 3, 5, 10} {
chunkDir := b.TempDir()
totalOOOSamples := oooPercentage * int(nSamples) / 100
oooSampleFrequency := int(nSamples) / totalOOOSamples
head := createHeadWithOOOSamples(b, nil, series, chunkDir, oooSampleFrequency)
qHead, err := NewBlockQuerier(NewRangeHead(head, 1, nSamples), 1, nSamples)
require.NoError(b, err)
qOOOHead, err := NewBlockQuerier(NewOOORangeHead(head, 1, nSamples), 1, nSamples)
require.NoError(b, err)
queryTypes = append(queryTypes, qt{
fmt.Sprintf("_Head_oooPercent:%d", oooPercentage),
storage.NewMergeQuerier([]storage.Querier{qHead, qOOOHead}, nil, storage.ChainedSeriesMerge),
})
}
for _, q := range queryTypes {
b.Run(title+q.typ+"_nSeries:"+strconv.Itoa(nSeries)+"_nSamples:"+strconv.Itoa(int(nSamples)), func(b *testing.B) {
expExpansions, err := strconv.Atoi(string(title[len(title)-1]))
require.NoError(b, err)
benchQuery(b, expExpansions, querier, selectors)
benchQuery(b, expExpansions, q.querier, selectors)
})
}
require.NoError(b, head.Close())
@ -2025,6 +2046,7 @@ func benchQuery(b *testing.B, expExpansions int, q storage.Querier, selectors la
s.Labels()
it := s.Iterator()
for it.Next() {
_, _ = it.At()
}
actualExpansions++
}

View file

@ -43,6 +43,8 @@ const (
Tombstones Type = 3
// Exemplars is used to match WAL records of type Exemplars.
Exemplars Type = 4
// MmapMarkers is used to match OOO WBL records of type MmapMarkers.
MmapMarkers Type = 5
// Metadata is used to match WAL records of type Metadata.
Metadata Type = 6
)
@ -57,6 +59,8 @@ func (rt Type) String() string {
return "exemplars"
case Tombstones:
return "tombstones"
case MmapMarkers:
return "mmapmarkers"
case Metadata:
return "metadata"
default:
@ -157,6 +161,12 @@ type RefExemplar struct {
Labels labels.Labels
}
// RefMmapMarker marks that the all the samples of the given series until now have been m-mapped to disk.
type RefMmapMarker struct {
Ref chunks.HeadSeriesRef
MmapRef chunks.ChunkDiskMapperRef
}
// Decoder decodes series, sample, metadata and tombstone records.
// The zero value is ready to use.
type Decoder struct{}
@ -168,7 +178,7 @@ func (d *Decoder) Type(rec []byte) Type {
return Unknown
}
switch t := Type(rec[0]); t {
case Series, Samples, Tombstones, Exemplars, Metadata:
case Series, Samples, Tombstones, Exemplars, MmapMarkers, Metadata:
return t
}
return Unknown
@ -354,6 +364,34 @@ func (d *Decoder) ExemplarsFromBuffer(dec *encoding.Decbuf, exemplars []RefExemp
return exemplars, nil
}
func (d *Decoder) MmapMarkers(rec []byte, markers []RefMmapMarker) ([]RefMmapMarker, error) {
dec := encoding.Decbuf{B: rec}
t := Type(dec.Byte())
if t != MmapMarkers {
return nil, errors.New("invalid record type")
}
if dec.Len() == 0 {
return markers, nil
}
for len(dec.B) > 0 && dec.Err() == nil {
ref := chunks.HeadSeriesRef(dec.Be64())
mmapRef := chunks.ChunkDiskMapperRef(dec.Be64())
markers = append(markers, RefMmapMarker{
Ref: ref,
MmapRef: mmapRef,
})
}
if dec.Err() != nil {
return nil, errors.Wrapf(dec.Err(), "decode error after %d mmap markers", len(markers))
}
if len(dec.B) > 0 {
return nil, errors.Errorf("unexpected %d bytes left in entry", len(dec.B))
}
return markers, nil
}
// Encoder encodes series, sample, and tombstones records.
// The zero value is ready to use.
type Encoder struct{}
@ -467,3 +505,15 @@ func (e *Encoder) EncodeExemplarsIntoBuffer(exemplars []RefExemplar, buf *encodi
EncodeLabels(buf, ex.Labels)
}
}
func (e *Encoder) MmapMarkers(markers []RefMmapMarker, b []byte) []byte {
buf := encoding.Encbuf{B: b}
buf.PutByte(byte(MmapMarkers))
for _, s := range markers {
buf.PutBE64(uint64(s.Ref))
buf.PutBE64(uint64(s.MmapRef))
}
return buf.Get()
}

View file

@ -40,6 +40,7 @@ const (
DefaultSegmentSize = 128 * 1024 * 1024 // 128 MB
pageSize = 32 * 1024 // 32KB
recordHeaderSize = 7
WblDirName = "wbl"
)
// The table gets initialized with sync.Once but may still cause a race
@ -204,32 +205,32 @@ func newWALMetrics(r prometheus.Registerer) *walMetrics {
m := &walMetrics{}
m.fsyncDuration = prometheus.NewSummary(prometheus.SummaryOpts{
Name: "prometheus_tsdb_wal_fsync_duration_seconds",
Name: "fsync_duration_seconds",
Help: "Duration of WAL fsync.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
})
m.pageFlushes = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_page_flushes_total",
Name: "page_flushes_total",
Help: "Total number of page flushes.",
})
m.pageCompletions = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_completed_pages_total",
Name: "completed_pages_total",
Help: "Total number of completed pages.",
})
m.truncateFail = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_truncations_failed_total",
Name: "truncations_failed_total",
Help: "Total number of WAL truncations that failed.",
})
m.truncateTotal = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_truncations_total",
Name: "truncations_total",
Help: "Total number of WAL truncations attempted.",
})
m.currentSegment = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "prometheus_tsdb_wal_segment_current",
Name: "segment_current",
Help: "WAL segment index that TSDB is currently writing to.",
})
m.writesFailed = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_writes_failed_total",
Name: "writes_failed_total",
Help: "Total number of WAL writes that failed.",
})
@ -274,7 +275,11 @@ func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSi
stopc: make(chan chan struct{}),
compress: compress,
}
w.metrics = newWALMetrics(reg)
prefix := "prometheus_tsdb_wal_"
if filepath.Base(dir) == WblDirName {
prefix = "prometheus_tsdb_out_of_order_wal_"
}
w.metrics = newWALMetrics(prometheus.WrapRegistererWithPrefix(prefix, reg))
_, last, err := Segments(w.Dir())
if err != nil {
@ -459,36 +464,46 @@ func SegmentName(dir string, i int) string {
return filepath.Join(dir, fmt.Sprintf("%08d", i))
}
// NextSegment creates the next segment and closes the previous one.
func (w *WAL) NextSegment() error {
// NextSegment creates the next segment and closes the previous one asynchronously.
// It returns the file number of the new file.
func (w *WAL) NextSegment() (int, error) {
w.mtx.Lock()
defer w.mtx.Unlock()
return w.nextSegment()
return w.nextSegment(true)
}
// NextSegmentSync creates the next segment and closes the previous one in sync.
// It returns the file number of the new file.
func (w *WAL) NextSegmentSync() (int, error) {
w.mtx.Lock()
defer w.mtx.Unlock()
return w.nextSegment(false)
}
// nextSegment creates the next segment and closes the previous one.
func (w *WAL) nextSegment() error {
// It returns the file number of the new file.
func (w *WAL) nextSegment(async bool) (int, error) {
if w.closed {
return errors.New("wal is closed")
return 0, errors.New("wal is closed")
}
// Only flush the current page if it actually holds data.
if w.page.alloc > 0 {
if err := w.flushPage(true); err != nil {
return err
return 0, err
}
}
next, err := CreateSegment(w.Dir(), w.segment.Index()+1)
if err != nil {
return errors.Wrap(err, "create new segment file")
return 0, errors.Wrap(err, "create new segment file")
}
prev := w.segment
if err := w.setSegment(next); err != nil {
return err
return 0, err
}
// Don't block further writes by fsyncing the last segment.
w.actorc <- func() {
f := func() {
if err := w.fsync(prev); err != nil {
level.Error(w.logger).Log("msg", "sync previous segment", "err", err)
}
@ -496,7 +511,12 @@ func (w *WAL) nextSegment() error {
level.Error(w.logger).Log("msg", "close previous segment", "err", err)
}
}
return nil
if async {
w.actorc <- f
} else {
f()
}
return next.Index(), nil
}
func (w *WAL) setSegment(segment *Segment) error {
@ -638,7 +658,7 @@ func (w *WAL) log(rec []byte, final bool) error {
left += (pageSize - recordHeaderSize) * (w.pagesPerSegment() - w.donePages - 1) // Free pages in the active segment.
if len(rec) > left {
if err := w.nextSegment(); err != nil {
if _, err := w.nextSegment(true); err != nil {
return err
}
}
@ -745,6 +765,13 @@ func (w *WAL) fsync(f *Segment) error {
return err
}
// Sync forces a file sync on the current wal segment. This function is meant
// to be used only on tests due to different behaviour on Operating Systems
// like windows and linux
func (w *WAL) Sync() error {
return w.fsync(w.segment)
}
// Close flushes all writes and closes active segment.
func (w *WAL) Close() (err error) {
w.mtx.Lock()

View file

@ -364,14 +364,16 @@ func TestReadCheckpoint(t *testing.T) {
err := os.Mkdir(wdir, 0o777)
require.NoError(t, err)
os.Create(SegmentName(wdir, 30))
f, err := os.Create(SegmentName(wdir, 30))
require.NoError(t, err)
require.NoError(t, f.Close())
enc := record.Encoder{}
w, err := NewSize(nil, nil, wdir, 128*pageSize, compress)
require.NoError(t, err)
defer func() {
t.Cleanup(func() {
require.NoError(t, w.Close())
}()
})
// Write to the initial segment then checkpoint.
for i := 0; i < seriesCount; i++ {
@ -396,8 +398,11 @@ func TestReadCheckpoint(t *testing.T) {
require.NoError(t, w.Log(sample))
}
}
Checkpoint(log.NewNopLogger(), w, 30, 31, func(x chunks.HeadSeriesRef) bool { return true }, 0)
w.Truncate(32)
_, err = w.NextSegmentSync()
require.NoError(t, err)
_, err = Checkpoint(log.NewNopLogger(), w, 30, 31, func(x chunks.HeadSeriesRef) bool { return true }, 0)
require.NoError(t, err)
require.NoError(t, w.Truncate(32))
// Start read after checkpoint, no more data written.
_, _, err = Segments(w.Dir())

View file

@ -2314,7 +2314,7 @@ func (f *fakeDB) Stats(statsByLabelName string) (_ *tsdb.Stats, retErr error) {
}()
opts := tsdb.DefaultHeadOptions()
opts.ChunkRange = 1000
h, _ := tsdb.NewHead(nil, nil, nil, opts, nil)
h, _ := tsdb.NewHead(nil, nil, nil, nil, opts, nil)
return h.Stats(statsByLabelName), nil
}