Add out-of-order sample support to the TSDB (#11075)

* Introduce out-of-order TSDB support

This implementation is based on this design doc:
https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing

This commit adds support for accepting out-of-order ("OOO") samples into the
TSDB up to a configurable time allowance. If OOO is enabled, overlapping
queries are automatically enabled.
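For illustration, a minimal sketch (not part of this change) of enabling the
window when embedding the TSDB, using the Options fields added in this diff;
dir and logger are placeholders:

    opts := tsdb.DefaultOptions()
    // The window is in milliseconds, like every other TSDB timestamp.
    opts.OutOfOrderTimeWindow = (30 * time.Minute).Milliseconds()
    db, err := tsdb.Open(dir, logger, nil, opts, nil)
    if err != nil {
        log.Fatal(err)
    }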

Most of the additions have been borrowed from
https://github.com/grafana/mimir-prometheus/
Here is the list of the original commits cherry-picked
from mimir-prometheus into this branch:
- 4b2198d7ec
- 2836e5513f
- 00b379c3a5
- ff0dc75758
- a632c73352
- c6f3d4ab33
- 5e8406a1d4
- abde1e0ba1
- e70e769889
- df59320886

Co-authored-by: Jesus Vazquez <jesus.vazquez@grafana.com>
Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com>
Co-authored-by: Dieter Plaetinck <dieter@grafana.com>
Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* gofumpt files

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Add license header to missing files

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix OOO tests due to existing chunk disk mapper implementation

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix truncate int overflow

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Add Sync method to the WAL and update tests

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* remove useless sync

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Update minOOOTime after truncating Head

* Update minOOOTime after truncating Head

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Add a unit test

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Load OutOfOrderTimeWindow only once per appender

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix OOO Head LabelValues and PostingsForMatchers

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix replay of OOO mmap chunks

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Remove unnecessary err check

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Prevent panic with ApplyConfig

Signed-off-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com>
Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Run OOO compaction after restart if there is OOO data from WBL

Signed-off-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com>
Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Apply Bartek's suggestions

Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Refactor OOO compaction

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Address comments and TODOs

- Added a comment explaining why we need the allow overlapping
  compaction toggle
- Clarified TSDBConfig OutOfOrderTimeWindow doc
- Added an owner to all the TODOs in the code

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Run go format

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix remaining review comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix tests

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Change wbl reference when truncating ooo in TestHeadMinOOOTimeUpdate

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>

* Fix TestWBLAndMmapReplay test failure on windows

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Address most of the feedback

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Refactor the block meta for out of order

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix windows error

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com>
Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
Signed-off-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com>
Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com>
Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com>
Co-authored-by: Dieter Plaetinck <dieter@grafana.com>
Co-authored-by: Oleg Zaytsev <mail@olegzaytsev.com>
Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
Committed by GitHub on behalf of Jesus Vazquez, 2022-09-20 19:05:50 +02:00
commit c1b669bf9b (parent af6167df58)
38 changed files with 6655 additions and 380 deletions


@@ -463,6 +463,9 @@ func main() {
 		}
 		cfg.tsdb.MaxExemplars = int64(cfgFile.StorageConfig.ExemplarsConfig.MaxExemplars)
 	}
+	if cfgFile.StorageConfig.TSDBConfig != nil {
+		cfg.tsdb.OutOfOrderTimeWindow = cfgFile.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
+	}

 	// Now that the validity of the config is established, set the config
 	// success metrics accordingly, although the config isn't really loaded
@@ -1537,6 +1540,7 @@ type tsdbOptions struct {
 	StripeSize                     int
 	MinBlockDuration               model.Duration
 	MaxBlockDuration               model.Duration
+	OutOfOrderTimeWindow           int64
 	EnableExemplarStorage          bool
 	MaxExemplars                   int64
 	EnableMemorySnapshotOnShutdown bool
@@ -1549,7 +1553,8 @@ func (opts tsdbOptions) ToTSDBOptions() tsdb.Options {
 		RetentionDuration:              int64(time.Duration(opts.RetentionDuration) / time.Millisecond),
 		MaxBytes:                       int64(opts.MaxBytes),
 		NoLockfile:                     opts.NoLockfile,
-		AllowOverlappingBlocks:         opts.AllowOverlappingBlocks,
+		AllowOverlappingCompaction:     opts.AllowOverlappingBlocks,
+		AllowOverlappingQueries:        opts.AllowOverlappingBlocks,
 		WALCompression:                 opts.WALCompression,
 		HeadChunksWriteQueueSize:       opts.HeadChunksWriteQueueSize,
 		StripeSize:                     opts.StripeSize,
@@ -1558,6 +1563,7 @@ func (opts tsdbOptions) ToTSDBOptions() tsdb.Options {
 		EnableExemplarStorage:          opts.EnableExemplarStorage,
 		MaxExemplars:                   opts.MaxExemplars,
 		EnableMemorySnapshotOnShutdown: opts.EnableMemorySnapshotOnShutdown,
+		OutOfOrderTimeWindow:           opts.OutOfOrderTimeWindow,
 	}
 }


@@ -117,7 +117,8 @@ func TestBackfillRuleIntegration(t *testing.T) {
 			}

 			opts := tsdb.DefaultOptions()
-			opts.AllowOverlappingBlocks = true
+			opts.AllowOverlappingQueries = true
+			opts.AllowOverlappingCompaction = true
 			db, err := tsdb.Open(tmpDir, nil, nil, opts, nil)
 			require.NoError(t, err)
@@ -245,7 +246,8 @@ func TestBackfillLabels(t *testing.T) {
 	}

 	opts := tsdb.DefaultOptions()
-	opts.AllowOverlappingBlocks = true
+	opts.AllowOverlappingQueries = true
+	opts.AllowOverlappingCompaction = true
 	db, err := tsdb.Open(tmpDir, nil, nil, opts, nil)
 	require.NoError(t, err)


@@ -597,7 +597,7 @@ func analyzeCompaction(block tsdb.BlockReader, indexr tsdb.IndexReader) (err err
 	for _, chk := range chks {
 		// Load the actual data of the chunk.
-		chk, err := chunkr.Chunk(chk.Ref)
+		chk, err := chunkr.Chunk(chk)
 		if err != nil {
 			return err
 		}


@@ -501,9 +501,37 @@ func (c *ScrapeConfig) MarshalYAML() (interface{}, error) {

 // StorageConfig configures runtime reloadable configuration options.
 type StorageConfig struct {
+	TSDBConfig      *TSDBConfig      `yaml:"tsdb,omitempty"`
 	ExemplarsConfig *ExemplarsConfig `yaml:"exemplars,omitempty"`
 }

+// TSDBConfig configures runtime reloadable configuration options.
+type TSDBConfig struct {
+	// OutOfOrderTimeWindow sets how long back in time an out-of-order sample can be inserted
+	// into the TSDB. This flag is typically set while unmarshaling the configuration file and translating
+	// OutOfOrderTimeWindowFlag's duration. The unit of this flag is expected to be the same as any
+	// other timestamp in the TSDB.
+	OutOfOrderTimeWindow int64
+
+	// OutOfOrderTimeWindowFlag holds the parsed duration from the config file.
+	// During unmarshall, this is converted into milliseconds and stored in OutOfOrderTimeWindow.
+	// This should not be used directly and must be converted into OutOfOrderTimeWindow.
+	OutOfOrderTimeWindowFlag model.Duration `yaml:"out_of_order_time_window,omitempty"`
+}
+
+// UnmarshalYAML implements the yaml.Unmarshaler interface.
+func (t *TSDBConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	*t = TSDBConfig{}
+	type plain TSDBConfig
+	if err := unmarshal((*plain)(t)); err != nil {
+		return err
+	}
+	t.OutOfOrderTimeWindow = time.Duration(t.OutOfOrderTimeWindowFlag).Milliseconds()
+	return nil
+}
+
 type TracingClientType string

 const (
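A sketch (not part of this diff) of what the unmarshaling above produces,
assuming gopkg.in/yaml.v2 is imported as yaml:

    var t config.TSDBConfig
    if err := yaml.Unmarshal([]byte("out_of_order_time_window: 30m"), &t); err != nil {
        panic(err)
    }
    // t.OutOfOrderTimeWindowFlag holds the parsed 30m duration;
    // t.OutOfOrderTimeWindow is 1800000, i.e. 30m in milliseconds.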


@@ -27,10 +27,15 @@ import (
 // The errors exposed.
 var (
 	ErrNotFound = errors.New("not found")
-	ErrOutOfOrderSample = errors.New("out of order sample")
+	// ErrOutOfOrderSample is when out of order support is disabled and the sample is out of order.
+	ErrOutOfOrderSample = errors.New("out of order sample")
+	// ErrOutOfBounds is when out of order support is disabled and the sample is older than the min valid time for the append.
+	ErrOutOfBounds = errors.New("out of bounds")
+	// ErrTooOldSample is when out of order support is enabled but the sample is outside the time window allowed.
+	ErrTooOldSample = errors.New("too old sample")
+	// ErrDuplicateSampleForTimestamp is when the sample has same timestamp but different value.
 	ErrDuplicateSampleForTimestamp = errors.New("duplicate sample for timestamp")
-	ErrOutOfBounds = errors.New("out of bounds")
 	ErrOutOfOrderExemplar = errors.New("out of order exemplar")
 	ErrDuplicateExemplar = errors.New("duplicate exemplar")
 	ErrExemplarLabelLength = fmt.Errorf("label length for exemplar exceeds maximum of %d UTF-8 characters", exemplar.ExemplarMaxLabelSetLength)
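A sketch (not part of this diff) of how an ingestion path might tell the new
cases apart; the append call itself is illustrative:

    _, err := app.Append(0, lset, ts, val)
    switch {
    case errors.Is(err, storage.ErrOutOfOrderSample):
        // OOO disabled and the sample is older than the newest sample of the series.
    case errors.Is(err, storage.ErrTooOldSample):
        // OOO enabled, but the sample falls outside the configured time window.
    case errors.Is(err, storage.ErrDuplicateSampleForTimestamp):
        // Same timestamp as an existing sample, different value.
    }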


@@ -717,3 +717,56 @@ func (h *chunkIteratorHeap) Pop() interface{} {
 	*h = old[0 : n-1]
 	return x
 }
+
+// NewConcatenatingChunkSeriesMerger returns a VerticalChunkSeriesMergeFunc that simply concatenates the
+// chunks from the series. The resultant stream of chunks for a series might be overlapping and unsorted.
+func NewConcatenatingChunkSeriesMerger() VerticalChunkSeriesMergeFunc {
+	return func(series ...ChunkSeries) ChunkSeries {
+		if len(series) == 0 {
+			return nil
+		}
+		return &ChunkSeriesEntry{
+			Lset: series[0].Labels(),
+			ChunkIteratorFn: func() chunks.Iterator {
+				iterators := make([]chunks.Iterator, 0, len(series))
+				for _, s := range series {
+					iterators = append(iterators, s.Iterator())
+				}
+				return &concatenatingChunkIterator{
+					iterators: iterators,
+				}
+			},
+		}
+	}
+}
+
+type concatenatingChunkIterator struct {
+	iterators []chunks.Iterator
+	idx       int
+	curr      chunks.Meta
+}
+
+func (c *concatenatingChunkIterator) At() chunks.Meta {
+	return c.curr
+}
+
+func (c *concatenatingChunkIterator) Next() bool {
+	if c.idx >= len(c.iterators) {
+		return false
+	}
+	if c.iterators[c.idx].Next() {
+		c.curr = c.iterators[c.idx].At()
+		return true
+	}
+	c.idx++
+	return c.Next()
+}
+
+func (c *concatenatingChunkIterator) Err() error {
+	errs := tsdb_errors.NewMulti()
+	for _, iter := range c.iterators {
+		errs.Add(iter.Err())
+	}
+	return errs.Err()
+}
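Unlike the compacting merger, nothing here is deduplicated or re-sorted; a
sketch (not part of this diff), with seriesA and seriesB standing in for any
ChunkSeries with identical labels:

    merge := NewConcatenatingChunkSeriesMerger()
    it := merge(seriesA, seriesB).Iterator()
    for it.Next() {
        _ = it.At() // chunks of seriesA first, then seriesB; possibly overlapping and unsorted
    }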


@@ -499,6 +499,140 @@ func TestCompactingChunkSeriesMerger(t *testing.T) {
 	}
 }

+func TestConcatenatingChunkSeriesMerger(t *testing.T) {
+	m := NewConcatenatingChunkSeriesMerger()
+
+	for _, tc := range []struct {
+		name     string
+		input    []ChunkSeries
+		expected ChunkSeries
+	}{
+		{
+			name: "single empty series",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
+		},
+		{
+			name: "single series",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}}),
+		},
+		{
+			name: "two empty series",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil, nil),
+		},
+		{
+			name: "two non overlapping",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{5, 5}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{5, 5}}, []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}),
+		},
+		{
+			name: "two overlapping",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{8, 8}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{8, 8}},
+				[]tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}},
+			),
+		},
+		{
+			name: "two duplicated",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{5, 5}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}},
+				[]tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{5, 5}},
+			),
+		},
+		{
+			name: "three overlapping",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{6, 6}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{0, 0}, sample{4, 4}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}},
+				[]tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{6, 6}},
+				[]tsdbutil.Sample{sample{0, 0}, sample{4, 4}},
+			),
+		},
+		{
+			name: "three in chained overlap",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{4, 4}, sample{6, 66}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{6, 6}, sample{10, 10}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}},
+				[]tsdbutil.Sample{sample{4, 4}, sample{6, 66}},
+				[]tsdbutil.Sample{sample{6, 6}, sample{10, 10}},
+			),
+		},
+		{
+			name: "three in chained overlap complex",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{0, 0}, sample{5, 5}}, []tsdbutil.Sample{sample{10, 10}, sample{15, 15}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{2, 2}, sample{20, 20}}, []tsdbutil.Sample{sample{25, 25}, sample{30, 30}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{18, 18}, sample{26, 26}}, []tsdbutil.Sample{sample{31, 31}, sample{35, 35}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				[]tsdbutil.Sample{sample{0, 0}, sample{5, 5}}, []tsdbutil.Sample{sample{10, 10}, sample{15, 15}},
+				[]tsdbutil.Sample{sample{2, 2}, sample{20, 20}}, []tsdbutil.Sample{sample{25, 25}, sample{30, 30}},
+				[]tsdbutil.Sample{sample{18, 18}, sample{26, 26}}, []tsdbutil.Sample{sample{31, 31}, sample{35, 35}},
+			),
+		},
+		{
+			name: "110 overlapping",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(0, 110)), // [0 - 110)
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(60, 50)), // [60 - 110)
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				tsdbutil.GenerateSamples(0, 110),
+				tsdbutil.GenerateSamples(60, 50),
+			),
+		},
+		{
+			name: "150 overlapping samples, simply concatenated and no splits",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(0, 90)),  // [0 - 90)
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(60, 90)), // [90 - 150)
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				tsdbutil.GenerateSamples(0, 90),
+				tsdbutil.GenerateSamples(60, 90),
+			),
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			merged := m(tc.input...)
+			require.Equal(t, tc.expected.Labels(), merged.Labels())
+			actChks, actErr := ExpandChunks(merged.Iterator())
+			expChks, expErr := ExpandChunks(tc.expected.Iterator())
+
+			require.Equal(t, expErr, actErr)
+			require.Equal(t, expChks, actChks)
+		})
+	}
+}
+
 type mockQuerier struct {
 	LabelQuerier


@@ -567,8 +567,7 @@ func (db *DB) truncate(mint int64) error {
 	// Start a new segment so low ingestion volume instances don't have more WAL
 	// than needed.
-	err = db.wal.NextSegment()
-	if err != nil {
+	if _, err := db.wal.NextSegment(); err != nil {
 		return errors.Wrap(err, "next segment")
 	}


@@ -116,7 +116,7 @@ type ChunkWriter interface {
 // ChunkReader provides reading access of serialized time series data.
 type ChunkReader interface {
 	// Chunk returns the series data chunk with the given reference.
-	Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error)
+	Chunk(meta chunks.Meta) (chunkenc.Chunk, error)

 	// Close releases all underlying resources of the reader.
 	Close() error
@@ -189,12 +189,39 @@ type BlockMetaCompaction struct {
 	// this block.
 	Parents []BlockDesc `json:"parents,omitempty"`
 	Failed  bool        `json:"failed,omitempty"`
+	// Additional information about the compaction, for example, block created from out-of-order chunks.
+	Hints []string `json:"hints,omitempty"`
+}
+
+func (bm *BlockMetaCompaction) SetOutOfOrder() {
+	if bm.containsHint(CompactionHintFromOutOfOrder) {
+		return
+	}
+	bm.Hints = append(bm.Hints, CompactionHintFromOutOfOrder)
+	sort.Strings(bm.Hints)
+}
+
+func (bm *BlockMetaCompaction) FromOutOfOrder() bool {
+	return bm.containsHint(CompactionHintFromOutOfOrder)
+}
+
+func (bm *BlockMetaCompaction) containsHint(hint string) bool {
+	for _, h := range bm.Hints {
+		if h == hint {
+			return true
+		}
+	}
+	return false
 }

 const (
 	indexFilename = "index"
 	metaFilename  = "meta.json"
 	metaVersion1  = 1
+
+	// CompactionHintFromOutOfOrder is a hint noting that the block
+	// was created from out-of-order chunks.
+	CompactionHintFromOutOfOrder = "from-out-of-order"
 )

 func chunkDir(dir string) string { return filepath.Join(dir, "chunks") }
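A sketch (not part of this diff) of consuming the new hint, using readMetaFile
as it appears later in this change:

    meta, _, err := readMetaFile(blockDir)
    if err == nil && meta.Compaction.FromOutOfOrder() {
        // This block was created from out-of-order chunks.
    }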


@@ -27,6 +27,7 @@ import (
 	"testing"

 	"github.com/go-kit/log"
+	prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
 	"github.com/stretchr/testify/require"

 	"github.com/prometheus/prometheus/model/labels"
@@ -487,7 +488,7 @@ func createBlockFromHead(tb testing.TB, dir string, head *Head) string {
 func createHead(tb testing.TB, w *wal.WAL, series []storage.Series, chunkDir string) *Head {
 	opts := DefaultHeadOptions()
 	opts.ChunkDirRoot = chunkDir
-	head, err := NewHead(nil, nil, w, opts, nil)
+	head, err := NewHead(nil, nil, w, nil, opts, nil)
 	require.NoError(tb, err)

 	app := head.Appender(context.Background())
@@ -506,6 +507,66 @@ func createHead(tb testing.TB, w *wal.WAL, series []storage.Series, chunkDir str
 	return head
 }

+func createHeadWithOOOSamples(tb testing.TB, w *wal.WAL, series []storage.Series, chunkDir string, oooSampleFrequency int) *Head {
+	opts := DefaultHeadOptions()
+	opts.ChunkDirRoot = chunkDir
+	opts.OutOfOrderTimeWindow.Store(10000000000)
+	head, err := NewHead(nil, nil, w, nil, opts, nil)
+	require.NoError(tb, err)
+
+	oooSampleLabels := make([]labels.Labels, 0, len(series))
+	oooSamples := make([]tsdbutil.SampleSlice, 0, len(series))
+
+	totalSamples := 0
+	app := head.Appender(context.Background())
+	for _, s := range series {
+		ref := storage.SeriesRef(0)
+		it := s.Iterator()
+		lset := s.Labels()
+		os := tsdbutil.SampleSlice{}
+		count := 0
+		for it.Next() {
+			totalSamples++
+			count++
+			t, v := it.At()
+			if count%oooSampleFrequency == 0 {
+				os = append(os, sample{t: t, v: v})
+				continue
+			}
+			ref, err = app.Append(ref, lset, t, v)
+			require.NoError(tb, err)
+		}
+		require.NoError(tb, it.Err())
+		if len(os) > 0 {
+			oooSampleLabels = append(oooSampleLabels, lset)
+			oooSamples = append(oooSamples, os)
+		}
+	}
+	require.NoError(tb, app.Commit())
+
+	oooSamplesAppended := 0
+	require.Equal(tb, float64(0), prom_testutil.ToFloat64(head.metrics.outOfOrderSamplesAppended))
+	app = head.Appender(context.Background())
+	for i, lset := range oooSampleLabels {
+		ref := storage.SeriesRef(0)
+		for _, sample := range oooSamples[i] {
+			ref, err = app.Append(ref, lset, sample.T(), sample.V())
+			require.NoError(tb, err)
+			oooSamplesAppended++
+		}
+	}
+	require.NoError(tb, app.Commit())
+
+	actOOOAppended := prom_testutil.ToFloat64(head.metrics.outOfOrderSamplesAppended)
+	require.GreaterOrEqual(tb, actOOOAppended, float64(oooSamplesAppended-len(series)))
+	require.LessOrEqual(tb, actOOOAppended, float64(oooSamplesAppended))
+
+	require.Equal(tb, float64(totalSamples), prom_testutil.ToFloat64(head.metrics.samplesAppended))
+
+	return head
+}
+
 const (
 	defaultLabelName  = "labelName"
 	defaultLabelValue = "labelValue"


@@ -39,7 +39,7 @@ type BlockWriter struct {
 }

 // ErrNoSeriesAppended is returned if the series count is zero while flushing blocks.
-var ErrNoSeriesAppended error = errors.New("no series appended, aborting")
+var ErrNoSeriesAppended = errors.New("no series appended, aborting")

 // NewBlockWriter create a new block writer.
 //
@@ -71,7 +71,7 @@ func (w *BlockWriter) initHead() error {
 	opts := DefaultHeadOptions()
 	opts.ChunkRange = w.blockSize
 	opts.ChunkDirRoot = w.chunkDir
-	h, err := NewHead(nil, w.logger, nil, opts, NewHeadStats())
+	h, err := NewHead(nil, w.logger, nil, nil, opts, NewHeadStats())
 	if err != nil {
 		return errors.Wrap(err, "tsdb.NewHead")
 	}


@@ -39,6 +39,21 @@ const (
 	EncXOR
 )

+// Chunk encodings for out-of-order chunks.
+// These encodings must be only used by the Head block for its internal bookkeeping.
+const (
+	OutOfOrderMask = 0b10000000
+	EncOOOXOR      = EncXOR | OutOfOrderMask
+)
+
+func IsOutOfOrderChunk(e Encoding) bool {
+	return (e & OutOfOrderMask) != 0
+}
+
+func IsValidEncoding(e Encoding) bool {
+	return e == EncXOR || e == EncOOOXOR
+}
+
 // Chunk holds a sequence of sample pairs that can be iterated over and appended to.
 type Chunk interface {
 	// Bytes returns the underlying byte slice of the chunk.
@@ -155,7 +170,7 @@ func NewPool() Pool {
 func (p *pool) Get(e Encoding, b []byte) (Chunk, error) {
 	switch e {
-	case EncXOR:
+	case EncXOR, EncOOOXOR:
 		c := p.xor.Get().(*XORChunk)
 		c.b.stream = b
 		c.b.count = 0
@@ -166,7 +181,7 @@ func (p *pool) Get(e Encoding, b []byte) (Chunk, error) {
 func (p *pool) Put(c Chunk) error {
 	switch c.Encoding() {
-	case EncXOR:
+	case EncXOR, EncOOOXOR:
 		xc, ok := c.(*XORChunk)
 		// This may happen often with wrapped chunks. Nothing we can really do about
 		// it but returning an error would cause a lot of allocations again. Thus,
@@ -188,7 +203,7 @@ func (p *pool) Put(c Chunk) error {
 // bytes.
 func FromData(e Encoding, d []byte) (Chunk, error) {
 	switch e {
-	case EncXOR:
+	case EncXOR, EncOOOXOR:
 		return &XORChunk{b: bstream{count: 0, stream: d}}, nil
 	}
 	return nil, errors.Errorf("invalid chunk encoding %q", e)


@@ -457,3 +457,12 @@ func (it *xorIterator) readValue() bool {
 	it.numRead++
 	return true
 }
+
+// OOOXORChunk holds a XORChunk and overrides the Encoding() method.
+type OOOXORChunk struct {
+	*XORChunk
+}
+
+func (c *OOOXORChunk) Encoding() Encoding {
+	return EncOOOXOR
+}
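The mask lets the OOO variant reuse the XOR payload while staying
distinguishable on replay; a small sketch (not part of this diff):

    chk := chunkenc.NewXORChunk()
    ooo := &chunkenc.OOOXORChunk{XORChunk: chk}
    // ooo.Encoding() == chunkenc.EncOOOXOR
    // chunkenc.IsOutOfOrderChunk(ooo.Encoding()) == true
    // chunkenc.IsOutOfOrderChunk(chk.Encoding()) == false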


@@ -121,6 +121,15 @@ type Meta struct {
 	// Time range the data covers.
 	// When MaxTime == math.MaxInt64 the chunk is still open and being appended to.
 	MinTime, MaxTime int64
+
+	// OOOLastRef, OOOLastMinTime and OOOLastMaxTime are kept as markers for
+	// overlapping chunks.
+	// These fields point to the last created out of order Chunk (the head) that existed
+	// when Series() was called and was overlapping.
+	// Series() and Chunk() method responses should be consistent for the same
+	// query even if new data is added in between the calls.
+	OOOLastRef                     ChunkRef
+	OOOLastMinTime, OOOLastMaxTime int64
 }

 // Iterator iterates over the chunks of a single time series.
@@ -556,8 +565,8 @@ func (s *Reader) Size() int64 {
 }

 // Chunk returns a chunk from a given reference.
-func (s *Reader) Chunk(ref ChunkRef) (chunkenc.Chunk, error) {
-	sgmIndex, chkStart := BlockChunkRef(ref).Unpack()
+func (s *Reader) Chunk(meta Meta) (chunkenc.Chunk, error) {
+	sgmIndex, chkStart := BlockChunkRef(meta.Ref).Unpack()

 	if sgmIndex >= len(s.bs) {
 		return nil, errors.Errorf("segment index %d out of range", sgmIndex)


@@ -23,6 +23,6 @@ func TestReaderWithInvalidBuffer(t *testing.T) {
 	b := realByteSlice([]byte{0x81, 0x81, 0x81, 0x81, 0x81, 0x81})
 	r := &Reader{bs: []ByteSlice{b}}

-	_, err := r.Chunk(0)
+	_, err := r.Chunk(Meta{Ref: 0})
 	require.Error(t, err)
 }


@@ -87,6 +87,18 @@ func (ref ChunkDiskMapperRef) Unpack() (seq, offset int) {
 	return seq, offset
 }

+func (ref ChunkDiskMapperRef) GreaterThanOrEqualTo(r ChunkDiskMapperRef) bool {
+	s1, o1 := ref.Unpack()
+	s2, o2 := r.Unpack()
+	return s1 > s2 || (s1 == s2 && o1 >= o2)
+}
+
+func (ref ChunkDiskMapperRef) GreaterThan(r ChunkDiskMapperRef) bool {
+	s1, o1 := ref.Unpack()
+	s2, o2 := r.Unpack()
+	return s1 > s2 || (s1 == s2 && o1 > o2)
+}
+
 // CorruptionErr is an error that's returned when corruption is encountered.
 type CorruptionErr struct {
 	Dir string
@@ -736,7 +748,7 @@ func (cdm *ChunkDiskMapper) Chunk(ref ChunkDiskMapperRef) (chunkenc.Chunk, error
 // and runs the provided function with information about each chunk. It returns on the first error encountered.
 // NOTE: This method needs to be called at least once after creating ChunkDiskMapper
 // to set the maxt of all the file.
-func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16) error) (err error) {
+func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding) error) (err error) {
 	cdm.writePathMtx.Lock()
 	defer cdm.writePathMtx.Unlock()
@@ -799,7 +811,8 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
 			break
 		}
-		idx += ChunkEncodingSize // Skip encoding.
+		chkEnc := chunkenc.Encoding(mmapFile.byteSlice.Range(idx, idx+ChunkEncodingSize)[0])
+		idx += ChunkEncodingSize
 		dataLen, n := binary.Uvarint(mmapFile.byteSlice.Range(idx, idx+MaxChunkLengthFieldSize))
 		idx += n
@@ -834,7 +847,7 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
 			mmapFile.maxt = maxt
 		}

-		if err := f(seriesRef, chunkRef, mint, maxt, numSamples); err != nil {
+		if err := f(seriesRef, chunkRef, mint, maxt, numSamples, chkEnc); err != nil {
 			if cerr, ok := err.(*CorruptionErr); ok {
 				cerr.Dir = cdm.dir.Name()
 				cerr.FileIndex = segID
@@ -857,12 +870,8 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
 	return nil
 }

-// Truncate deletes the head chunk files which are strictly below the mint.
-// mint should be in milliseconds.
-func (cdm *ChunkDiskMapper) Truncate(mint int64) error {
-	if !cdm.fileMaxtSet {
-		return errors.New("maxt of the files are not set")
-	}
+// Truncate deletes the head chunk files whose file number is less than given fileNo.
+func (cdm *ChunkDiskMapper) Truncate(fileNo uint32) error {
 	cdm.readPathMtx.RLock()

 	// Sort the file indices, else if files deletion fails in between,
@@ -875,12 +884,10 @@ func (cdm *ChunkDiskMapper) Truncate(mint int64) error {
 	var removedFiles []int
 	for _, seq := range chkFileIndices {
-		if seq == cdm.curFileSequence || cdm.mmappedChunkFiles[seq].maxt >= mint {
+		if seq == cdm.curFileSequence || uint32(seq) >= fileNo {
 			break
 		}
-		if cdm.mmappedChunkFiles[seq].maxt < mint {
-			removedFiles = append(removedFiles, seq)
-		}
+		removedFiles = append(removedFiles, seq)
 	}
 	cdm.readPathMtx.RUnlock()
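The two helpers order refs by file sequence first, then offset; a sketch (not
part of this diff), with newChunkDiskMapperRef assumed to be the package's
internal (seq, offset) constructor:

    a := newChunkDiskMapperRef(3, 100)
    b := newChunkDiskMapperRef(3, 200)
    c := newChunkDiskMapperRef(4, 0)
    // b.GreaterThan(a)          == true: same file, larger offset.
    // c.GreaterThanOrEqualTo(b) == true: a later file wins regardless of offset.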


@@ -58,6 +58,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) {
 		mint, maxt int64
 		numSamples uint16
 		chunk      chunkenc.Chunk
+		isOOO      bool
 	}
 	expectedData := []expectedDataType{}
@@ -67,7 +68,7 @@
 	for hrw.curFileSequence < 3 || hrw.chkWriter.Buffered() == 0 {
 		addChunks := func(numChunks int) {
 			for i := 0; i < numChunks; i++ {
-				seriesRef, chkRef, mint, maxt, chunk := createChunk(t, totalChunks, hrw)
+				seriesRef, chkRef, mint, maxt, chunk, isOOO := createChunk(t, totalChunks, hrw)
 				totalChunks++
 				expectedData = append(expectedData, expectedDataType{
 					seriesRef: seriesRef,
@@ -76,6 +77,7 @@
 					chunkRef:   chkRef,
 					chunk:      chunk,
 					numSamples: uint16(chunk.NumSamples()),
+					isOOO:      isOOO,
 				})

 				if hrw.curFileSequence != 1 {
@@ -147,7 +149,7 @@
 	hrw = createChunkDiskMapper(t, dir)

 	idx := 0
-	require.NoError(t, hrw.IterateAllChunks(func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16) error {
+	require.NoError(t, hrw.IterateAllChunks(func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding) error {
 		t.Helper()

 		expData := expectedData[idx]
@@ -156,6 +158,7 @@
 		require.Equal(t, expData.maxt, maxt)
 		require.Equal(t, expData.maxt, maxt)
 		require.Equal(t, expData.numSamples, numSamples)
+		require.Equal(t, expData.isOOO, chunkenc.IsOutOfOrderChunk(encoding))

 		actChunk, err := hrw.Chunk(expData.chunkRef)
 		require.NoError(t, err)
@@ -178,9 +181,7 @@ func TestChunkDiskMapper_Truncate(t *testing.T) {
 	}()

 	timeRange := 0
-	fileTimeStep := 100
-	var thirdFileMinT, sixthFileMinT int64
-	addChunk := func() int {
+	addChunk := func() {
 		t.Helper()

 		step := 100
@@ -194,8 +195,6 @@
 		<-awaitCb
 		require.NoError(t, err)
 		timeRange += step
-		return mint
 	}

 	verifyFiles := func(remainingFiles []int) {
@@ -216,17 +215,12 @@
 	// Create segments 1 to 7.
 	for i := 1; i <= 7; i++ {
 		hrw.CutNewFile()
-		mint := int64(addChunk())
-		if i == 3 {
-			thirdFileMinT = mint
-		} else if i == 6 {
-			sixthFileMinT = mint
-		}
+		addChunk()
 	}
 	verifyFiles([]int{1, 2, 3, 4, 5, 6, 7})

 	// Truncating files.
-	require.NoError(t, hrw.Truncate(thirdFileMinT))
+	require.NoError(t, hrw.Truncate(3))

 	// Add a chunk to trigger cutting of new file.
 	addChunk()
@@ -245,11 +239,11 @@
 	verifyFiles([]int{3, 4, 5, 6, 7, 8, 9})

 	// Truncating files after restart.
-	require.NoError(t, hrw.Truncate(sixthFileMinT))
+	require.NoError(t, hrw.Truncate(6))
 	verifyFiles([]int{6, 7, 8, 9})

 	// Truncating a second time without adding a chunk shouldn't create a new file.
-	require.NoError(t, hrw.Truncate(sixthFileMinT+1))
+	require.NoError(t, hrw.Truncate(6))
 	verifyFiles([]int{6, 7, 8, 9})

 	// Add a chunk to trigger cutting of new file.
@@ -257,8 +251,12 @@
 	verifyFiles([]int{6, 7, 8, 9, 10})

+	// Truncation by file number.
+	require.NoError(t, hrw.Truncate(8))
+	verifyFiles([]int{8, 9, 10})
+
 	// Truncating till current time should not delete the current active file.
-	require.NoError(t, hrw.Truncate(int64(timeRange+(2*fileTimeStep))))
+	require.NoError(t, hrw.Truncate(10))

 	// Add a chunk to trigger cutting of new file.
 	addChunk()
@@ -335,8 +333,7 @@ func TestChunkDiskMapper_Truncate_PreservesFileSequence(t *testing.T) {
 	// Truncating files till 2. It should not delete anything after 3 (inclusive)
 	// though files 4 and 6 are empty.
-	file2Maxt := hrw.mmappedChunkFiles[2].maxt
-	require.NoError(t, hrw.Truncate(file2Maxt+1))
+	require.NoError(t, hrw.Truncate(3))
 	verifyFiles([]int{3, 4, 5, 6})

 	// Add chunk, so file 6 is not empty anymore.
@@ -344,8 +341,7 @@
 	verifyFiles([]int{3, 4, 5, 6})

 	// Truncating till file 3 should also delete file 4, because it is empty.
-	file3Maxt := hrw.mmappedChunkFiles[3].maxt
-	require.NoError(t, hrw.Truncate(file3Maxt+1))
+	require.NoError(t, hrw.Truncate(5))
 	addChunk()
 	verifyFiles([]int{5, 6, 7})
@@ -381,7 +377,7 @@ func TestHeadReadWriter_TruncateAfterFailedIterateChunks(t *testing.T) {
 	hrw = createChunkDiskMapper(t, dir)

 	// Forcefully failing IterateAllChunks.
-	require.Error(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16) error {
+	require.Error(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16, _ chunkenc.Encoding) error {
 		return errors.New("random error")
 	}))
@@ -471,7 +467,9 @@ func createChunkDiskMapper(t *testing.T, dir string) *ChunkDiskMapper {
 	hrw, err := NewChunkDiskMapper(nil, dir, chunkenc.NewPool(), DefaultWriteBufferSize, writeQueueSize)
 	require.NoError(t, err)
 	require.False(t, hrw.fileMaxtSet)
-	require.NoError(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16) error { return nil }))
+	require.NoError(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16, _ chunkenc.Encoding) error {
+		return nil
+	}))
 	require.True(t, hrw.fileMaxtSet)

 	return hrw
@@ -488,13 +486,17 @@ func randomChunk(t *testing.T) chunkenc.Chunk {
 	return chunk
 }

-func createChunk(t *testing.T, idx int, hrw *ChunkDiskMapper) (seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, chunk chunkenc.Chunk) {
+func createChunk(t *testing.T, idx int, hrw *ChunkDiskMapper) (seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, chunk chunkenc.Chunk, isOOO bool) {
 	var err error
 	seriesRef = HeadSeriesRef(rand.Int63())
 	mint = int64((idx)*1000 + 1)
 	maxt = int64((idx + 1) * 1000)
 	chunk = randomChunk(t)
 	awaitCb := make(chan struct{})
+	if rand.Intn(2) == 0 {
+		isOOO = true
+		chunk = &chunkenc.OOOXORChunk{XORChunk: chunk.(*chunkenc.XORChunk)}
+	}
 	chunkRef = hrw.WriteChunk(seriesRef, mint, maxt, chunk, func(cbErr error) {
 		require.NoError(t, err)
 		close(awaitCb)


@@ -1080,7 +1080,7 @@ func BenchmarkCompactionFromHead(b *testing.B) {
 	opts := DefaultHeadOptions()
 	opts.ChunkRange = 1000
 	opts.ChunkDirRoot = chunkDir
-	h, err := NewHead(nil, nil, nil, opts, nil)
+	h, err := NewHead(nil, nil, nil, nil, opts, nil)
 	require.NoError(b, err)
 	for ln := 0; ln < labelNames; ln++ {
 		app := h.Appender(context.Background())


@@ -33,6 +33,7 @@ import (
 	"github.com/oklog/ulid"
 	"github.com/pkg/errors"
 	"github.com/prometheus/client_golang/prometheus"
+	"go.uber.org/atomic"
 	"golang.org/x/sync/errgroup"

 	"github.com/prometheus/prometheus/config"
@@ -69,18 +70,19 @@ var ErrNotReady = errors.New("TSDB not ready")
 // millisecond precision timestamps.
 func DefaultOptions() *Options {
 	return &Options{
-		WALSegmentSize:            wal.DefaultSegmentSize,
-		MaxBlockChunkSegmentSize:  chunks.DefaultChunkSegmentSize,
-		RetentionDuration:         int64(15 * 24 * time.Hour / time.Millisecond),
-		MinBlockDuration:          DefaultBlockDuration,
-		MaxBlockDuration:          DefaultBlockDuration,
-		NoLockfile:                false,
-		AllowOverlappingBlocks:    false,
-		WALCompression:            false,
-		StripeSize:                DefaultStripeSize,
-		HeadChunksWriteBufferSize: chunks.DefaultWriteBufferSize,
-		IsolationDisabled:         defaultIsolationDisabled,
-		HeadChunksWriteQueueSize:  chunks.DefaultWriteQueueSize,
+		WALSegmentSize:             wal.DefaultSegmentSize,
+		MaxBlockChunkSegmentSize:   chunks.DefaultChunkSegmentSize,
+		RetentionDuration:          int64(15 * 24 * time.Hour / time.Millisecond),
+		MinBlockDuration:           DefaultBlockDuration,
+		MaxBlockDuration:           DefaultBlockDuration,
+		NoLockfile:                 false,
+		AllowOverlappingCompaction: false,
+		AllowOverlappingQueries:    false,
+		WALCompression:             false,
+		StripeSize:                 DefaultStripeSize,
+		HeadChunksWriteBufferSize:  chunks.DefaultWriteBufferSize,
+		IsolationDisabled:          defaultIsolationDisabled,
+		OutOfOrderCapMax:           DefaultOutOfOrderCapMax,
 	}
 }
@@ -112,9 +114,19 @@ type Options struct {
 	// NoLockfile disables creation and consideration of a lock file.
 	NoLockfile bool

-	// Overlapping blocks are allowed if AllowOverlappingBlocks is true.
-	// This in-turn enables vertical compaction and vertical query merge.
-	AllowOverlappingBlocks bool
+	// Querying on overlapping blocks are allowed if AllowOverlappingQueries is true.
+	// Since querying is a required operation for TSDB, if there are going to be
+	// overlapping blocks, then this should be set to true.
+	// NOTE: Do not use this directly in DB. Use it via DB.AllowOverlappingQueries().
+	AllowOverlappingQueries bool
+
+	// Compaction of overlapping blocks are allowed if AllowOverlappingCompaction is true.
+	// This is an optional flag for overlapping blocks.
+	// The reason why this flag exists is because there are various users of the TSDB
+	// that do not want vertical compaction happening on ingest time. Instead,
+	// they'd rather keep overlapping blocks and let another component do the overlapping compaction later.
+	// For Prometheus, this will always be enabled if overlapping queries is enabled.
+	AllowOverlappingCompaction bool

 	// WALCompression will turn on Snappy compression for records on the WAL.
 	WALCompression bool
@@ -160,6 +172,15 @@ type Options struct {
 	// Disables isolation between reads and in-flight appends.
 	IsolationDisabled bool
+
+	// OutOfOrderTimeWindow specifies how much out of order is allowed, if any.
+	// This can change during run-time, so this value from here should only be used
+	// while initialising.
+	OutOfOrderTimeWindow int64
+
+	// OutOfOrderCapMax is maximum capacity for OOO chunks (in samples).
+	// If it is <=0, the default value is assumed.
+	OutOfOrderCapMax int64
 }

 type BlocksToDeleteFunc func(blocks []*Block) map[ulid.ULID]struct{}
@@ -197,6 +218,13 @@ type DB struct {
 	// Cancel a running compaction when a shutdown is initiated.
 	compactCancel context.CancelFunc

+	// oooWasEnabled is true if out of order support was enabled at least one time
+	// during the time TSDB was up. In which case we need to keep supporting
+	// out-of-order compaction and vertical queries.
+	oooWasEnabled atomic.Bool
+
+	registerer prometheus.Registerer
 }

 type dbMetrics struct {
@@ -372,9 +400,17 @@ func (db *DBReadOnly) FlushWAL(dir string) (returnErr error) {
 	if err != nil {
 		return err
 	}
+	var wbl *wal.WAL
+	wblDir := filepath.Join(db.dir, wal.WblDirName)
+	if _, err := os.Stat(wblDir); !os.IsNotExist(err) {
+		wbl, err = wal.Open(db.logger, wblDir)
+		if err != nil {
+			return err
+		}
+	}
 	opts := DefaultHeadOptions()
 	opts.ChunkDirRoot = db.dir
-	head, err := NewHead(nil, db.logger, w, opts, NewHeadStats())
+	head, err := NewHead(nil, db.logger, w, wbl, opts, NewHeadStats())
 	if err != nil {
 		return err
 	}
@@ -430,7 +466,7 @@ func (db *DBReadOnly) loadDataAsQueryable(maxt int64) (storage.SampleAndChunkQue
 	opts := DefaultHeadOptions()
 	opts.ChunkDirRoot = db.dir
-	head, err := NewHead(nil, db.logger, nil, opts, NewHeadStats())
+	head, err := NewHead(nil, db.logger, nil, nil, opts, NewHeadStats())
 	if err != nil {
 		return nil, err
 	}
@@ -448,9 +484,17 @@ func (db *DBReadOnly) loadDataAsQueryable(maxt int64) (storage.SampleAndChunkQue
 		if err != nil {
 			return nil, err
 		}
+		var wbl *wal.WAL
+		wblDir := filepath.Join(db.dir, wal.WblDirName)
+		if _, err := os.Stat(wblDir); !os.IsNotExist(err) {
+			wbl, err = wal.Open(db.logger, wblDir)
+			if err != nil {
+				return nil, err
+			}
+		}
 		opts := DefaultHeadOptions()
 		opts.ChunkDirRoot = db.dir
-		head, err = NewHead(nil, db.logger, w, opts, NewHeadStats())
+		head, err = NewHead(nil, db.logger, w, wbl, opts, NewHeadStats())
 		if err != nil {
 			return nil, err
 		}
@@ -598,6 +642,15 @@ func validateOpts(opts *Options, rngs []int64) (*Options, []int64) {
 	if opts.MinBlockDuration > opts.MaxBlockDuration {
 		opts.MaxBlockDuration = opts.MinBlockDuration
 	}
+	if opts.OutOfOrderTimeWindow > 0 {
+		opts.AllowOverlappingQueries = true
+	}
+	if opts.OutOfOrderCapMax <= 0 {
+		opts.OutOfOrderCapMax = DefaultOutOfOrderCapMax
+	}
+	if opts.OutOfOrderTimeWindow < 0 {
+		opts.OutOfOrderTimeWindow = 0
+	}

 	if len(rngs) == 0 {
 		// Start with smallest block duration and create exponential buckets until the exceed the
@@ -634,6 +687,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 	}

 	walDir := filepath.Join(dir, "wal")
+	wblDir := filepath.Join(dir, wal.WblDirName)

 	// Migrate old WAL if one exists.
 	if err := MigrateWAL(l, walDir); err != nil {
@@ -656,6 +710,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 		autoCompact:    true,
 		chunkPool:      chunkenc.NewPool(),
 		blocksToDelete: opts.BlocksToDelete,
+		registerer:     r,
 	}
 	defer func() {
 		// Close files if startup fails somewhere.
@@ -694,7 +749,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 	}
 	db.compactCancel = cancel

-	var wlog *wal.WAL
+	var wlog, wblog *wal.WAL
 	segmentSize := wal.DefaultSegmentSize
 	// Wal is enabled.
 	if opts.WALSegmentSize >= 0 {
@@ -706,8 +761,19 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 		if err != nil {
 			return nil, err
 		}
+		// Check if there is a WBL on disk, in which case we should replay that data.
+		wblSize, err := fileutil.DirSize(wblDir)
+		if err != nil && !os.IsNotExist(err) {
+			return nil, err
+		}
+		if opts.OutOfOrderTimeWindow > 0 || wblSize > 0 {
+			wblog, err = wal.NewSize(l, r, wblDir, segmentSize, opts.WALCompression)
+			if err != nil {
+				return nil, err
+			}
+		}
 	}
+	db.oooWasEnabled.Store(opts.OutOfOrderTimeWindow > 0)
 	headOpts := DefaultHeadOptions()
 	headOpts.ChunkRange = rngs[0]
 	headOpts.ChunkDirRoot = dir
@@ -719,11 +785,13 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 	headOpts.EnableExemplarStorage = opts.EnableExemplarStorage
 	headOpts.MaxExemplars.Store(opts.MaxExemplars)
 	headOpts.EnableMemorySnapshotOnShutdown = opts.EnableMemorySnapshotOnShutdown
+	headOpts.OutOfOrderTimeWindow.Store(opts.OutOfOrderTimeWindow)
+	headOpts.OutOfOrderCapMax.Store(opts.OutOfOrderCapMax)
 	if opts.IsolationDisabled {
 		// We only override this flag if isolation is disabled at DB level. We use the default otherwise.
 		headOpts.IsolationDisabled = opts.IsolationDisabled
 	}
-	db.head, err = NewHead(r, l, wlog, headOpts, stats.Head)
+	db.head, err = NewHead(r, l, wlog, wblog, headOpts, stats.Head)
 	if err != nil {
 		return nil, err
 	}
@@ -741,20 +809,36 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 	}

 	// Set the min valid time for the ingested samples
 	// to be no lower than the maxt of the last block.
-	blocks := db.Blocks()
 	minValidTime := int64(math.MinInt64)
-	if len(blocks) > 0 {
-		minValidTime = blocks[len(blocks)-1].Meta().MaxTime
+	// We do not consider blocks created from out-of-order samples for Head's minValidTime
+	// since minValidTime is only for the in-order data and we do not want to discard unnecessary
+	// samples from the Head.
+	inOrderMaxTime, ok := db.inOrderBlocksMaxTime()
+	if ok {
+		minValidTime = inOrderMaxTime
 	}

 	if initErr := db.head.Init(minValidTime); initErr != nil {
 		db.head.metrics.walCorruptionsTotal.Inc()
-		level.Warn(db.logger).Log("msg", "Encountered WAL read error, attempting repair", "err", initErr)
-		if err := wlog.Repair(initErr); err != nil {
-			return nil, errors.Wrap(err, "repair corrupted WAL")
+		isOOOErr := isErrLoadOOOWal(initErr)
+		if isOOOErr {
+			level.Warn(db.logger).Log("msg", "Encountered OOO WAL read error, attempting repair", "err", initErr)
+			if err := wblog.Repair(initErr); err != nil {
+				return nil, errors.Wrap(err, "repair corrupted OOO WAL")
+			}
+		} else {
+			level.Warn(db.logger).Log("msg", "Encountered WAL read error, attempting repair", "err", initErr)
+			if err := wlog.Repair(initErr); err != nil {
+				return nil, errors.Wrap(err, "repair corrupted WAL")
+			}
 		}
 	}

+	if db.head.MinOOOTime() != int64(math.MaxInt64) {
+		// Some OOO data was replayed from the disk that needs compaction and cleanup.
+		db.oooWasEnabled.Store(true)
+	}
+
 	go db.run()

 	return db, nil
@@ -846,8 +930,58 @@ func (db *DB) Appender(ctx context.Context) storage.Appender {
 	return dbAppender{db: db, Appender: db.head.Appender(ctx)}
 }

+// ApplyConfig applies a new config to the DB.
+// Behaviour of 'OutOfOrderTimeWindow' is as follows:
+// OOO enabled = oooTimeWindow > 0. OOO disabled = oooTimeWindow is 0.
+// 1) Before: OOO disabled, Now: OOO enabled =>
+//    - A new WBL is created for the head block.
+//    - OOO compaction is enabled.
+//    - Overlapping queries are enabled.
+//
+// 2) Before: OOO enabled, Now: OOO enabled =>
+//    - Only the time window is updated.
+//
+// 3) Before: OOO enabled, Now: OOO disabled =>
+//    - Time Window set to 0. So no new OOO samples will be allowed.
+//    - OOO WBL will stay and will be eventually cleaned up.
+//    - OOO Compaction and overlapping queries will remain enabled until a restart or until all OOO samples are compacted.
+//
+// 4) Before: OOO disabled, Now: OOO disabled => no-op.
 func (db *DB) ApplyConfig(conf *config.Config) error {
-	return db.head.ApplyConfig(conf)
+	oooTimeWindow := int64(0)
+	if conf.StorageConfig.TSDBConfig != nil {
+		oooTimeWindow = conf.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
+	}
+	if oooTimeWindow < 0 {
+		oooTimeWindow = 0
+	}
+
+	// Create WBL if it was not present and if OOO is enabled with WAL enabled.
+	var wblog *wal.WAL
+	var err error
+	if db.head.wbl != nil {
+		// The existing WBL from the disk might have been replayed while OOO was disabled.
+		wblog = db.head.wbl
+	} else if !db.oooWasEnabled.Load() && oooTimeWindow > 0 && db.opts.WALSegmentSize >= 0 {
+		segmentSize := wal.DefaultSegmentSize
+		// Wal is set to a custom size.
+		if db.opts.WALSegmentSize > 0 {
+			segmentSize = db.opts.WALSegmentSize
+		}
+		oooWalDir := filepath.Join(db.dir, wal.WblDirName)
+		wblog, err = wal.NewSize(db.logger, db.registerer, oooWalDir, segmentSize, db.opts.WALCompression)
+		if err != nil {
+			return err
+		}
+	}
+
+	db.opts.OutOfOrderTimeWindow = oooTimeWindow
+	db.head.ApplyConfig(conf, wblog)
+
+	if !db.oooWasEnabled.Load() {
+		db.oooWasEnabled.Store(oooTimeWindow > 0)
+	}
+	return nil
 }

 // dbAppender wraps the DB's head appender and triggers compactions on commit
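A sketch (not part of this diff) of driving the toggle above from code rather
than through a YAML reload, using the config types added in this change:

    conf := &config.Config{
        StorageConfig: config.StorageConfig{
            TSDBConfig: &config.TSDBConfig{
                OutOfOrderTimeWindow: time.Hour.Milliseconds(), // 1h window, in ms
            },
        },
    }
    if err := db.ApplyConfig(conf); err != nil {
        return err
    }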
@@ -946,6 +1080,14 @@ func (db *DB) Compact() (returnErr error) {
 			"block_range", db.head.chunkRange.Load(),
 		)
 	}

+	if lastBlockMaxt != math.MinInt64 {
+		// The head was compacted, so we compact OOO head as well.
+		if err := db.compactOOOHead(); err != nil {
+			return errors.Wrap(err, "compact ooo head")
+		}
+	}
+
 	return db.compactBlocks()
 }
@@ -964,6 +1106,102 @@ func (db *DB) CompactHead(head *RangeHead) error {
return nil
}
// CompactOOOHead compacts the OOO Head.
func (db *DB) CompactOOOHead() error {
db.cmtx.Lock()
defer db.cmtx.Unlock()
return db.compactOOOHead()
}
func (db *DB) compactOOOHead() error {
if !db.oooWasEnabled.Load() {
return nil
}
oooHead, err := NewOOOCompactionHead(db.head)
if err != nil {
return errors.Wrap(err, "get ooo compaction head")
}
ulids, err := db.compactOOO(db.dir, oooHead)
if err != nil {
return errors.Wrap(err, "compact ooo head")
}
if err := db.reloadBlocks(); err != nil {
errs := tsdb_errors.NewMulti(err)
for _, uid := range ulids {
if errRemoveAll := os.RemoveAll(filepath.Join(db.dir, uid.String())); errRemoveAll != nil {
errs.Add(errRemoveAll)
}
}
return errors.Wrap(errs.Err(), "reloadBlocks blocks after failed compact ooo head")
}
lastWBLFile, minOOOMmapRef := oooHead.LastWBLFile(), oooHead.LastMmapRef()
if lastWBLFile != 0 || minOOOMmapRef != 0 {
if err := db.head.truncateOOO(lastWBLFile, minOOOMmapRef); err != nil {
return errors.Wrap(err, "truncate ooo wbl")
}
}
return nil
}
// compactOOO creates a new block per possible block range in the compactor's directory from the OOO Head given.
// Each ULID in the result corresponds to a block in a unique time range.
func (db *DB) compactOOO(dest string, oooHead *OOOCompactionHead) (_ []ulid.ULID, err error) {
start := time.Now()
blockSize := oooHead.ChunkRange()
oooHeadMint, oooHeadMaxt := oooHead.MinTime(), oooHead.MaxTime()
ulids := make([]ulid.ULID, 0)
defer func() {
if err != nil {
// Best effort removal of created block on any error.
for _, uid := range ulids {
_ = os.RemoveAll(filepath.Join(db.dir, uid.String()))
}
}
}()
for t := blockSize * (oooHeadMint / blockSize); t <= oooHeadMaxt; t = t + blockSize {
mint, maxt := t, t+blockSize
// Block intervals are half-open: [b.MinTime, b.MaxTime), so a block's MaxTime is always one past the last sample it can include.
uid, err := db.compactor.Write(dest, oooHead.CloneForTimeRange(mint, maxt-1), mint, maxt, nil)
if err != nil {
return nil, err
}
if uid.Compare(ulid.ULID{}) != 0 {
ulids = append(ulids, uid)
blockDir := filepath.Join(dest, uid.String())
meta, _, err := readMetaFile(blockDir)
if err != nil {
return ulids, errors.Wrap(err, "read meta")
}
meta.Compaction.SetOutOfOrder()
_, err = writeMetaFile(db.logger, blockDir, meta)
if err != nil {
return ulids, errors.Wrap(err, "write meta")
}
}
}
if len(ulids) == 0 {
level.Info(db.logger).Log(
"msg", "compact ooo head resulted in no blocks",
"duration", time.Since(start),
)
return nil, nil
}
level.Info(db.logger).Log(
"msg", "out-of-order compaction completed",
"duration", time.Since(start),
"ulids", fmt.Sprintf("%v", ulids),
)
return ulids, nil
}
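To make the loop bounds concrete, a standalone sketch of the same alignment arithmetic; blockRanges is a hypothetical helper, not part of this patch.

	package main

	import "fmt"

	// blockRanges mirrors compactOOO's loop: align the first block to a
	// blockSize boundary at or below mint, then step by blockSize.
	func blockRanges(mint, maxt, blockSize int64) [][2]int64 {
		var out [][2]int64
		for t := blockSize * (mint / blockSize); t <= maxt; t += blockSize {
			out = append(out, [2]int64{t, t + blockSize})
		}
		return out
	}

	func main() {
		// With 2h blocks (7_200_000 ms), OOO samples spanning 01:30..03:10
		// produce two blocks: [00:00,02:00) and [02:00,04:00).
		fmt.Println(blockRanges(5_400_000, 11_400_000, 7_200_000))
	}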
// compactHead compacts the given RangeHead.
// The compaction mutex should be held before calling this method.
func (db *DB) compactHead(head *RangeHead) error {

@@ -1038,10 +1276,11 @@ func (db *DB) reload() error {
if err := db.reloadBlocks(); err != nil {
return errors.Wrap(err, "reloadBlocks")
}
maxt, ok := db.inOrderBlocksMaxTime()
if !ok {
return nil
}
if err := db.head.Truncate(maxt); err != nil {
return errors.Wrap(err, "head truncate")
}
return nil
@@ -1121,7 +1360,7 @@ func (db *DB) reloadBlocks() (err error) {
sort.Slice(toLoad, func(i, j int) bool {
return toLoad[i].Meta().MinTime < toLoad[j].Meta().MinTime
})
if !db.AllowOverlappingQueries() {
if err := validateBlockSequence(toLoad); err != nil {
return errors.Wrap(err, "invalid block sequence")
}

@@ -1151,6 +1390,10 @@ func (db *DB) reloadBlocks() (err error) {
return nil
}
func (db *DB) AllowOverlappingQueries() bool {
return db.opts.AllowOverlappingQueries || db.oooWasEnabled.Load()
}
func openBlocks(l log.Logger, dir string, loaded []*Block, chunkPool chunkenc.Pool) (blocks []*Block, corrupted map[ulid.ULID]error, err error) {
bDirs, err := blockDirs(dir)
if err != nil {

@@ -1428,6 +1671,21 @@ func (db *DB) Blocks() []*Block {
return db.blocks
}
// inOrderBlocksMaxTime returns the max time among the blocks that were not totally created
// out of out-of-order data. If the returned boolean is true, it means there is at least
// one such block.
func (db *DB) inOrderBlocksMaxTime() (maxt int64, ok bool) {
maxt, ok = int64(math.MinInt64), false
// If blocks are overlapping, last block might not have the max time. So check all blocks.
for _, b := range db.Blocks() {
if !b.meta.Compaction.FromOutOfOrder() && b.meta.MaxTime > maxt {
ok = true
maxt = b.meta.MaxTime
}
}
return maxt, ok
}
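The exclusion matters because a block built purely from OOO data can have a MaxTime far ahead of what the in-order head has durably persisted; truncating the head to it would drop in-order samples that were never compacted. A toy model (blockMeta is a hypothetical stand-in for the real block meta):

	package main

	import (
		"fmt"
		"math"
	)

	type blockMeta struct {
		maxTime int64
		fromOOO bool // what meta.Compaction.SetOutOfOrder() records in the real meta
	}

	// inOrderMaxTime mirrors inOrderBlocksMaxTime: head truncation must follow
	// only blocks that contain in-order data.
	func inOrderMaxTime(blocks []blockMeta) (int64, bool) {
		maxt, ok := int64(math.MinInt64), false
		for _, b := range blocks {
			if !b.fromOOO && b.maxTime > maxt {
				maxt, ok = b.maxTime, true
			}
		}
		return maxt, ok
	}

	func main() {
		fmt.Println(inOrderMaxTime([]blockMeta{{100, false}, {500, true}, {200, false}}))
		// 200 true: the OOO block's MaxTime of 500 is ignored.
	}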
// Head returns the database's head.
func (db *DB) Head() *Head {
return db.head

@@ -1526,13 +1784,13 @@ func (db *DB) Querier(_ context.Context, mint, maxt int64) (storage.Querier, err
blocks = append(blocks, b)
}
}
var inOrderHeadQuerier storage.Querier
if maxt >= db.head.MinTime() {
rh := NewRangeHead(db.head, mint, maxt)
var err error
inOrderHeadQuerier, err = NewBlockQuerier(rh, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open block querier for head %s", rh)
}

// Getting the querier above registers itself in the queue that the truncation waits on.
@@ -1540,20 +1798,30 @@ func (db *DB) Querier(_ context.Context, mint, maxt int64) (storage.Querier, err
// won't run into a race later since any truncation that comes after will wait on this querier if it overlaps.
shouldClose, getNew, newMint := db.head.IsQuerierCollidingWithTruncation(mint, maxt)
if shouldClose {
if err := inOrderHeadQuerier.Close(); err != nil {
return nil, errors.Wrapf(err, "closing head block querier %s", rh)
}
inOrderHeadQuerier = nil
}
if getNew {
rh := NewRangeHead(db.head, newMint, maxt)
inOrderHeadQuerier, err = NewBlockQuerier(rh, newMint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open block querier for head while getting new querier %s", rh)
}
}
}
var outOfOrderHeadQuerier storage.Querier
if overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime()) {
rh := NewOOORangeHead(db.head, mint, maxt)
var err error
outOfOrderHeadQuerier, err = NewBlockQuerier(rh, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open block querier for ooo head %s", rh)
}
}
blockQueriers := make([]storage.Querier, 0, len(blocks))
for _, b := range blocks {
q, err := NewBlockQuerier(b, mint, maxt)

@@ -1568,14 +1836,18 @@ func (db *DB) Querier(_ context.Context, mint, maxt int64) (storage.Querier, err
}
return nil, errors.Wrapf(err, "open querier for block %s", b)
}
if inOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, inOrderHeadQuerier)
}
if outOfOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, outOfOrderHeadQuerier)
} }
return storage.NewMergeQuerier(blockQueriers, nil, storage.ChainedSeriesMerge), nil
}
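From the caller's perspective the split between heads is invisible; a hedged usage sketch, assuming the storage.Querier API of this tree:

	package main

	import (
		"context"
		"fmt"
		"log"

		"github.com/prometheus/prometheus/model/labels"
		"github.com/prometheus/prometheus/tsdb"
	)

	func main() {
		db, err := tsdb.Open("data", nil, nil, tsdb.DefaultOptions(), nil)
		if err != nil {
			log.Fatal(err)
		}
		defer db.Close()

		// A single querier covers persistent blocks, the in-order head and,
		// once OOO data exists, the OOO head; ChainedSeriesMerge deduplicates
		// samples that appear in more than one of them.
		q, err := db.Querier(context.Background(), 0, 10_000)
		if err != nil {
			log.Fatal(err)
		}
		defer q.Close()

		ss := q.Select(false, nil, labels.MustNewMatcher(labels.MatchEqual, "job", "demo"))
		for ss.Next() {
			fmt.Println(ss.At().Labels())
		}
	}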
// blockChunkQuerierForRange returns individual block chunk queriers from the persistent blocks, in-order head block, and the
// out-of-order head block, overlapping with the given time range.
func (db *DB) blockChunkQuerierForRange(mint, maxt int64) ([]storage.ChunkQuerier, error) {
var blocks []BlockReader

db.mtx.RLock()

@@ -1586,11 +1858,11 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu
blocks = append(blocks, b)
}
}
var inOrderHeadQuerier storage.ChunkQuerier
if maxt >= db.head.MinTime() {
rh := NewRangeHead(db.head, mint, maxt)
var err error
inOrderHeadQuerier, err = NewBlockChunkQuerier(rh, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open querier for head %s", rh)
}
@@ -1600,20 +1872,30 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu
// won't run into a race later since any truncation that comes after will wait on this querier if it overlaps.
shouldClose, getNew, newMint := db.head.IsQuerierCollidingWithTruncation(mint, maxt)
if shouldClose {
if err := inOrderHeadQuerier.Close(); err != nil {
return nil, errors.Wrapf(err, "closing head querier %s", rh)
}
inOrderHeadQuerier = nil
}
if getNew {
rh := NewRangeHead(db.head, newMint, maxt)
inOrderHeadQuerier, err = NewBlockChunkQuerier(rh, newMint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open querier for head while getting new querier %s", rh)
}
}
}
var outOfOrderHeadQuerier storage.ChunkQuerier
if overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime()) {
rh := NewOOORangeHead(db.head, mint, maxt)
var err error
outOfOrderHeadQuerier, err = NewBlockChunkQuerier(rh, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open block chunk querier for ooo head %s", rh)
}
}
blockQueriers := make([]storage.ChunkQuerier, 0, len(blocks))
for _, b := range blocks {
q, err := NewBlockChunkQuerier(b, mint, maxt)

@@ -1628,10 +1910,22 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu
}
return nil, errors.Wrapf(err, "open querier for block %s", b)
}
if inOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, inOrderHeadQuerier)
}
if outOfOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, outOfOrderHeadQuerier)
} }
return blockQueriers, nil
}
// ChunkQuerier returns a new chunk querier over the data partition for the given time range.
func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
blockQueriers, err := db.blockChunkQuerierForRange(mint, maxt)
if err != nil {
return nil, err
}
return storage.NewMergeChunkQuerier(blockQueriers, nil, storage.NewCompactingChunkSeriesMerger(storage.ChainedSeriesMerge)), nil
}

File diff suppressed because it is too large
@@ -25,9 +25,10 @@ import (
"github.com/go-kit/log/level"
"github.com/oklog/ulid"
"github.com/pkg/errors"
"go.uber.org/atomic"

"github.com/prometheus/client_golang/prometheus"

"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/model/exemplar"
"github.com/prometheus/prometheus/model/labels"
@@ -62,15 +63,19 @@ var (
type Head struct {
chunkRange atomic.Int64
numSeries atomic.Uint64
minOOOTime, maxOOOTime atomic.Int64 // TODO(jesusvazquez) These should be updated after garbage collection.
minTime, maxTime atomic.Int64 // Current min and max of the samples included in the head. TODO(jesusvazquez) Ensure these are properly tracked.
minValidTime atomic.Int64 // Mint allowed to be added to the head. It shouldn't be lower than the maxt of the last persisted block.
lastWALTruncationTime atomic.Int64
lastMemoryTruncationTime atomic.Int64
lastSeriesID atomic.Uint64
// All the ooo m-map chunks should be after this. This is used to truncate old ooo m-map chunks.
// This should be typecasted to chunks.ChunkDiskMapperRef after loading.
minOOOMmapRef atomic.Uint64
metrics *headMetrics
opts *HeadOptions
wal, wbl *wal.WAL
exemplarMetrics *ExemplarMetrics
exemplars ExemplarStorage
logger log.Logger
@@ -87,6 +92,7 @@ type Head struct {
deletedMtx sync.Mutex
deleted map[chunks.HeadSeriesRef]int // Deleted series, and what WAL segment they must be kept until.
// TODO(codesome): Extend MemPostings to return only OOOPostings, Set OOOStatus, ... Like an additional map of ooo postings.
postings *index.MemPostings // Postings lists for terms.

tombstones *tombstones.MemTombstones
@@ -130,6 +136,8 @@ type HeadOptions struct {
ChunkPool chunkenc.Pool
ChunkWriteBufferSize int
ChunkWriteQueueSize int
OutOfOrderTimeWindow atomic.Int64
OutOfOrderCapMax atomic.Int64
// StripeSize sets the number of entries in the hash map, it must be a power of 2.
// A larger StripeSize will allocate more memory up-front, but will increase performance when handling a large number of series.

@@ -142,8 +150,13 @@ type HeadOptions struct {
IsolationDisabled bool
}
const (
// DefaultOutOfOrderCapMax is the default maximum size of an in-memory out-of-order chunk.
DefaultOutOfOrderCapMax int64 = 32
)
func DefaultHeadOptions() *HeadOptions {
ho := &HeadOptions{
ChunkRange: DefaultBlockDuration,
ChunkDirRoot: "",
ChunkPool: chunkenc.NewPool(),
@@ -153,6 +166,8 @@ func DefaultHeadOptions() *HeadOptions {
SeriesCallback: &noopSeriesLifecycleCallback{},
IsolationDisabled: defaultIsolationDisabled,
}
ho.OutOfOrderCapMax.Store(DefaultOutOfOrderCapMax)
return ho
}
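A short sketch of setting these options directly; both fields are atomics precisely so that they can also be changed while the Head is live:

	package main

	import (
		"fmt"

		"github.com/prometheus/prometheus/tsdb"
	)

	func main() {
		opts := tsdb.DefaultHeadOptions()
		opts.OutOfOrderTimeWindow.Store(30 * 60 * 1000) // 30 min, in ms
		opts.OutOfOrderCapMax.Store(64)                 // per-series in-memory OOO chunk cap, must be 1..255
		fmt.Println(opts.OutOfOrderCapMax.Load())
	}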
// SeriesLifecycleCallback specifies a list of callbacks that will be called during a lifecycle of a series.

@@ -171,11 +186,23 @@ type SeriesLifecycleCallback interface {
}

// NewHead opens the head block in dir.
func NewHead(r prometheus.Registerer, l log.Logger, wal, wbl *wal.WAL, opts *HeadOptions, stats *HeadStats) (*Head, error) {
var err error
if l == nil {
l = log.NewNopLogger()
}
if opts.OutOfOrderTimeWindow.Load() < 0 {
opts.OutOfOrderTimeWindow.Store(0)
}
// Time window can be set on runtime. So the capMin and capMax should be valid
// even if ooo is not enabled yet.
capMax := opts.OutOfOrderCapMax.Load()
if capMax <= 0 || capMax > 255 {
return nil, errors.Errorf("OOOCapMax of %d is invalid. must be > 0 and <= 255", capMax)
}
if opts.ChunkRange < 1 {
return nil, errors.Errorf("invalid chunk range %d", opts.ChunkRange)
}

@@ -193,6 +220,7 @@ func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, opts *HeadOpti
h := &Head{
wal: wal,
wbl: wbl,
logger: l,
opts: opts,
memChunkPool: sync.Pool{
@@ -254,35 +282,40 @@ func (h *Head) resetInMemoryState() error {
h.chunkRange.Store(h.opts.ChunkRange)
h.minTime.Store(math.MaxInt64)
h.maxTime.Store(math.MinInt64)
h.minOOOTime.Store(math.MaxInt64)
h.maxOOOTime.Store(math.MinInt64)
h.lastWALTruncationTime.Store(math.MinInt64)
h.lastMemoryTruncationTime.Store(math.MinInt64)
return nil
}
type headMetrics struct {
activeAppenders prometheus.Gauge
series prometheus.GaugeFunc
seriesCreated prometheus.Counter
seriesRemoved prometheus.Counter
seriesNotFound prometheus.Counter
chunks prometheus.Gauge
chunksCreated prometheus.Counter
chunksRemoved prometheus.Counter
gcDuration prometheus.Summary
samplesAppended prometheus.Counter
outOfOrderSamplesAppended prometheus.Counter
outOfBoundSamples prometheus.Counter
outOfOrderSamples prometheus.Counter
tooOldSamples prometheus.Counter
walTruncateDuration prometheus.Summary
walCorruptionsTotal prometheus.Counter
dataTotalReplayDuration prometheus.Gauge
headTruncateFail prometheus.Counter
headTruncateTotal prometheus.Counter
checkpointDeleteFail prometheus.Counter
checkpointDeleteTotal prometheus.Counter
checkpointCreationFail prometheus.Counter
checkpointCreationTotal prometheus.Counter
mmapChunkCorruptionTotal prometheus.Counter
snapshotReplayErrorTotal prometheus.Counter // Will be either 0 or 1.
oooHistogram prometheus.Histogram
}
func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {

@@ -333,7 +366,7 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
Name: "prometheus_tsdb_wal_corruptions_total",
Help: "Total number of WAL corruptions.",
}),
dataTotalReplayDuration: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "prometheus_tsdb_data_replay_duration_seconds",
Help: "Time taken to replay the data on disk.",
}),

@@ -341,13 +374,21 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
Name: "prometheus_tsdb_head_samples_appended_total",
Help: "Total number of appended samples.",
}),
outOfOrderSamplesAppended: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_head_out_of_order_samples_appended_total",
Help: "Total number of appended out of order samples.",
}),
outOfBoundSamples: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_out_of_bound_samples_total",
Help: "Total number of out of bound samples ingestion failed attempts with out of order support disabled.",
}),
outOfOrderSamples: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_out_of_order_samples_total",
Help: "Total number of out of order samples ingestion failed attempts due to out of order being disabled.",
}),
tooOldSamples: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_too_old_samples_total",
Help: "Total number of out of order samples ingestion failed attempts with out of support enabled, but sample outside of time window.",
}), }),
headTruncateFail: prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_head_truncations_failed_total",

@@ -381,6 +422,19 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
Name: "prometheus_tsdb_snapshot_replay_error_total",
Help: "Total number of snapshot replays that failed.",
}),
oooHistogram: prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "prometheus_tsdb_sample_ooo_delta",
Help: "Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).",
Buckets: []float64{
60 * 10, // 10 min
60 * 30, // 30 min
60 * 60, // 60 min
60 * 60 * 2, // 2h
60 * 60 * 3, // 3h
60 * 60 * 6, // 6h
60 * 60 * 12, // 12h
},
}),
}

if r != nil {

@@ -396,10 +450,12 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
m.gcDuration,
m.walTruncateDuration,
m.walCorruptionsTotal,
m.dataTotalReplayDuration,
m.samplesAppended,
m.outOfOrderSamplesAppended,
m.outOfBoundSamples,
m.outOfOrderSamples,
m.tooOldSamples,
m.headTruncateFail,
m.headTruncateTotal,
m.checkpointDeleteFail,
@@ -517,8 +573,9 @@ func (h *Head) Init(minValidTime int64) error {
}

mmapChunkReplayStart := time.Now()
mmappedChunks, oooMmappedChunks, lastMmapRef, err := h.loadMmappedChunks(refSeries)
if err != nil {
// TODO(codesome): clear out all m-map chunks here for refSeries.
level.Error(h.logger).Log("msg", "Loading on-disk chunks failed", "err", err) level.Error(h.logger).Log("msg", "Loading on-disk chunks failed", "err", err)
if _, ok := errors.Cause(err).(*chunks.CorruptionErr); ok { if _, ok := errors.Cause(err).(*chunks.CorruptionErr); ok {
h.metrics.mmapChunkCorruptionTotal.Inc() h.metrics.mmapChunkCorruptionTotal.Inc()
@ -529,7 +586,7 @@ func (h *Head) Init(minValidTime int64) error {
// If this fails, data will be recovered from WAL. // If this fails, data will be recovered from WAL.
// Hence we wont lose any data (given WAL is not corrupt). // Hence we wont lose any data (given WAL is not corrupt).
mmappedChunks, err = h.removeCorruptedMmappedChunks(err) mmappedChunks, oooMmappedChunks, lastMmapRef, err = h.removeCorruptedMmappedChunks(err)
if err != nil { if err != nil {
return err return err
} }
@@ -572,7 +629,7 @@ func (h *Head) Init(minValidTime int64) error {
// A corrupted checkpoint is a hard error for now and requires user
// intervention. There's likely little data that can be recovered anyway.
if err := h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks, oooMmappedChunks); err != nil {
return errors.Wrap(err, "backfill checkpoint")
}
h.updateWALReplayStatusRead(startFrom)

@@ -605,7 +662,7 @@ func (h *Head) Init(minValidTime int64) error {
if err != nil {
return errors.Wrapf(err, "segment reader (offset=%d)", offset)
}
err = h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks, oooMmappedChunks)
if err := sr.Close(); err != nil {
level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
}

@@ -615,26 +672,94 @@ func (h *Head) Init(minValidTime int64) error {
level.Info(h.logger).Log("msg", "WAL segment loaded", "segment", i, "maxSegment", endAt)
h.updateWALReplayStatusRead(i)
}
walReplayDuration := time.Since(walReplayStart)

wblReplayStart := time.Now()
if h.wbl != nil {
// Replay OOO WAL.
startFrom, endAt, e = wal.Segments(h.wbl.Dir())
if e != nil {
return errors.Wrap(e, "finding OOO WAL segments")
}
h.startWALReplayStatus(startFrom, endAt)
for i := startFrom; i <= endAt; i++ {
s, err := wal.OpenReadSegment(wal.SegmentName(h.wbl.Dir(), i))
if err != nil {
return errors.Wrap(err, fmt.Sprintf("open WBL segment: %d", i))
}
sr := wal.NewSegmentBufReader(s)
err = h.loadWBL(wal.NewReader(sr), multiRef, lastMmapRef)
if err := sr.Close(); err != nil {
level.Warn(h.logger).Log("msg", "Error while closing the wbl segments reader", "err", err)
}
if err != nil {
return err
}
level.Info(h.logger).Log("msg", "WBL segment loaded", "segment", i, "maxSegment", endAt)
h.updateWALReplayStatusRead(i)
}
}
wblReplayDuration := time.Since(wblReplayStart)
totalReplayDuration := time.Since(start)
h.metrics.dataTotalReplayDuration.Set(totalReplayDuration.Seconds())
level.Info(h.logger).Log(
"msg", "WAL replay completed",
"checkpoint_replay_duration", checkpointReplayDuration.String(),
"wal_replay_duration", walReplayDuration.String(),
"wbl_replay_duration", wblReplayDuration.String(),
"total_replay_duration", totalReplayDuration.String(),
)

return nil
}
func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries) (map[chunks.HeadSeriesRef][]*mmappedChunk, map[chunks.HeadSeriesRef][]*mmappedChunk, chunks.ChunkDiskMapperRef, error) {
mmappedChunks := map[chunks.HeadSeriesRef][]*mmappedChunk{}
oooMmappedChunks := map[chunks.HeadSeriesRef][]*mmappedChunk{}
var lastRef, secondLastRef chunks.ChunkDiskMapperRef
if err := h.chunkDiskMapper.IterateAllChunks(func(seriesRef chunks.HeadSeriesRef, chunkRef chunks.ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding) error {
secondLastRef = lastRef
lastRef = chunkRef
isOOO := chunkenc.IsOutOfOrderChunk(encoding)
if !isOOO && maxt < h.minValidTime.Load() {
return nil
}
// We ignore any chunk that doesn't have a valid encoding
if !chunkenc.IsValidEncoding(encoding) {
return nil
}
ms, ok := refSeries[seriesRef]
if isOOO {
if !ok {
oooMmappedChunks[seriesRef] = append(oooMmappedChunks[seriesRef], &mmappedChunk{
ref: chunkRef,
minTime: mint,
maxTime: maxt,
numSamples: numSamples,
})
return nil
}
h.metrics.chunks.Inc()
h.metrics.chunksCreated.Inc()
ms.oooMmappedChunks = append(ms.oooMmappedChunks, &mmappedChunk{
ref: chunkRef,
minTime: mint,
maxTime: maxt,
numSamples: numSamples,
})
return nil
}
if !ok {
slice := mmappedChunks[seriesRef]
if len(slice) > 0 && slice[len(slice)-1].maxTime >= mint {

@@ -677,45 +802,57 @@ func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries)
}
return nil
}); err != nil {
// secondLastRef because the lastRef caused an error.
return nil, nil, secondLastRef, errors.Wrap(err, "iterate on on-disk chunks")
}

return mmappedChunks, oooMmappedChunks, lastRef, nil
}
// removeCorruptedMmappedChunks attempts to delete the corrupted mmapped chunks and if it fails, it clears all the previously
// loaded mmapped chunks.
func (h *Head) removeCorruptedMmappedChunks(err error) (map[chunks.HeadSeriesRef][]*mmappedChunk, map[chunks.HeadSeriesRef][]*mmappedChunk, chunks.ChunkDiskMapperRef, error) {
// We never want to preserve the in-memory series from snapshots if we are repairing m-map chunks.
if err := h.resetInMemoryState(); err != nil {
return map[chunks.HeadSeriesRef][]*mmappedChunk{}, map[chunks.HeadSeriesRef][]*mmappedChunk{}, 0, err
}

level.Info(h.logger).Log("msg", "Deleting mmapped chunk files")
if err := h.chunkDiskMapper.DeleteCorrupted(err); err != nil {
level.Info(h.logger).Log("msg", "Deletion of corrupted mmap chunk files failed, discarding chunk files completely", "err", err)
if err := h.chunkDiskMapper.Truncate(math.MaxUint32); err != nil {
level.Error(h.logger).Log("msg", "Deletion of all mmap chunk files failed", "err", err)
}
return map[chunks.HeadSeriesRef][]*mmappedChunk{}, map[chunks.HeadSeriesRef][]*mmappedChunk{}, 0, nil
}

level.Info(h.logger).Log("msg", "Deletion of mmap chunk files successful, reattempting m-mapping the on-disk chunks")
mmappedChunks, oooMmappedChunks, lastRef, err := h.loadMmappedChunks(make(map[chunks.HeadSeriesRef]*memSeries))
if err != nil {
level.Error(h.logger).Log("msg", "Loading on-disk chunks failed, discarding chunk files completely", "err", err)
if err := h.chunkDiskMapper.Truncate(math.MaxUint32); err != nil {
level.Error(h.logger).Log("msg", "Deletion of all mmap chunk files failed after failed loading", "err", err)
}
mmappedChunks = map[chunks.HeadSeriesRef][]*mmappedChunk{}
}

return mmappedChunks, oooMmappedChunks, lastRef, nil
}
func (h *Head) ApplyConfig(cfg *config.Config, wbl *wal.WAL) {
oooTimeWindow := int64(0)
if cfg.StorageConfig.TSDBConfig != nil {
oooTimeWindow = cfg.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
}
if oooTimeWindow < 0 {
oooTimeWindow = 0
}
h.SetOutOfOrderTimeWindow(oooTimeWindow, wbl)
if !h.opts.EnableExemplarStorage {
return
}

// Head uses opts.MaxExemplars in combination with opts.EnableExemplarStorage

@@ -726,12 +863,21 @@ func (h *Head) ApplyConfig(cfg *config.Config) error {
newSize := h.opts.MaxExemplars.Load()
if prevSize == newSize {
return
}

migrated := h.exemplars.(*CircularExemplarStorage).Resize(newSize)
level.Info(h.logger).Log("msg", "Exemplar storage resized", "from", prevSize, "to", newSize, "migrated", migrated)
}
// SetOutOfOrderTimeWindow updates the out of order related parameters.
// If the Head already has a WBL set, then the wbl will be ignored.
func (h *Head) SetOutOfOrderTimeWindow(oooTimeWindow int64, wbl *wal.WAL) {
if oooTimeWindow > 0 && h.wbl == nil {
h.wbl = wbl
}
h.opts.OutOfOrderTimeWindow.Store(oooTimeWindow)
}
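A sketch of the two transitions DB.ApplyConfig drives through this method (values in milliseconds). Standalone NewHead usage is shown for illustration only and is not how Prometheus wires this up; a nil WBL is legal but means OOO data would not survive a crash:

	package main

	import (
		"log"

		"github.com/prometheus/prometheus/tsdb"
	)

	func main() {
		opts := tsdb.DefaultHeadOptions()
		opts.ChunkDirRoot = "data"
		h, err := tsdb.NewHead(nil, nil, nil, nil, opts, nil)
		if err != nil {
			log.Fatal(err)
		}
		defer h.Close()

		// Enable a 5-minute window. The Head adopts the passed WBL only if it
		// does not already have one.
		h.SetOutOfOrderTimeWindow(5*60*1000, nil)

		// Disable again: new OOO appends are rejected, an existing WBL is kept
		// so already-recorded OOO data still gets compacted and cleaned up.
		h.SetOutOfOrderTimeWindow(0, nil)
	}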
// PostingsCardinalityStats returns top 10 highest cardinality stats by label and value names.

@@ -773,6 +919,27 @@ func (h *Head) updateMinMaxTime(mint, maxt int64) {
}
}
func (h *Head) updateMinOOOMaxOOOTime(mint, maxt int64) {
for {
lt := h.MinOOOTime()
if mint >= lt {
break
}
if h.minOOOTime.CompareAndSwap(lt, mint) {
break
}
}
for {
ht := h.MaxOOOTime()
if maxt <= ht {
break
}
if h.maxOOOTime.CompareAndSwap(ht, maxt) {
break
}
}
}
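These are standard lock-free min/max CAS loops; a self-contained demonstration of the same pattern with the go.uber.org/atomic API used here:

	package main

	import (
		"fmt"
		"math"
		"sync"

		"go.uber.org/atomic"
	)

	// updateMin is the same CAS loop as updateMinOOOMaxOOOTime's first half:
	// safe to call from many appenders concurrently without the Head lock.
	func updateMin(min *atomic.Int64, candidate int64) {
		for {
			cur := min.Load()
			if candidate >= cur {
				return
			}
			if min.CompareAndSwap(cur, candidate) {
				return
			}
		}
	}

	func main() {
		min := atomic.NewInt64(math.MaxInt64)
		var wg sync.WaitGroup
		for _, ts := range []int64{500, 100, 300} {
			wg.Add(1)
			go func(ts int64) { defer wg.Done(); updateMin(min, ts) }(ts)
		}
		wg.Wait()
		fmt.Println(min.Load()) // 100
	}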
// SetMinValidTime sets the minimum timestamp the head can ingest.
func (h *Head) SetMinValidTime(minValidTime int64) {
h.minValidTime.Store(minValidTime)

@@ -838,30 +1005,7 @@ func (h *Head) truncateMemory(mint int64) (err error) {
}
h.metrics.headTruncateTotal.Inc()
return h.truncateSeriesAndChunkDiskMapper("truncateMemory")
}
// WaitForPendingReadersInTimeRange waits for queries overlapping with given range to finish querying.

@@ -950,7 +1094,7 @@ func (h *Head) truncateWAL(mint int64) error {
}
// Start a new segment, so low ingestion volume TSDB don't have more WAL than
// needed.
if _, err := h.wal.NextSegment(); err != nil {
return errors.Wrap(err, "next segment")
}
last-- // Never consider last segment for checkpoint.

@@ -1016,6 +1160,59 @@ func (h *Head) truncateWAL(mint int64) error {
return nil
}
// truncateOOO
// - truncates the OOO WBL files whose index is strictly less than lastWBLFile.
// - garbage collects all the m-map chunks from the memory that are less than or equal to minOOOMmapRef
// and then deletes the series that do not have any data anymore.
func (h *Head) truncateOOO(lastWBLFile int, minOOOMmapRef chunks.ChunkDiskMapperRef) error {
curMinOOOMmapRef := chunks.ChunkDiskMapperRef(h.minOOOMmapRef.Load())
if minOOOMmapRef.GreaterThan(curMinOOOMmapRef) {
h.minOOOMmapRef.Store(uint64(minOOOMmapRef))
if err := h.truncateSeriesAndChunkDiskMapper("truncateOOO"); err != nil {
return err
}
}
return h.wbl.Truncate(lastWBLFile)
}
// truncateSeriesAndChunkDiskMapper is a helper function for truncateMemory and truncateOOO.
// It runs GC on the Head and truncates the ChunkDiskMapper accordingly.
func (h *Head) truncateSeriesAndChunkDiskMapper(caller string) error {
start := time.Now()
headMaxt := h.MaxTime()
actualMint, minOOOTime, minMmapFile := h.gc()
level.Info(h.logger).Log("msg", "Head GC completed", "caller", caller, "duration", time.Since(start))
h.metrics.gcDuration.Observe(time.Since(start).Seconds())
if actualMint > h.minTime.Load() {
// The actual mint of the head is higher than the one asked to truncate.
appendableMinValidTime := h.appendableMinValidTime()
if actualMint < appendableMinValidTime {
h.minTime.Store(actualMint)
h.minValidTime.Store(actualMint)
} else {
// The actual min time is in the appendable window.
// So we set the mint to the appendableMinValidTime.
h.minTime.Store(appendableMinValidTime)
h.minValidTime.Store(appendableMinValidTime)
}
}
if headMaxt-h.opts.OutOfOrderTimeWindow.Load() < minOOOTime {
// The allowed OOO window is lower than the min OOO time seen during GC.
// So it is possible that some OOO sample was inserted that was less than minOOOTime.
// So we play safe and set it to the min that was possible.
minOOOTime = headMaxt - h.opts.OutOfOrderTimeWindow.Load()
}
h.minOOOTime.Store(minOOOTime)
// Truncate the chunk m-mapper.
if err := h.chunkDiskMapper.Truncate(uint32(minMmapFile)); err != nil {
return errors.Wrap(err, "truncate chunks.HeadReadWriter by file number")
}
return nil
}
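A worked example of that clamp, with made-up numbers:

	package main

	import "fmt"

	func main() {
		// Mirrors the clamp in truncateSeriesAndChunkDiskMapper: any timestamp
		// in [headMaxt-window, headMaxt] could have been accepted as an OOO
		// sample before GC ran, so minOOOTime may not sit above that bound.
		var (
			headMaxt      int64 = 1_000_000 // max time of the head, ms
			oooTimeWindow int64 = 300_000   // allowed OOO window, ms
			minOOOTime    int64 = 800_000   // min OOO time seen during GC
		)
		if headMaxt-oooTimeWindow < minOOOTime {
			minOOOTime = headMaxt - oooTimeWindow
		}
		fmt.Println(minOOOTime) // 700000: clamped down to headMaxt-oooTimeWindow
	}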
type Stats struct {
NumSeries uint64
MinTime, MaxTime int64

@@ -1149,14 +1346,20 @@ func (h *Head) Delete(mint, maxt int64, ms ...*labels.Matcher) error {
}

// gc removes data before the minimum timestamp from the head.
// It returns
// * The actual min times of the chunks present in the Head.
// * The min OOO time seen during the GC.
// * Min mmap file number seen in the series (in-order and out-of-order) after gc'ing the series.
func (h *Head) gc() (actualInOrderMint, minOOOTime int64, minMmapFile int) {
// Only data strictly lower than this timestamp must be deleted.
mint := h.MinTime()
// Only ooo m-map chunks strictly lower than or equal to this ref
// must be deleted.
minOOOMmapRef := chunks.ChunkDiskMapperRef(h.minOOOMmapRef.Load())
// Drop old chunks and remember series IDs and hashes if they can be
// deleted entirely.
deleted, chunksRemoved, actualInOrderMint, minOOOTime, minMmapFile := h.series.gc(mint, minOOOMmapRef)
seriesRemoved := len(deleted)

h.metrics.seriesRemoved.Add(float64(seriesRemoved))

@@ -1186,7 +1389,7 @@ func (h *Head) gc() int64 {
h.deletedMtx.Unlock()
}

return actualInOrderMint, minOOOTime, minMmapFile
}
// Tombstones returns a new reader over the head's tombstones

@@ -1224,6 +1427,18 @@ func (h *Head) MaxTime() int64 {
return h.maxTime.Load()
}
// MinOOOTime returns the lowest time bound on visible data in the out of order
// head.
func (h *Head) MinOOOTime() int64 {
return h.minOOOTime.Load()
}
// MaxOOOTime returns the highest timestamp on visible data in the out of order
// head.
func (h *Head) MaxOOOTime() int64 {
return h.maxOOOTime.Load()
}
// compactable returns whether the head has a compactable range.
// The head has a compactable range when the head time range is 1.5 times the chunk range.
// The 0.5 acts as a buffer of the appendable window.

@@ -1241,6 +1456,9 @@ func (h *Head) Close() error {
if h.wal != nil {
errs.Add(h.wal.Close())
}
if h.wbl != nil {
errs.Add(h.wbl.Close())
}
if errs.Err() == nil && h.opts.EnableMemorySnapshotOnShutdown {
errs.Add(h.performChunkSnapshot())
}
@@ -1271,7 +1489,7 @@ func (h *Head) getOrCreate(hash uint64, lset labels.Labels) (*memSeries, bool, e
func (h *Head) getOrCreateWithID(id chunks.HeadSeriesRef, hash uint64, lset labels.Labels) (*memSeries, bool, error) {
s, created, err := h.series.getOrSet(hash, lset, func() *memSeries {
return newMemSeries(lset, id, h.chunkRange.Load(), h.opts.OutOfOrderCapMax.Load(), h.opts.IsolationDisabled)
})
if err != nil {
return nil, false, err

@@ -1333,7 +1551,7 @@ const (
)

// stripeSeries holds series by HeadSeriesRef ("ID") and also by hash of their labels.
// ID-based lookups via getByID() are preferred over getByHash() for performance reasons.
// It locks modulo ranges of IDs and hashes to reduce lock contention.
// The locks are padded to not be on the same cache line. Filling the padded space
// with the maps was profiled to be slower likely due to the additional pointer
@@ -1375,13 +1593,16 @@ func newStripeSeries(stripeSize int, seriesCallback SeriesLifecycleCallback) *st
// note: returning map[chunks.HeadSeriesRef]struct{} would be more accurate,
// but the returned map goes into postings.Delete() which expects a map[storage.SeriesRef]struct
// and there's no easy way to cast maps.
// minMmapFile is the min mmap file number seen in the series (in-order and out-of-order) after gc'ing the series.
func (s *stripeSeries) gc(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) (_ map[storage.SeriesRef]struct{}, _ int, _, _ int64, minMmapFile int) {
var (
deleted = map[storage.SeriesRef]struct{}{}
deletedForCallback = []labels.Labels{}
rmChunks = 0
actualMint int64 = math.MaxInt64
minOOOTime int64 = math.MaxInt64
)
minMmapFile = math.MaxInt32
// Run through all series and truncate old chunks. Mark those with no
// chunks left as deleted and store their ID.
for i := 0; i < s.size; i++ {

@@ -1390,9 +1611,32 @@ func (s *stripeSeries) gc(mint int64) (map[storage.SeriesRef]struct{}, int, int6
for hash, all := range s.hashes[i] {
for _, series := range all {
series.Lock()
rmChunks += series.truncateChunksBefore(mint, minOOOMmapRef)

if len(series.mmappedChunks) > 0 {
seq, _ := series.mmappedChunks[0].ref.Unpack()
if seq < minMmapFile {
minMmapFile = seq
}
}
if len(series.oooMmappedChunks) > 0 {
seq, _ := series.oooMmappedChunks[0].ref.Unpack()
if seq < minMmapFile {
minMmapFile = seq
}
for _, ch := range series.oooMmappedChunks {
if ch.minTime < minOOOTime {
minOOOTime = ch.minTime
}
}
}
if series.oooHeadChunk != nil {
if series.oooHeadChunk.minTime < minOOOTime {
minOOOTime = series.oooHeadChunk.minTime
}
}
if len(series.mmappedChunks) > 0 || len(series.oooMmappedChunks) > 0 ||
series.headChunk != nil || series.oooHeadChunk != nil || series.pendingCommit {
seriesMint := series.minTime()
if seriesMint < actualMint {
actualMint = seriesMint

@@ -1435,7 +1679,7 @@ func (s *stripeSeries) gc(mint int64) (map[storage.SeriesRef]struct{}, int, int6
actualMint = mint
}

return deleted, rmChunks, actualMint, minOOOTime, minMmapFile
}
func (s *stripeSeries) getByID(id chunks.HeadSeriesRef) *memSeries {

@@ -1528,11 +1772,16 @@ type memSeries struct {
//
// pN is the pointer to the mmappedChunk referred to by HeadChunkID=N
mmappedChunks []*mmappedChunk

oooMmappedChunks []*mmappedChunk // Immutable chunks on disk containing OOO samples.
oooHeadChunk *oooHeadChunk // Most recent chunk for ooo samples in memory that's still being built.
firstOOOChunkID chunks.HeadChunkID // HeadOOOChunkID for oooMmappedChunks[0]

headChunk *memChunk // Most recent chunk in memory that's still being built.
firstChunkID chunks.HeadChunkID // HeadChunkID for mmappedChunks[0]
mmMaxTime int64 // Max time of any mmapped chunk, only used during WAL replay.
chunkRange int64
oooCapMax uint8

nextAt int64 // Timestamp at which to cut the next chunk.
@@ -1551,12 +1800,13 @@ type memSeries struct {
pendingCommit bool // Whether there are samples waiting to be committed to this series.
}

func newMemSeries(lset labels.Labels, id chunks.HeadSeriesRef, chunkRange, oooCapMax int64, isolationDisabled bool) *memSeries {
s := &memSeries{
lset: lset,
ref: id,
chunkRange: chunkRange,
nextAt: math.MinInt64,
oooCapMax: uint8(oooCapMax),
}
if !isolationDisabled {
s.txs = newTxRing(4)
@@ -1575,6 +1825,7 @@ func (s *memSeries) minTime() int64 {
}

func (s *memSeries) maxTime() int64 {
// The highest timestamps will always be in the regular (non-OOO) chunks, even if OOO is enabled.
c := s.head()
if c != nil {
return c.maxTime
@@ -1588,26 +1839,39 @@ func (s *memSeries) maxTime() int64 {
// truncateChunksBefore removes all chunks from the series that
// have no timestamp at or after mint.
// Chunk IDs remain unchanged.
func (s *memSeries) truncateChunksBefore(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) int {
var removedInOrder int
if s.headChunk != nil && s.headChunk.maxTime < mint {
// If head chunk is truncated, we can truncate all mmapped chunks.
removedInOrder = 1 + len(s.mmappedChunks)
s.firstChunkID += chunks.HeadChunkID(removedInOrder)
s.headChunk = nil
s.mmappedChunks = nil
}
if len(s.mmappedChunks) > 0 {
for i, c := range s.mmappedChunks {
if c.maxTime >= mint {
break
}
removedInOrder = i + 1
}
s.mmappedChunks = append(s.mmappedChunks[:0], s.mmappedChunks[removedInOrder:]...)
s.firstChunkID += chunks.HeadChunkID(removedInOrder)
}
var removedOOO int
if len(s.oooMmappedChunks) > 0 {
for i, c := range s.oooMmappedChunks {
if c.ref.GreaterThan(minOOOMmapRef) {
break
}
removedOOO = i + 1
}
s.oooMmappedChunks = append(s.oooMmappedChunks[:0], s.oooMmappedChunks[removedOOO:]...)
s.firstOOOChunkID += chunks.HeadChunkID(removedOOO)
}
return removedInOrder + removedOOO
}
// cleanupAppendIDsBelow cleans up older appendIDs. Has to be called after

@@ -1627,6 +1891,16 @@ type memChunk struct {
minTime, maxTime int64
}
type oooHeadChunk struct {
chunk *OOOChunk
minTime, maxTime int64 // can probably be removed and pulled out of the chunk instead
}
// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
func (mc *oooHeadChunk) OverlapsClosedInterval(mint, maxt int64) bool {
return overlapsClosedInterval(mc.minTime, mc.maxTime, mint, maxt)
}
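For reference, the closed-interval overlap test used here reduces to two comparisons. The helper below is a plausible sketch of the package's overlapsClosedInterval (defined elsewhere in this diff's context), not a quotation of it:

	package main

	import "fmt"

	// overlapsClosedInterval treats both ranges as closed on both ends: they
	// overlap iff each one starts no later than the other one ends.
	func overlapsClosedInterval(mint1, maxt1, mint2, maxt2 int64) bool {
		return mint1 <= maxt2 && mint2 <= maxt1
	}

	func main() {
		fmt.Println(overlapsClosedInterval(0, 10, 10, 20)) // true: touching endpoints overlap
		fmt.Println(overlapsClosedInterval(0, 10, 11, 20)) // false
	}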
// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
func (mc *memChunk) OverlapsClosedInterval(mint, maxt int64) bool {
return overlapsClosedInterval(mc.minTime, mc.maxTime, mint, maxt)

@@ -1655,12 +1929,15 @@ func (noopSeriesLifecycleCallback) PostCreation(labels.Labels) {}
func (noopSeriesLifecycleCallback) PostDeletion(...labels.Labels) {}

func (h *Head) Size() int64 {
var walSize, wblSize int64
if h.wal != nil {
walSize, _ = h.wal.Size()
}
if h.wbl != nil {
wblSize, _ = h.wbl.Size()
}
cdmSize, _ := h.chunkDiskMapper.Size()
return walSize + wblSize + cdmSize
}

func (h *RangeHead) Size() int64 {

@@ -137,6 +137,8 @@ func (h *Head) appender() *headAppender {
minValidTime: h.appendableMinValidTime(),
mint: math.MaxInt64,
maxt: math.MinInt64,
headMaxt: h.MaxTime(),
oooTimeWindow: h.opts.OutOfOrderTimeWindow.Load(),
samples: h.getAppendBuffer(),
sampleSeries: h.getSeriesBuffer(),
exemplars: exemplarsBuf,
@@ -252,9 +254,11 @@ type exemplarWithSeriesRef struct {
}

type headAppender struct {
head *Head
minValidTime int64 // No samples below this timestamp are allowed.
mint, maxt int64
headMaxt int64 // We track it here to not take the lock for every sample appended.
oooTimeWindow int64 // Use the same for the entire append, and don't load the atomic for each sample.
series []record.RefSeries // New series held by this appender.
metadata []record.RefMetadata // New metadata held by this appender.
@@ -268,7 +272,9 @@ type headAppender struct {
}

func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64, v float64) (storage.SeriesRef, error) {
// For OOO inserts, this restriction is irrelevant and will be checked later once we confirm the sample is an in-order append.
// If OOO inserts are disabled, we may as well check this as early as we can and avoid more work.
if a.oooTimeWindow == 0 && t < a.minValidTime {
a.head.metrics.outOfBoundSamples.Inc()
return 0, storage.ErrOutOfBounds
}
@ -300,15 +306,25 @@ func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64
}
s.Lock()
// TODO(codesome): If we definitely know at this point that the sample is ooo, then optimise
// to skip that sample from the WAL and write only in the WBL.
_, delta, err := s.appendable(t, v, a.headMaxt, a.minValidTime, a.oooTimeWindow)
if err == nil {
s.pendingCommit = true
}
s.Unlock()
if delta > 0 {
a.head.metrics.oooHistogram.Observe(float64(delta))
}
if err != nil {
switch err {
case storage.ErrOutOfOrderSample:
a.head.metrics.outOfOrderSamples.Inc()
case storage.ErrTooOldSample:
a.head.metrics.tooOldSamples.Inc()
}
return 0, err
}
if t < a.mint {
a.mint = t
@ -326,25 +342,46 @@ func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64
return storage.SeriesRef(s.ref), nil
}
// appendable checks whether the given sample can be appended to the series.
// It returns false with no error if the sample is a valid in-order append.
// The sample belongs to the out-of-order chunk if we return true and no error.
// An error signifies the sample cannot be handled.
func (s *memSeries) appendable(t int64, v float64, headMaxt, minValidTime, oooTimeWindow int64) (isOOO bool, oooDelta int64, err error) {
// Check if we can append in the in-order chunk.
if t >= minValidTime {
if s.head() == nil {
// The series has no sample and was freshly created.
return false, 0, nil
}
msMaxt := s.maxTime()
if t > msMaxt {
return false, 0, nil
}
if t == msMaxt {
// We are allowing exact duplicates as we can encounter them in valid cases
// like federation and erroring out at that time would be extremely noisy.
// This only checks against the latest in-order sample.
// The OOO headchunk has its own method to detect these duplicates.
if math.Float64bits(s.sampleBuf[3].v) != math.Float64bits(v) {
return false, 0, storage.ErrDuplicateSampleForTimestamp
}
// Sample is identical (ts + value) with the most current (highest ts) sample in sampleBuf.
return false, 0, nil
}
}
// The sample cannot go in the in-order chunk. Check if it can go in the out-of-order chunk.
if oooTimeWindow > 0 && t >= headMaxt-oooTimeWindow {
return true, headMaxt - t, nil
}
// The sample fits neither the in-order nor the out-of-order chunk.
if oooTimeWindow > 0 {
return true, headMaxt - t, storage.ErrTooOldSample
}
if t < minValidTime {
return false, headMaxt - t, storage.ErrOutOfBounds
}
return false, headMaxt - t, storage.ErrOutOfOrderSample
}
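For a feel of the decision order above, here is a small standalone sketch. It is illustrative only: it folds the duplicate-timestamp handling into the in-order case, uses made-up millisecond values, and returns a label instead of the real (isOOO, delta, err) triple.

package main

import "fmt"

// classify mirrors the decision order of memSeries.appendable, minus the
// duplicate-value check: in-order first, then the OOO window, then errors.
func classify(t, seriesMaxt, headMaxt, minValidTime, oooTimeWindow int64) string {
	if t >= minValidTime && t >= seriesMaxt {
		return "in-order append"
	}
	if oooTimeWindow > 0 && t >= headMaxt-oooTimeWindow {
		return "out-of-order insert"
	}
	if oooTimeWindow > 0 {
		return "storage.ErrTooOldSample"
	}
	if t < minValidTime {
		return "storage.ErrOutOfBounds"
	}
	return "storage.ErrOutOfOrderSample"
}

func main() {
	// Assumed values: the series' latest sample sits at the head max time.
	const headMaxt, minValidTime, window = 100_000, 60_000, 30_000
	for _, t := range []int64{120_000, 90_000, 75_000, 50_000} {
		fmt.Printf("t=%d -> %s\n", t, classify(t, headMaxt, headMaxt, minValidTime, window))
	}
	// t=120000 -> in-order append
	// t=90000  -> out-of-order insert
	// t=75000  -> out-of-order insert (still >= headMaxt-window)
	// t=50000  -> storage.ErrTooOldSample
}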
// AppendExemplar for headAppender assumes the series ref already exists, and so it doesn't
@ -487,6 +524,7 @@ func exemplarsForEncoding(es []exemplarWithSeriesRef) []record.RefExemplar {
}
// Commit writes to the WAL and adds the data to the Head.
// TODO(codesome): Refactor this method to reduce indentation and make it more readable.
func (a *headAppender) Commit() (err error) {
if a.closed {
return ErrAppenderClosed
@ -517,24 +555,143 @@ func (a *headAppender) Commit() (err error) {
defer a.head.putMetadataBuffer(a.metadata)
defer a.head.iso.closeAppend(a.appendID)
var (
samplesAppended = len(a.samples)
oooAccepted int // Number of samples out of order but accepted: OOO enabled and within the time window.
oooRejected int // Number of samples rejected due to being out of order when OOO support is disabled.
tooOldRejected int // Number of samples rejected due to being out of order but too old: OOO enabled, but outside the time window.
oobRejected int // Number of samples rejected due to being out of bounds: t < minValidTime (OOO support disabled).
inOrderMint int64 = math.MaxInt64
inOrderMaxt int64 = math.MinInt64
ooomint int64 = math.MaxInt64
ooomaxt int64 = math.MinInt64
wblSamples []record.RefSample
oooMmapMarkers map[chunks.HeadSeriesRef]chunks.ChunkDiskMapperRef
oooRecords [][]byte
series *memSeries
enc record.Encoder
)
defer func() {
for i := range oooRecords {
a.head.putBytesBuffer(oooRecords[i][:0])
}
}()
collectOOORecords := func() {
if a.head.wbl == nil {
// WBL is not enabled. So no need to collect.
wblSamples = nil
oooMmapMarkers = nil
return
}
// The m-map happens before adding a new sample. So we collect
// the m-map markers first, and then samples.
// WBL Graphically:
// WBL Before this Commit(): [old samples before this commit for chunk 1]
// WBL After this Commit(): [old samples before this commit for chunk 1][new samples in this commit for chunk 1]mmapmarker1[samples for chunk 2]mmapmarker2[samples for chunk 3]
if oooMmapMarkers != nil {
markers := make([]record.RefMmapMarker, 0, len(oooMmapMarkers))
for ref, mmapRef := range oooMmapMarkers {
markers = append(markers, record.RefMmapMarker{
Ref: ref,
MmapRef: mmapRef,
})
}
r := enc.MmapMarkers(markers, a.head.getBytesBuffer())
oooRecords = append(oooRecords, r)
}
if len(wblSamples) > 0 {
r := enc.Samples(wblSamples, a.head.getBytesBuffer())
oooRecords = append(oooRecords, r)
}
wblSamples = nil
oooMmapMarkers = nil
}
for i, s := range a.samples {
series = a.sampleSeries[i]
series.Lock()
oooSample, _, err := series.appendable(s.T, s.V, a.headMaxt, a.minValidTime, a.oooTimeWindow)
switch err {
case storage.ErrOutOfOrderSample:
samplesAppended--
oooRejected++
case storage.ErrOutOfBounds:
samplesAppended--
oobRejected++
case storage.ErrTooOldSample:
samplesAppended--
tooOldRejected++
case nil:
// Do nothing.
default:
samplesAppended--
}
var ok, chunkCreated bool
if err == nil && oooSample {
// Sample is OOO and OOO handling is enabled
// and the delta is within the OOO tolerance.
var mmapRef chunks.ChunkDiskMapperRef
ok, chunkCreated, mmapRef = series.insert(s.T, s.V, a.head.chunkDiskMapper)
if chunkCreated {
r, ok := oooMmapMarkers[series.ref]
if !ok || r != 0 {
// !ok means there are no markers collected for these samples yet. So we first flush the samples
// before setting this m-map marker.
// r != 0 means we have already m-mapped a chunk for this series in the same Commit().
// Hence, before we m-map again, we should add the samples and m-map markers
// seen till now to the WBL records.
collectOOORecords()
}
if oooMmapMarkers == nil {
oooMmapMarkers = make(map[chunks.HeadSeriesRef]chunks.ChunkDiskMapperRef)
}
oooMmapMarkers[series.ref] = mmapRef
}
if ok {
wblSamples = append(wblSamples, s)
if s.T < ooomint {
ooomint = s.T
}
if s.T > ooomaxt {
ooomaxt = s.T
}
oooAccepted++
} else {
// Sample is an exact duplicate of the last sample.
// NOTE: We can only detect updates if they clash with a sample in the OOOHeadChunk,
// not with samples in already flushed OOO chunks.
// TODO(codesome): Add error reporting? It depends on addressing https://github.com/prometheus/prometheus/discussions/10305.
samplesAppended--
}
} else if err == nil {
ok, chunkCreated = series.append(s.T, s.V, a.appendID, a.head.chunkDiskMapper)
if ok {
if s.T < inOrderMint {
inOrderMint = s.T
}
if s.T > inOrderMaxt {
inOrderMaxt = s.T
}
} else {
// The sample is an exact duplicate, and should be silently dropped.
samplesAppended--
}
}
if chunkCreated {
a.head.metrics.chunks.Inc()
a.head.metrics.chunksCreated.Inc()
}
series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
series.pendingCommit = false
series.Unlock()
}
for i, m := range a.metadata {
@ -544,12 +701,48 @@ func (a *headAppender) Commit() (err error) {
series.Unlock()
}
a.head.metrics.outOfOrderSamples.Add(float64(oooRejected))
a.head.metrics.outOfBoundSamples.Add(float64(oobRejected))
a.head.metrics.tooOldSamples.Add(float64(tooOldRejected))
a.head.metrics.samplesAppended.Add(float64(samplesAppended))
a.head.metrics.outOfOrderSamplesAppended.Add(float64(oooAccepted))
a.head.updateMinMaxTime(inOrderMint, inOrderMaxt)
a.head.updateMinOOOMaxOOOTime(ooomint, ooomaxt)
collectOOORecords()
if a.head.wbl != nil {
if err := a.head.wbl.Log(oooRecords...); err != nil {
// TODO(codesome): Currently WBL logging of ooo samples is best effort here since we cannot try logging
// until we have found what samples become OOO. We can try having a metric for this failure.
// Returning the error here is not correct because we have already put the samples into memory,
// hence the append/insert was a success.
level.Error(a.head.logger).Log("msg", "Failed to log out of order samples into the WBL", "err", err)
}
}
return nil
}
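As a concrete (hypothetical) illustration of the ordering that collectOOORecords enforces above: suppose a single Commit() appends OOO samples s1, s2, s3 to one series, and inserting s3 fills the current OOO head chunk so a new one is cut. The WBL then receives a samples record holding s1 and s2 (flushed before the marker is set), an m-map marker record for the chunk that was just m-mapped, and finally a samples record holding s3. This preserves the invariant sketched in the "WBL Graphically" comment: a marker always follows the samples of the chunk it closes.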
// insert is like append, except it inserts the sample into the out-of-order head chunk. Used for OOO samples.
func (s *memSeries) insert(t int64, v float64, chunkDiskMapper *chunks.ChunkDiskMapper) (inserted, chunkCreated bool, mmapRef chunks.ChunkDiskMapperRef) {
c := s.oooHeadChunk
if c == nil || c.chunk.NumSamples() == int(s.oooCapMax) {
// Note: If no new samples come in then we rely on compaction to clean up stale in-memory OOO chunks.
c, mmapRef = s.cutNewOOOHeadChunk(t, chunkDiskMapper)
chunkCreated = true
}
ok := c.chunk.Insert(t, v)
if ok {
if chunkCreated || t < c.minTime {
c.minTime = t
}
if chunkCreated || t > c.maxTime {
c.maxTime = t
}
}
return ok, chunkCreated, mmapRef
}
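To see how OutOfOrderCapMax drives the chunk cut above, here is a throwaway sketch of the cadence, assuming a cap of 30 (the value TestOOOMmapReplay further down configures): inserting 92 samples m-maps a full chunk three times and leaves two samples in the in-memory OOO head chunk.

package main

import "fmt"

func main() {
	const capMax = 30 // assumed OutOfOrderCapMax, as in TestOOOMmapReplay
	mmapped, inHead := 0, 0
	for i := 1; i <= 92; i++ {
		if inHead == capMax { // a full OOO head chunk is m-mapped before inserting
			mmapped++
			inHead = 0
		}
		inHead++
	}
	fmt.Printf("%d m-mapped OOO chunks, %d samples left in the OOO head chunk\n", mmapped, inHead)
	// Output: 3 m-mapped OOO chunks, 2 samples left in the OOO head chunk
}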
// append adds the sample (t, v) to the series. The caller also has to provide
// the appendID for isolation. (The appendID can be zero, which results in no
// isolation for this append.)
@ -567,7 +760,7 @@ func (s *memSeries) append(t int64, v float64, appendID uint64, chunkDiskMapper
// Out of order sample. Sample timestamp is already in the mmapped chunks, so ignore it.
return false, false
}
// There is no head chunk in this series yet, create the first chunk for the sample.
c = s.cutNewHeadChunk(t, chunkDiskMapper)
chunkCreated = true
}
@ -651,6 +844,36 @@ func (s *memSeries) cutNewHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDis
return s.headChunk
}
func (s *memSeries) cutNewOOOHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDiskMapper) (*oooHeadChunk, chunks.ChunkDiskMapperRef) {
ref := s.mmapCurrentOOOHeadChunk(chunkDiskMapper)
s.oooHeadChunk = &oooHeadChunk{
chunk: NewOOOChunk(),
minTime: mint,
maxTime: math.MinInt64,
}
return s.oooHeadChunk, ref
}
func (s *memSeries) mmapCurrentOOOHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper) chunks.ChunkDiskMapperRef {
if s.oooHeadChunk == nil {
// There is no head chunk, so nothing to m-map here.
return 0
}
xor, _ := s.oooHeadChunk.chunk.ToXOR() // Encode to XorChunk which is more compact and implements all of the needed functionality.
oooXor := &chunkenc.OOOXORChunk{XORChunk: xor}
chunkRef := chunkDiskMapper.WriteChunk(s.ref, s.oooHeadChunk.minTime, s.oooHeadChunk.maxTime, oooXor, handleChunkWriteError)
s.oooMmappedChunks = append(s.oooMmappedChunks, &mmappedChunk{
ref: chunkRef,
numSamples: uint16(xor.NumSamples()),
minTime: s.oooHeadChunk.minTime,
maxTime: s.oooHeadChunk.maxTime,
})
s.oooHeadChunk = nil
return chunkRef
}
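One subtlety worth noting in mmapCurrentOOOHeadChunk: the samples are serialized as a plain XOR chunk, but wrapped in chunkenc.OOOXORChunk before being handed to the chunk disk mapper. The wrapper changes the encoding recorded for the m-mapped chunk, which is presumably what lets replay tell OOO chunks apart from in-order ones and route them into oooMmappedChunks (note the IterateAllChunks callback in newTestHead further down now receives a chunkenc.Encoding).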
func (s *memSeries) mmapCurrentHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper) {
if s.headChunk == nil {
// There is no head chunk, so nothing to m-map here.


@ -30,7 +30,7 @@ func BenchmarkHeadStripeSeriesCreate(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
defer h.Close()
@ -45,7 +45,7 @@ func BenchmarkHeadStripeSeriesCreateParallel(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
defer h.Close()
@ -69,7 +69,7 @@ func BenchmarkHeadStripeSeriesCreate_PreCreationFailure(b *testing.B) {
// Mock the PreCreation() callback to fail on each series.
opts.SeriesCallback = failingSeriesLifecycleCallback{}
h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
defer h.Close()


@ -183,11 +183,20 @@ func (h *headIndexReader) Series(ref storage.SeriesRef, lbls *labels.Labels, chk
return nil
}
// headChunkID returns the HeadChunkID referred to by the given position.
// * 0 <= pos < len(s.mmappedChunks) refers to s.mmappedChunks[pos]
// * pos == len(s.mmappedChunks) refers to s.headChunk
func (s *memSeries) headChunkID(pos int) chunks.HeadChunkID {
return chunks.HeadChunkID(pos) + s.firstChunkID
}
// oooHeadChunkID returns the HeadChunkID referred to by the given position.
// * 0 <= pos < len(s.oooMmappedChunks) refers to s.oooMmappedChunks[pos]
// * pos == len(s.oooMmappedChunks) refers to s.oooHeadChunk
func (s *memSeries) oooHeadChunkID(pos int) chunks.HeadChunkID {
return chunks.HeadChunkID(pos) + s.firstOOOChunkID
}
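A quick worked illustration of the offset arithmetic above, with hypothetical values:

// Hypothetical: firstOOOChunkID = 5, len(s.oooMmappedChunks) = 3.
//   oooHeadChunkID(0) == 5 // s.oooMmappedChunks[0]
//   oooHeadChunkID(1) == 6 // s.oooMmappedChunks[1]
//   oooHeadChunkID(2) == 7 // s.oooMmappedChunks[2]
//   oooHeadChunkID(3) == 8 // s.oooHeadChunk (the in-memory one)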
// LabelValueFor returns label value for the given label name in the series referred to by ID.
func (h *headIndexReader) LabelValueFor(id storage.SeriesRef, label string) (string, error) {
memSeries := h.head.series.getByID(chunks.HeadSeriesRef(id))
@ -258,8 +267,8 @@ func (h *headChunkReader) Close() error {
}
// Chunk returns the chunk for the given chunk meta.
func (h *headChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
sid, cid := chunks.HeadChunkRef(meta.Ref).Unpack()
s := h.head.series.getByID(sid)
// This means that the series has been garbage collected.
@ -330,6 +339,260 @@ func (s *memSeries) chunk(id chunks.HeadChunkID, chunkDiskMapper *chunks.ChunkDi
return mc, true, nil
}
// oooMergedChunk returns the requested chunk based on the given chunks.Meta
// reference from memory or by m-mapping it from the disk. The returned chunk
// might be a merge of all the overlapping chunks, if any, amongst all the
// chunks in the OOOHead.
// This function is not thread safe unless the caller holds a lock.
func (s *memSeries) oooMergedChunk(meta chunks.Meta, cdm *chunks.ChunkDiskMapper, mint, maxt int64) (chunk *mergedOOOChunks, err error) {
_, cid := chunks.HeadChunkRef(meta.Ref).Unpack()
// ix represents the index of the chunk in the s.oooMmappedChunks slice. Chunk IDs are
// incremented by 1 each time a new chunk is created, hence (cid - firstOOOChunkID) gives the slice index.
// The max index for the s.oooMmappedChunks slice can be len(s.oooMmappedChunks)-1, hence if ix
// is len(s.oooMmappedChunks), it represents the next chunk, which is the ooo head chunk.
ix := int(cid) - int(s.firstOOOChunkID)
if ix < 0 || ix > len(s.oooMmappedChunks) {
return nil, storage.ErrNotFound
}
if ix == len(s.oooMmappedChunks) {
if s.oooHeadChunk == nil {
return nil, errors.New("invalid ooo head chunk")
}
}
// We create a temporary slice of chunk metas to hold the information of all
// possible chunks that may overlap with the requested chunk.
tmpChks := make([]chunkMetaAndChunkDiskMapperRef, 0, len(s.oooMmappedChunks))
oooHeadRef := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.oooMmappedChunks))))
if s.oooHeadChunk != nil && s.oooHeadChunk.OverlapsClosedInterval(mint, maxt) {
// We only want to append the head chunk if this chunk existed when
// Series() was called. This brings consistency in case new data
// is added in between Series() and Chunk() calls.
if oooHeadRef == meta.OOOLastRef {
tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{
meta: chunks.Meta{
// Ignoring samples added before and after the last known min and max time for this chunk.
MinTime: meta.OOOLastMinTime,
MaxTime: meta.OOOLastMaxTime,
Ref: oooHeadRef,
},
})
}
}
for i, c := range s.oooMmappedChunks {
chunkRef := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(i)))
// We can skip chunks that came in later than the last known OOOLastRef.
if chunkRef > meta.OOOLastRef {
break
}
if chunkRef == meta.OOOLastRef {
tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{
meta: chunks.Meta{
MinTime: meta.OOOLastMinTime,
MaxTime: meta.OOOLastMaxTime,
Ref: chunkRef,
},
ref: c.ref,
origMinT: c.minTime,
origMaxT: c.maxTime,
})
} else if c.OverlapsClosedInterval(mint, maxt) {
tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{
meta: chunks.Meta{
MinTime: c.minTime,
MaxTime: c.maxTime,
Ref: chunkRef,
},
ref: c.ref,
})
}
}
// Next we want to sort all the collected chunks by min time so we can find
// those that overlap and stop when we know the rest don't.
sort.Sort(byMinTimeAndMinRef(tmpChks))
mc := &mergedOOOChunks{}
absoluteMax := int64(math.MinInt64)
for _, c := range tmpChks {
if c.meta.Ref != meta.Ref && (len(mc.chunks) == 0 || c.meta.MinTime > absoluteMax) {
continue
}
if c.meta.Ref == oooHeadRef {
var xor *chunkenc.XORChunk
// If head chunk min and max time match the meta OOO markers
// that means that the chunk has not expanded so we can append
// it as it is.
if s.oooHeadChunk.minTime == meta.OOOLastMinTime && s.oooHeadChunk.maxTime == meta.OOOLastMaxTime {
xor, err = s.oooHeadChunk.chunk.ToXOR() // TODO(jesus.vazquez) (This is an optimization idea that has no priority and might not be that useful) See if we could use a copy of the underlying slice. That would leave the more expensive ToXOR() function only for the usecase where Bytes() is called.
} else {
// We need to remove samples that are outside of the markers
xor, err = s.oooHeadChunk.chunk.ToXORBetweenTimestamps(meta.OOOLastMinTime, meta.OOOLastMaxTime)
}
if err != nil {
return nil, errors.Wrap(err, "failed to convert ooo head chunk to xor chunk")
}
c.meta.Chunk = xor
} else {
chk, err := cdm.Chunk(c.ref)
if err != nil {
if _, ok := err.(*chunks.CorruptionErr); ok {
return nil, errors.Wrap(err, "invalid ooo mmapped chunk")
}
return nil, err
}
if c.meta.Ref == meta.OOOLastRef &&
(c.origMinT != meta.OOOLastMinTime || c.origMaxT != meta.OOOLastMaxTime) {
// The head chunk expanded and was memory-mapped, so now we need to
// wrap the chunk in one that doesn't allow iterating
// through samples outside the OOOLastMinT and OOOLastMaxT
// markers.
c.meta.Chunk = boundedChunk{chk, meta.OOOLastMinTime, meta.OOOLastMaxTime}
} else {
c.meta.Chunk = chk
}
}
mc.chunks = append(mc.chunks, c.meta)
if c.meta.MaxTime > absoluteMax {
absoluteMax = c.meta.MaxTime
}
}
return mc, nil
}
var _ chunkenc.Chunk = &mergedOOOChunks{}
// mergedOOOChunks holds the list of overlapping chunks. This struct satisfies
// chunkenc.Chunk.
type mergedOOOChunks struct {
chunks []chunks.Meta
}
// Bytes is a very expensive method because it iterates over all the chunks
// in the mergedOOOChunk and builds a new chunk with their samples.
func (o mergedOOOChunks) Bytes() []byte {
xc := chunkenc.NewXORChunk()
app, err := xc.Appender()
if err != nil {
panic(err)
}
it := o.Iterator(nil)
for it.Next() {
t, v := it.At()
app.Append(t, v)
}
return xc.Bytes()
}
func (o mergedOOOChunks) Encoding() chunkenc.Encoding {
return chunkenc.EncXOR
}
func (o mergedOOOChunks) Appender() (chunkenc.Appender, error) {
return nil, errors.New("can't append to mergedOOOChunks")
}
func (o mergedOOOChunks) Iterator(iterator chunkenc.Iterator) chunkenc.Iterator {
iterators := make([]chunkenc.Iterator, 0, len(o.chunks))
for _, c := range o.chunks {
iterators = append(iterators, c.Chunk.Iterator(nil))
}
return storage.NewChainSampleIterator(iterators)
}
func (o mergedOOOChunks) NumSamples() int {
samples := 0
for _, c := range o.chunks {
samples += c.Chunk.NumSamples()
}
return samples
}
func (o mergedOOOChunks) Compact() {}
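mergedOOOChunks delegates the actual merge to storage.NewChainSampleIterator, so the behaviour is easy to poke at in isolation. A minimal standalone sketch (made-up timestamps and values) that merges two overlapping XOR chunks the same way Iterator above does:

package main

import (
	"fmt"

	"github.com/prometheus/prometheus/storage"
	"github.com/prometheus/prometheus/tsdb/chunkenc"
)

// xorChunk builds a chunk with one sample per timestamp, value == timestamp.
func xorChunk(timestamps ...int64) chunkenc.Chunk {
	c := chunkenc.NewXORChunk()
	app, err := c.Appender()
	if err != nil {
		panic(err)
	}
	for _, t := range timestamps {
		app.Append(t, float64(t))
	}
	return c
}

func main() {
	a := xorChunk(1, 3, 5) // overlaps with b
	b := xorChunk(2, 4, 6)
	it := storage.NewChainSampleIterator([]chunkenc.Iterator{a.Iterator(nil), b.Iterator(nil)})
	for it.Next() {
		t, v := it.At()
		fmt.Println(t, v) // samples come out in timestamp order: 1..6
	}
}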
var _ chunkenc.Chunk = &boundedChunk{}
// boundedChunk is an implementation of chunkenc.Chunk that uses a
// boundedIterator, which only iterates through samples whose timestamps are
// >= minT and <= maxT.
type boundedChunk struct {
chunkenc.Chunk
minT int64
maxT int64
}
func (b boundedChunk) Bytes() []byte {
xor := chunkenc.NewXORChunk()
a, _ := xor.Appender()
it := b.Iterator(nil)
for it.Next() {
t, v := it.At()
a.Append(t, v)
}
return xor.Bytes()
}
func (b boundedChunk) Iterator(iterator chunkenc.Iterator) chunkenc.Iterator {
it := b.Chunk.Iterator(iterator)
if it == nil {
panic("iterator shouldn't be nil")
}
return boundedIterator{it, b.minT, b.maxT}
}
var _ chunkenc.Iterator = &boundedIterator{}
// boundedIterator is an implementation of Iterator that only iterates through
// samples whose timestamps are >= minT and <= maxT.
type boundedIterator struct {
chunkenc.Iterator
minT int64
maxT int64
}
// Next, the first time it is called, advances as many positions as necessary
// until it finds a sample within the bounds minT and maxT.
// If there are samples within bounds, it advances through them one by one.
// If there are no samples within bounds, it returns false.
func (b boundedIterator) Next() bool {
for b.Iterator.Next() {
t, _ := b.Iterator.At()
if t < b.minT {
continue
} else if t > b.maxT {
return false
}
return true
}
return false
}
func (b boundedIterator) Seek(t int64) bool {
if t < b.minT {
// We must seek at least up to b.minT if it is asked for something before that.
ok := b.Iterator.Seek(b.minT)
if !ok {
return false
}
t, _ := b.Iterator.At()
return t <= b.maxT
}
if t > b.maxT {
// We seek anyway so that the subsequent Next() calls will also return false.
b.Iterator.Seek(t)
return false
}
return b.Iterator.Seek(t)
}
// safeChunk makes sure that the chunk can be accessed without a race condition
type safeChunk struct {
chunkenc.Chunk
s *memSeries

tsdb/head_read_test.go (new file)

@ -0,0 +1,178 @@
// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"fmt"
"testing"
"github.com/stretchr/testify/require"
"github.com/prometheus/prometheus/tsdb/chunkenc"
)
func TestBoundedChunk(t *testing.T) {
tests := []struct {
name string
inputChunk chunkenc.Chunk
inputMinT int64
inputMaxT int64
initialSeek int64
seekIsASuccess bool
expSamples []sample
}{
{
name: "if there are no samples it returns nothing",
inputChunk: newTestChunk(0),
expSamples: nil,
},
{
name: "bounds represent a single sample",
inputChunk: newTestChunk(10),
expSamples: []sample{
{0, 0},
},
},
{
name: "if there are bounds set only samples within them are returned",
inputChunk: newTestChunk(10),
inputMinT: 1,
inputMaxT: 8,
expSamples: []sample{
{1, 1},
{2, 2},
{3, 3},
{4, 4},
{5, 5},
{6, 6},
{7, 7},
{8, 8},
},
},
{
name: "if bounds set and only maxt is less than actual maxt",
inputChunk: newTestChunk(10),
inputMinT: 0,
inputMaxT: 5,
expSamples: []sample{
{0, 0},
{1, 1},
{2, 2},
{3, 3},
{4, 4},
{5, 5},
},
},
{
name: "if bounds set and only mint is more than actual mint",
inputChunk: newTestChunk(10),
inputMinT: 5,
inputMaxT: 9,
expSamples: []sample{
{5, 5},
{6, 6},
{7, 7},
{8, 8},
{9, 9},
},
},
{
name: "if there are bounds set with seek before mint",
inputChunk: newTestChunk(10),
inputMinT: 3,
inputMaxT: 7,
initialSeek: 1,
seekIsASuccess: true,
expSamples: []sample{
{3, 3},
{4, 4},
{5, 5},
{6, 6},
{7, 7},
},
},
{
name: "if there are bounds set with seek between mint and maxt",
inputChunk: newTestChunk(10),
inputMinT: 3,
inputMaxT: 7,
initialSeek: 5,
seekIsASuccess: true,
expSamples: []sample{
{5, 5},
{6, 6},
{7, 7},
},
},
{
name: "if there are bounds set with seek after maxt",
inputChunk: newTestChunk(10),
inputMinT: 3,
inputMaxT: 7,
initialSeek: 8,
seekIsASuccess: false,
},
}
for _, tc := range tests {
t.Run(fmt.Sprintf("name=%s", tc.name), func(t *testing.T) {
chunk := boundedChunk{tc.inputChunk, tc.inputMinT, tc.inputMaxT}
// Testing Bytes()
expChunk := chunkenc.NewXORChunk()
if tc.inputChunk.NumSamples() > 0 {
app, err := expChunk.Appender()
require.NoError(t, err)
for ts := tc.inputMinT; ts <= tc.inputMaxT; ts++ {
app.Append(ts, float64(ts))
}
}
require.Equal(t, expChunk.Bytes(), chunk.Bytes())
var samples []sample
it := chunk.Iterator(nil)
if tc.initialSeek != 0 {
// Testing Seek()
ok := it.Seek(tc.initialSeek)
require.Equal(t, tc.seekIsASuccess, ok)
if ok {
t, v := it.At()
samples = append(samples, sample{t, v})
}
}
// Testing Next()
for it.Next() {
t, v := it.At()
samples = append(samples, sample{t, v})
}
// it.Next() should keep returning false.
for i := 0; i < 10; i++ {
require.False(t, it.Next())
}
require.Equal(t, tc.expSamples, samples)
})
}
}
func newTestChunk(numSamples int) chunkenc.Chunk {
xor := chunkenc.NewXORChunk()
a, _ := xor.Appender()
for i := 0; i < numSamples; i++ {
a.Append(int64(i), float64(i))
}
return xor
}


@ -49,7 +49,7 @@ import (
"github.com/prometheus/prometheus/tsdb/wal" "github.com/prometheus/prometheus/tsdb/wal"
) )
func newTestHead(t testing.TB, chunkRange int64, compressWAL bool) (*Head, *wal.WAL) { func newTestHead(t testing.TB, chunkRange int64, compressWAL, oooEnabled bool) (*Head, *wal.WAL) {
dir := t.TempDir() dir := t.TempDir()
wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compressWAL) wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compressWAL)
require.NoError(t, err) require.NoError(t, err)
@ -59,18 +59,23 @@ func newTestHead(t testing.TB, chunkRange int64, compressWAL bool) (*Head, *wal.
opts.ChunkDirRoot = dir
opts.EnableExemplarStorage = true
opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars)
if oooEnabled {
opts.OutOfOrderTimeWindow.Store(10 * time.Minute.Milliseconds())
}
h, err := NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, h.chunkDiskMapper.IterateAllChunks(func(_ chunks.HeadSeriesRef, _ chunks.ChunkDiskMapperRef, _, _ int64, _ uint16, _ chunkenc.Encoding) error {
return nil
}))
return h, wlog
}
func BenchmarkCreateSeries(b *testing.B) {
series := genSeries(b.N, 10, 0, 0)
h, _ := newTestHead(b, 10000, false, false)
defer func() {
require.NoError(b, h.Close())
}()
@ -224,7 +229,7 @@ func BenchmarkLoadWAL(b *testing.B) {
require.NoError(b, err)
for k := 0; k < c.batches*c.seriesPerBatch; k++ {
// Create one mmapped chunk per series, with one sample at the given time.
s := newMemSeries(labels.Labels{}, chunks.HeadSeriesRef(k)*101, c.mmappedChunkT, 1, defaultIsolationDisabled)
s.append(c.mmappedChunkT, 42, 0, chunkDiskMapper)
s.mmapCurrentHeadChunk(chunkDiskMapper)
}
@ -255,7 +260,7 @@ func BenchmarkLoadWAL(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = w.Dir()
h, err := NewHead(nil, nil, w, nil, opts, nil)
require.NoError(b, err)
h.Init(0)
}
@ -271,7 +276,7 @@ func BenchmarkLoadWAL(b *testing.B) {
// While appending the samples to the head it concurrently queries them from multiple go routines and verifies that the
// returned results are correct.
func TestHead_HighConcurrencyReadAndWrite(t *testing.T) {
head, _ := newTestHead(t, DefaultBlockDuration, false, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -487,7 +492,7 @@ func TestHead_ReadWAL(t *testing.T) {
},
}
head, w := newTestHead(t, 1000, compress, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -531,7 +536,7 @@ func TestHead_ReadWAL(t *testing.T) {
}
func TestHead_WALMultiRef(t *testing.T) {
head, w := newTestHead(t, 1000, false, false)
require.NoError(t, head.Init(0))
@ -572,7 +577,7 @@ func TestHead_WALMultiRef(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = w.Dir()
head, err = NewHead(nil, nil, w, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, head.Init(0))
defer func() {
@ -591,7 +596,7 @@ func TestHead_WALMultiRef(t *testing.T) {
}
func TestHead_ActiveAppenders(t *testing.T) {
head, _ := newTestHead(t, 1000, false, false)
defer head.Close()
require.NoError(t, head.Init(0))
@ -624,14 +629,14 @@ func TestHead_ActiveAppenders(t *testing.T) {
}
func TestHead_UnknownWALRecord(t *testing.T) {
head, w := newTestHead(t, 1000, false, false)
w.Log([]byte{255, 42})
require.NoError(t, head.Init(0))
require.NoError(t, head.Close())
}
func TestHead_Truncate(t *testing.T) {
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -733,7 +738,7 @@ func TestMemSeries_truncateChunks(t *testing.T) {
},
}
s := newMemSeries(labels.FromStrings("a", "b"), 1, 2000, 1, defaultIsolationDisabled)
for i := 0; i < 4000; i += 5 {
ok, _ := s.append(int64(i), float64(i), 0, chunkDiskMapper)
@ -752,7 +757,7 @@ func TestMemSeries_truncateChunks(t *testing.T) {
require.NotNil(t, chk)
require.NoError(t, err)
s.truncateChunksBefore(2000, 0)
require.Equal(t, int64(2000), s.mmappedChunks[0].minTime)
_, _, err = s.chunk(0, chunkDiskMapper, &memChunkPool)
@ -789,7 +794,7 @@ func TestHeadDeleteSeriesWithoutSamples(t *testing.T) {
{Ref: 50, T: 90, V: 1},
},
}
head, w := newTestHead(t, 1000, compress, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -857,7 +862,8 @@ func TestHeadDeleteSimple(t *testing.T) {
for _, compress := range []bool{false, true} {
t.Run(fmt.Sprintf("compress=%t", compress), func(t *testing.T) {
for _, c := range cases {
head, w := newTestHead(t, 1000, compress, false)
require.NoError(t, head.Init(0))
app := head.Appender(context.Background())
for _, smpl := range smplsAll {
@ -887,7 +893,7 @@ func TestHeadDeleteSimple(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = reloadedW.Dir()
reloadedHead, err := NewHead(nil, nil, reloadedW, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, reloadedHead.Init(0))
@ -937,7 +943,7 @@ func TestHeadDeleteSimple(t *testing.T) {
}
func TestDeleteUntilCurMax(t *testing.T) {
hb, _ := newTestHead(t, 1000000, false, false)
defer func() {
require.NoError(t, hb.Close())
}()
@ -990,7 +996,7 @@ func TestDeletedSamplesAndSeriesStillInWALAfterCheckpoint(t *testing.T) {
numSamples := 10000
// Enough samples to cause a checkpoint.
hb, w := newTestHead(t, int64(numSamples)*10, false, false)
for i := 0; i < numSamples; i++ {
app := hb.Appender(context.Background())
@ -1082,7 +1088,7 @@ func TestDelete_e2e(t *testing.T) {
seriesMap[labels.New(l...).String()] = []tsdbutil.Sample{}
}
hb, _ := newTestHead(t, 100000, false, false)
defer func() {
require.NoError(t, hb.Close())
}()
@ -1271,7 +1277,7 @@ func TestMemSeries_append(t *testing.T) {
require.NoError(t, chunkDiskMapper.Close())
}()
s := newMemSeries(labels.Labels{}, 1, 500, 1, defaultIsolationDisabled)
// Add first two samples at the very end of a chunk range and the next two
// on and after it.
@ -1325,7 +1331,7 @@ func TestMemSeries_append_atVariableRate(t *testing.T) {
require.NoError(t, chunkDiskMapper.Close())
})
s := newMemSeries(labels.Labels{}, 1, DefaultBlockDuration, 0, defaultIsolationDisabled)
// At this slow rate, we will fill the chunk in two block durations.
slowRate := (DefaultBlockDuration * 2) / samplesPerChunk
@ -1361,7 +1367,7 @@ func TestMemSeries_append_atVariableRate(t *testing.T) {
func TestGCChunkAccess(t *testing.T) {
// Put a chunk, select it. GC it and then access it.
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1398,22 +1404,22 @@ func TestGCChunkAccess(t *testing.T) {
cr, err := h.chunksRange(0, 1500, nil)
require.NoError(t, err)
_, err = cr.Chunk(chunks[0])
require.NoError(t, err)
_, err = cr.Chunk(chunks[1])
require.NoError(t, err)
require.NoError(t, h.Truncate(1500)) // Remove a chunk.
_, err = cr.Chunk(chunks[0])
require.Equal(t, storage.ErrNotFound, err)
_, err = cr.Chunk(chunks[1])
require.NoError(t, err)
}
func TestGCSeriesAccess(t *testing.T) {
// Put a series, select it. GC it and then access it.
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1450,23 +1456,23 @@ func TestGCSeriesAccess(t *testing.T) {
cr, err := h.chunksRange(0, 2000, nil)
require.NoError(t, err)
_, err = cr.Chunk(chunks[0])
require.NoError(t, err)
_, err = cr.Chunk(chunks[1])
require.NoError(t, err)
require.NoError(t, h.Truncate(2000)) // Remove the series.
require.Equal(t, (*memSeries)(nil), h.series.getByID(1))
_, err = cr.Chunk(chunks[0])
require.Equal(t, storage.ErrNotFound, err)
_, err = cr.Chunk(chunks[1])
require.Equal(t, storage.ErrNotFound, err)
}
func TestUncommittedSamplesNotLostOnTruncate(t *testing.T) {
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1496,7 +1502,7 @@ func TestUncommittedSamplesNotLostOnTruncate(t *testing.T) {
}
func TestRemoveSeriesAfterRollbackAndTruncate(t *testing.T) {
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1529,7 +1535,7 @@ func TestRemoveSeriesAfterRollbackAndTruncate(t *testing.T) {
func TestHead_LogRollback(t *testing.T) {
for _, compress := range []bool{false, true} {
t.Run(fmt.Sprintf("compress=%t", compress), func(t *testing.T) {
h, w := newTestHead(t, 1000, compress, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1606,7 +1612,7 @@ func TestWalRepair_DecodingError(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1
opts.ChunkDirRoot = w.Dir()
h, err := NewHead(nil, nil, w, nil, opts, nil)
require.NoError(t, err)
require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal))
initErr := h.Init(math.MinInt64)
@ -1660,7 +1666,8 @@ func TestHeadReadWriterRepair(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = chunkRange
opts.ChunkDirRoot = dir
opts.ChunkWriteQueueSize = 1 // We need to set this option so that we use the async queue. Upstream prometheus uses the queue directly.
h, err := NewHead(nil, nil, w, nil, opts, nil)
require.NoError(t, err)
require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.mmapChunkCorruptionTotal))
require.NoError(t, h.Init(math.MinInt64))
@ -1715,7 +1722,7 @@ func TestHeadReadWriterRepair(t *testing.T) {
}
func TestNewWalSegmentOnTruncate(t *testing.T) {
h, wlog := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1745,7 +1752,7 @@ func TestNewWalSegmentOnTruncate(t *testing.T) {
}
func TestAddDuplicateLabelName(t *testing.T) {
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -1828,7 +1835,7 @@ func TestMemSeriesIsolation(t *testing.T) {
}
// Test isolation without restart of Head.
hb, _ := newTestHead(t, 1000, false, false)
i := addSamples(hb)
testIsolation(hb, i)
@ -1890,7 +1897,7 @@ func TestMemSeriesIsolation(t *testing.T) {
require.NoError(t, hb.Close())
// Test isolation with restart of Head. This is to verify the num samples of chunks after m-map chunk replay.
hb, w := newTestHead(t, 1000, false, false)
i = addSamples(hb)
require.NoError(t, hb.Close())
@ -1899,7 +1906,7 @@ func TestMemSeriesIsolation(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = wlog.Dir()
hb, err = NewHead(nil, nil, wlog, nil, opts, nil)
defer func() { require.NoError(t, hb.Close()) }()
require.NoError(t, err)
require.NoError(t, hb.Init(0))
@ -1943,7 +1950,7 @@ func TestIsolationRollback(t *testing.T) {
}
// Rollback after a failed append and test if the low watermark has progressed anyway.
hb, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, hb.Close())
}()
@ -1974,7 +1981,7 @@ func TestIsolationLowWatermarkMonotonous(t *testing.T) {
t.Skip("skipping test since tsdb isolation is disabled")
}
hb, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, hb.Close())
}()
@ -2011,7 +2018,7 @@ func TestIsolationAppendIDZeroIsNoop(t *testing.T) {
t.Skip("skipping test since tsdb isolation is disabled")
}
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -2036,7 +2043,7 @@ func TestIsolationWithoutAdd(t *testing.T) {
t.Skip("skipping test since tsdb isolation is disabled")
}
hb, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, hb.Close())
}()
@ -2131,7 +2138,7 @@ func TestOutOfOrderSamplesMetric(t *testing.T) {
}
func testHeadSeriesChunkRace(t *testing.T) {
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
@ -2166,7 +2173,7 @@ func testHeadSeriesChunkRace(t *testing.T) {
}
func TestHeadLabelNamesValuesWithMinMaxRange(t *testing.T) {
head, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -2226,7 +2233,7 @@ func TestHeadLabelNamesValuesWithMinMaxRange(t *testing.T) {
}
func TestHeadLabelValuesWithMatchers(t *testing.T) {
head, _ := newTestHead(t, 1000, false, false)
t.Cleanup(func() { require.NoError(t, head.Close()) })
app := head.Appender(context.Background())
@ -2285,7 +2292,7 @@ func TestHeadLabelValuesWithMatchers(t *testing.T) {
}
func TestHeadLabelNamesWithMatchers(t *testing.T) {
head, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -2353,7 +2360,7 @@ func TestHeadLabelNamesWithMatchers(t *testing.T) {
}
func TestErrReuseAppender(t *testing.T) {
head, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, head.Close())
}()
@ -2389,7 +2396,7 @@ func TestErrReuseAppender(t *testing.T) {
func TestHeadMintAfterTruncation(t *testing.T) {
chunkRange := int64(2000)
head, _ := newTestHead(t, chunkRange, false, false)
app := head.Appender(context.Background())
_, err := app.Append(0, labels.FromStrings("a", "b"), 100, 100)
@ -2423,7 +2430,7 @@ func TestHeadMintAfterTruncation(t *testing.T) {
func TestHeadExemplars(t *testing.T) {
chunkRange := int64(2000)
head, _ := newTestHead(t, chunkRange, false, false)
app := head.Appender(context.Background())
l := labels.FromStrings("traceId", "123")
@ -2445,7 +2452,7 @@ func TestHeadExemplars(t *testing.T) {
func BenchmarkHeadLabelValuesWithMatchers(b *testing.B) {
chunkRange := int64(2000)
head, _ := newTestHead(b, chunkRange, false, false)
b.Cleanup(func() { require.NoError(b, head.Close()) })
app := head.Appender(context.Background())
@ -2483,7 +2490,7 @@ func TestMemSafeIteratorSeekIntoBuffer(t *testing.T) {
require.NoError(t, chunkDiskMapper.Close())
}()
s := newMemSeries(labels.Labels{}, 1, 500, 1, defaultIsolationDisabled)
for i := 0; i < 7; i++ {
ok, _ := s.append(int64(i), float64(i), 0, chunkDiskMapper)
@ -2754,7 +2761,7 @@ func TestWaitForPendingReadersInTimeRange(t *testing.T) {
}
func TestChunkSnapshot(t *testing.T) {
head, _ := newTestHead(t, 120*4, false, false)
defer func() {
head.opts.EnableMemorySnapshotOnShutdown = false
require.NoError(t, head.Close())
@ -2833,7 +2840,7 @@ func TestChunkSnapshot(t *testing.T) {
openHeadAndCheckReplay := func() {
w, err := wal.NewSize(nil, nil, head.wal.Dir(), 32768, false)
require.NoError(t, err)
head, err = NewHead(nil, nil, w, nil, head.opts, nil)
require.NoError(t, err)
require.NoError(t, head.Init(math.MinInt64))
@ -2996,7 +3003,7 @@ func TestChunkSnapshot(t *testing.T) {
}
func TestSnapshotError(t *testing.T) {
head, _ := newTestHead(t, 120*4, false, false)
defer func() {
head.opts.EnableMemorySnapshotOnShutdown = false
require.NoError(t, head.Close())
@ -3043,7 +3050,7 @@ func TestSnapshotError(t *testing.T) {
w, err := wal.NewSize(nil, nil, head.wal.Dir(), 32768, false)
require.NoError(t, err)
// Testing https://github.com/prometheus/prometheus/issues/9437 with the registry.
head, err = NewHead(prometheus.NewRegistry(), nil, w, nil, head.opts, nil)
require.NoError(t, err)
require.NoError(t, head.Init(math.MinInt64))
@ -3102,7 +3109,7 @@ func TestChunkSnapshotReplayBug(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkDirRoot = dir
opts.EnableMemorySnapshotOnShutdown = true
head, err := NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, head.Init(math.MinInt64))
defer func() {
@ -3136,7 +3143,7 @@ func TestChunkSnapshotTakenAfterIncompleteSnapshot(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkDirRoot = dir
opts.EnableMemorySnapshotOnShutdown = true
head, err := NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, head.Init(math.MinInt64))
@ -3159,6 +3166,251 @@ func TestChunkSnapshotTakenAfterIncompleteSnapshot(t *testing.T) {
require.Greater(t, offset, 0)
}
// TestOOOWalReplay checks the replay at a low level.
// TODO(codesome): Needs test for ooo WAL repair.
func TestOOOWalReplay(t *testing.T) {
dir := t.TempDir()
wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = dir
opts.OutOfOrderTimeWindow.Store(30 * time.Minute.Milliseconds())
h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
var expOOOSamples []sample
l := labels.FromStrings("foo", "bar")
appendSample := func(mins int64, isOOO bool) {
app := h.Appender(context.Background())
ts, v := mins*time.Minute.Milliseconds(), float64(mins)
_, err := app.Append(0, l, ts, v)
require.NoError(t, err)
require.NoError(t, app.Commit())
if isOOO {
expOOOSamples = append(expOOOSamples, sample{t: ts, v: v})
}
}
// In-order sample.
appendSample(60, false)
// Out of order samples.
appendSample(40, true)
appendSample(35, true)
appendSample(50, true)
appendSample(55, true)
appendSample(59, true)
appendSample(31, true)
// Check that Head's time ranges are set properly.
require.Equal(t, 60*time.Minute.Milliseconds(), h.MinTime())
require.Equal(t, 60*time.Minute.Milliseconds(), h.MaxTime())
require.Equal(t, 31*time.Minute.Milliseconds(), h.MinOOOTime())
require.Equal(t, 59*time.Minute.Milliseconds(), h.MaxOOOTime())
// Restart head.
require.NoError(t, h.Close())
wlog, err = wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err = wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
h, err = NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0)) // Replay happens here.
// Get the ooo samples from the Head.
ms, ok, err := h.getOrCreate(l.Hash(), l)
require.NoError(t, err)
require.False(t, ok)
require.NotNil(t, ms)
xor, err := ms.oooHeadChunk.chunk.ToXOR()
require.NoError(t, err)
it := xor.Iterator(nil)
actOOOSamples := make([]sample, 0, len(expOOOSamples))
for it.Next() {
ts, v := it.At()
actOOOSamples = append(actOOOSamples, sample{t: ts, v: v})
}
// OOO chunk will be sorted. Hence sort the expected samples.
sort.Slice(expOOOSamples, func(i, j int) bool {
return expOOOSamples[i].t < expOOOSamples[j].t
})
require.Equal(t, expOOOSamples, actOOOSamples)
require.NoError(t, h.Close())
}
// TestOOOMmapReplay checks the replay at a low level.
func TestOOOMmapReplay(t *testing.T) {
dir := t.TempDir()
wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = dir
opts.OutOfOrderCapMax.Store(30)
opts.OutOfOrderTimeWindow.Store(1000 * time.Minute.Milliseconds())
h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
l := labels.FromStrings("foo", "bar")
appendSample := func(mins int64) {
app := h.Appender(context.Background())
ts, v := mins*time.Minute.Milliseconds(), float64(mins)
_, err := app.Append(0, l, ts, v)
require.NoError(t, err)
require.NoError(t, app.Commit())
}
// In-order sample.
appendSample(200)
// Out of order samples. 92 samples to create 3 m-map chunks.
for mins := int64(100); mins <= 191; mins++ {
appendSample(mins)
}
ms, ok, err := h.getOrCreate(l.Hash(), l)
require.NoError(t, err)
require.False(t, ok)
require.NotNil(t, ms)
require.Len(t, ms.oooMmappedChunks, 3)
// Verify that we can access the chunks without error.
for _, m := range ms.oooMmappedChunks {
chk, err := h.chunkDiskMapper.Chunk(m.ref)
require.NoError(t, err)
require.Equal(t, int(m.numSamples), chk.NumSamples())
}
expMmapChunks := make([]*mmappedChunk, 3)
copy(expMmapChunks, ms.oooMmappedChunks)
// Restart head.
require.NoError(t, h.Close())
wlog, err = wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err = wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
h, err = NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0)) // Replay happens here.
// Get the mmap chunks from the Head.
ms, ok, err = h.getOrCreate(l.Hash(), l)
require.NoError(t, err)
require.False(t, ok)
require.NotNil(t, ms)
require.Len(t, ms.oooMmappedChunks, len(expMmapChunks))
// Verify that we can access the chunks without error.
for _, m := range ms.oooMmappedChunks {
chk, err := h.chunkDiskMapper.Chunk(m.ref)
require.NoError(t, err)
require.Equal(t, int(m.numSamples), chk.NumSamples())
}
actMmapChunks := make([]*mmappedChunk, len(expMmapChunks))
copy(actMmapChunks, ms.oooMmappedChunks)
require.Equal(t, expMmapChunks, actMmapChunks)
require.NoError(t, h.Close())
}
func TestHeadInit_DiscardChunksWithUnsupportedEncoding(t *testing.T) {
h, _ := newTestHead(t, 1000, false, false)
defer func() {
require.NoError(t, h.Close())
}()
require.NoError(t, h.Init(0))
ctx := context.Background()
app := h.Appender(ctx)
seriesLabels := labels.FromStrings("a", "1")
var seriesRef storage.SeriesRef
var err error
for i := 0; i < 400; i++ {
seriesRef, err = app.Append(0, seriesLabels, int64(i), float64(i))
require.NoError(t, err)
}
require.NoError(t, app.Commit())
require.Greater(t, prom_testutil.ToFloat64(h.metrics.chunksCreated), 1.0)
uc := newUnsupportedChunk()
// Make this chunk not overlap with the previous and the next
h.chunkDiskMapper.WriteChunk(chunks.HeadSeriesRef(seriesRef), 500, 600, uc, func(err error) { require.NoError(t, err) })
app = h.Appender(ctx)
for i := 700; i < 1200; i++ {
_, err := app.Append(0, seriesLabels, int64(i), float64(i))
require.NoError(t, err)
}
require.NoError(t, app.Commit())
require.Greater(t, prom_testutil.ToFloat64(h.metrics.chunksCreated), 4.0)
series, created, err := h.getOrCreate(seriesLabels.Hash(), seriesLabels)
require.NoError(t, err)
require.False(t, created, "should already exist")
require.NotNil(t, series, "should return the series we created above")
expChunks := make([]*mmappedChunk, len(series.mmappedChunks))
copy(expChunks, series.mmappedChunks)
require.NoError(t, h.Close())
wlog, err := wal.NewSize(nil, nil, filepath.Join(h.opts.ChunkDirRoot, "wal"), 32768, false)
require.NoError(t, err)
h, err = NewHead(nil, nil, wlog, nil, h.opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
series, created, err = h.getOrCreate(seriesLabels.Hash(), seriesLabels)
require.NoError(t, err)
require.False(t, created, "should already exist")
require.NotNil(t, series, "should return the series we created above")
require.Equal(t, expChunks, series.mmappedChunks)
}
const (
UnsupportedMask = 0b10000000
EncUnsupportedXOR = chunkenc.EncXOR | UnsupportedMask
)
// unsupportedChunk holds a XORChunk and overrides the Encoding() method.
type unsupportedChunk struct {
*chunkenc.XORChunk
}
func newUnsupportedChunk() *unsupportedChunk {
return &unsupportedChunk{chunkenc.NewXORChunk()}
}
func (c *unsupportedChunk) Encoding() chunkenc.Encoding {
return EncUnsupportedXOR
}
// Tests https://github.com/prometheus/prometheus/issues/10277.
func TestMmapPanicAfterMmapReplayCorruption(t *testing.T) {
dir := t.TempDir()
@@ -3171,7 +3423,7 @@ func TestMmapPanicAfterMmapReplayCorruption(t *testing.T) {
opts.EnableExemplarStorage = true
opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars)
- h, err := NewHead(nil, nil, wlog, opts, nil)
+ h, err := NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
@@ -3205,7 +3457,7 @@ func TestMmapPanicAfterMmapReplayCorruption(t *testing.T) {
require.NoError(t, err)
require.NoError(t, f.Close())
- h, err = NewHead(nil, nil, wlog, opts, nil)
+ h, err = NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
@@ -3230,7 +3482,7 @@ func TestReplayAfterMmapReplayError(t *testing.T) {
opts.EnableMemorySnapshotOnShutdown = true
opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars)
- h, err = NewHead(nil, nil, wlog, opts, nil)
+ h, err = NewHead(nil, nil, wlog, nil, opts, nil)
require.NoError(t, err)
require.NoError(t, h.Init(0))
}
@@ -3292,3 +3544,131 @@ func TestReplayAfterMmapReplayError(t *testing.T) {
require.NoError(t, h.Close())
}
func TestOOOAppendWithNoSeries(t *testing.T) {
dir := t.TempDir()
wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
opts := DefaultHeadOptions()
opts.ChunkDirRoot = dir
opts.OutOfOrderCapMax.Store(30)
opts.OutOfOrderTimeWindow.Store(120 * time.Minute.Milliseconds())
h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
t.Cleanup(func() {
require.NoError(t, h.Close())
})
require.NoError(t, h.Init(0))
appendSample := func(lbls labels.Labels, ts int64) {
app := h.Appender(context.Background())
_, err := app.Append(0, lbls, ts*time.Minute.Milliseconds(), float64(ts))
require.NoError(t, err)
require.NoError(t, app.Commit())
}
verifyOOOSamples := func(lbls labels.Labels, expSamples int) {
ms, created, err := h.getOrCreate(lbls.Hash(), lbls)
require.NoError(t, err)
require.False(t, created)
require.NotNil(t, ms)
require.Nil(t, ms.headChunk)
require.NotNil(t, ms.oooHeadChunk)
require.Equal(t, expSamples, ms.oooHeadChunk.chunk.NumSamples())
}
verifyInOrderSamples := func(lbls labels.Labels, expSamples int) {
ms, created, err := h.getOrCreate(lbls.Hash(), lbls)
require.NoError(t, err)
require.False(t, created)
require.NotNil(t, ms)
require.Nil(t, ms.oooHeadChunk)
require.NotNil(t, ms.headChunk)
require.Equal(t, expSamples, ms.headChunk.chunk.NumSamples())
}
newLabels := func(idx int) labels.Labels { return labels.FromStrings("foo", fmt.Sprintf("%d", idx)) }
s1 := newLabels(1)
appendSample(s1, 300) // At 300m.
verifyInOrderSamples(s1, 1)
// At 239m, the sample cannot be appended to the in-order chunk since it is
// older than the appendable minValidTime. So it should go in the OOO chunk.
// Series does not exist for s2 yet.
s2 := newLabels(2)
appendSample(s2, 239) // OOO sample.
verifyOOOSamples(s2, 1)
// Similar for 180m.
s3 := newLabels(3)
appendSample(s3, 180) // OOO sample.
verifyOOOSamples(s3, 1)
// Now 179m is too old.
s4 := newLabels(4)
app := h.Appender(context.Background())
_, err = app.Append(0, s4, 179*time.Minute.Milliseconds(), float64(179))
require.Equal(t, storage.ErrTooOldSample, err)
require.NoError(t, app.Rollback())
verifyOOOSamples(s3, 1)
// Samples still go into in-order chunk for samples within
// appendable minValidTime.
s5 := newLabels(5)
appendSample(s5, 240)
verifyInOrderSamples(s5, 1)
}
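
The routing exercised by this test can be summarized in one predicate. A minimal, self-contained sketch (not part of the diff); inOrderMint is inferred from the test's behavior (samples at 240m still append in-order) rather than taken from the appender code:

package main

import "fmt"

// sampleDestination sketches where the head appender sends a sample:
// at or above the in-order lower bound it goes to the in-order chunk,
// within the OOO time window it goes to the OOO chunk, and anything
// older is rejected with storage.ErrTooOldSample.
func sampleDestination(ts, inOrderMint, headMaxt, oooTimeWindow int64) string {
	switch {
	case ts >= inOrderMint:
		return "in-order head chunk"
	case ts >= headMaxt-oooTimeWindow:
		return "ooo head chunk"
	default:
		return "rejected (storage.ErrTooOldSample)"
	}
}

func main() {
	const min = int64(60_000) // one minute in milliseconds
	// Values from the test: head max time 300m, window 120m, in-order floor 240m.
	for _, ts := range []int64{240, 239, 180, 179} {
		fmt.Printf("%dm -> %s\n", ts, sampleDestination(ts*min, 240*min, 300*min, 120*min))
	}
}
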
func TestHeadMinOOOTimeUpdate(t *testing.T) {
dir := t.TempDir()
wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
require.NoError(t, err)
oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
require.NoError(t, err)
opts := DefaultHeadOptions()
opts.ChunkDirRoot = dir
opts.OutOfOrderTimeWindow.Store(10 * time.Minute.Milliseconds())
h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
require.NoError(t, err)
t.Cleanup(func() {
require.NoError(t, h.Close())
})
require.NoError(t, h.Init(0))
appendSample := func(ts int64) {
lbls := labels.FromStrings("foo", "bar")
app := h.Appender(context.Background())
_, err := app.Append(0, lbls, ts*time.Minute.Milliseconds(), float64(ts))
require.NoError(t, err)
require.NoError(t, app.Commit())
}
appendSample(300) // In-order sample.
require.Equal(t, int64(math.MaxInt64), h.MinOOOTime())
appendSample(295) // OOO sample.
require.Equal(t, 295*time.Minute.Milliseconds(), h.MinOOOTime())
// The allowed OOO window now starts at 290, which is below the earliest OOO sample (295), so minOOOTime is set to 290.
require.NoError(t, h.truncateOOO(0, 1))
require.Equal(t, 290*time.Minute.Milliseconds(), h.MinOOOTime())
appendSample(310) // In-order sample.
appendSample(305) // OOO sample.
require.Equal(t, 290*time.Minute.Milliseconds(), h.MinOOOTime())
// The OOO sample at 295 has not been gc'ed yet, and the allowed OOO window now starts at 300.
// The lower of the two, 295, is set as minOOOTime.
require.NoError(t, h.truncateOOO(0, 2))
require.Equal(t, 295*time.Minute.Milliseconds(), h.MinOOOTime())
}
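
The expectation this test encodes reduces to taking a minimum. A small sketch of that invariant, with hypothetical names (windowLow for the start of the allowed OOO window at truncation time, earliestOOO for the earliest OOO sample still in the head):

package main

import "fmt"

// minOOOTimeAfterTruncate restates the checks above: after truncating the
// OOO head, minOOOTime is the lower of the allowed window's start and the
// earliest OOO sample that survived.
func minOOOTimeAfterTruncate(windowLow, earliestOOO int64) int64 {
	if earliestOOO < windowLow {
		return earliestOOO
	}
	return windowLow
}

func main() {
	fmt.Println(minOOOTimeAfterTruncate(290, 295)) // first truncation: 290
	fmt.Println(minOOOTimeAfterTruncate(300, 295)) // second truncation: 295
}
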

tsdb/head_wal.go

@@ -42,7 +42,7 @@ import (
"github.com/prometheus/prometheus/tsdb/wal"
)
- func (h *Head) loadWAL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, mmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (err error) {
+ func (h *Head) loadWAL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, mmappedChunks, oooMmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (err error) {
// Track number of samples that referenced a series we don't know about
// for error reporting.
var unknownRefs atomic.Uint64
@@ -107,7 +107,7 @@ func (h *Head) loadWAL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.H
processors[i].setup()
go func(wp *walSubsetProcessor) {
- unknown, overlapping := wp.processWALSamples(h, mmappedChunks)
+ unknown, overlapping := wp.processWALSamples(h, mmappedChunks, oooMmappedChunks)
unknownRefs.Add(unknown)
mmapOverlappingChunks.Add(overlapping)
wg.Done()
@@ -343,7 +343,7 @@ Outer:
}
// resetSeriesWithMMappedChunks is only used during the WAL replay.
- func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc []*mmappedChunk, walSeriesRef chunks.HeadSeriesRef) (overlapped bool) {
+ func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc, oooMmc []*mmappedChunk, walSeriesRef chunks.HeadSeriesRef) (overlapped bool) {
if mSeries.ref != walSeriesRef {
// Checking if the new m-mapped chunks overlap with the already existing ones.
if len(mSeries.mmappedChunks) > 0 && len(mmc) > 0 {
@@ -368,10 +368,11 @@ func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc []*mmappedCh
}
}
- h.metrics.chunksCreated.Add(float64(len(mmc)))
+ h.metrics.chunksCreated.Add(float64(len(mmc) + len(oooMmc)))
h.metrics.chunksRemoved.Add(float64(len(mSeries.mmappedChunks)))
- h.metrics.chunks.Add(float64(len(mmc) - len(mSeries.mmappedChunks)))
+ h.metrics.chunks.Add(float64(len(mmc) + len(oooMmc) - len(mSeries.mmappedChunks)))
mSeries.mmappedChunks = mmc
+ mSeries.oooMmappedChunks = oooMmc
// Cache the last mmapped chunk time, so we can skip calling append() for samples it will reject.
if len(mmc) == 0 {
mSeries.mmMaxTime = math.MinInt64
@@ -379,6 +380,19 @@ func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc []*mmappedCh
mSeries.mmMaxTime = mmc[len(mmc)-1].maxTime
h.updateMinMaxTime(mmc[0].minTime, mSeries.mmMaxTime)
}
+ if len(oooMmc) != 0 {
+ // Mint and maxt can be in any chunk, they are not sorted.
+ mint, maxt := int64(math.MaxInt64), int64(math.MinInt64)
+ for _, ch := range oooMmc {
+ if ch.minTime < mint {
+ mint = ch.minTime
+ }
+ if ch.maxTime > maxt {
+ maxt = ch.maxTime
+ }
+ }
+ h.updateMinOOOMaxOOOTime(mint, maxt)
+ }
// Any samples replayed till now would already be compacted. Resetting the head chunk.
mSeries.nextAt = 0
@@ -421,7 +435,7 @@ func (wp *walSubsetProcessor) reuseBuf() []record.RefSample {
// processWALSamples adds the samples it receives to the head and passes
// the buffer received to an output channel for reuse.
- func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (unknownRefs, mmapOverlappingChunks uint64) {
+ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks, oooMmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (unknownRefs, mmapOverlappingChunks uint64) {
defer close(wp.output)
mint, maxt := int64(math.MaxInt64), int64(math.MinInt64)
@@ -429,7 +443,8 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks map[chunk
for in := range wp.input {
if in.existingSeries != nil {
mmc := mmappedChunks[in.walSeriesRef]
+ oooMmc := oooMmappedChunks[in.walSeriesRef]
- if h.resetSeriesWithMMappedChunks(in.existingSeries, mmc, in.walSeriesRef) {
+ if h.resetSeriesWithMMappedChunks(in.existingSeries, mmc, oooMmc, in.walSeriesRef) {
mmapOverlappingChunks++
}
continue
@@ -465,6 +480,292 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks map[chunk
return unknownRefs, mmapOverlappingChunks
}
func (h *Head) loadWBL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, lastMmapRef chunks.ChunkDiskMapperRef) (err error) {
// Track the number of samples and m-map markers that referenced a series
// we don't know about, for error reporting.
var unknownRefs, mmapMarkerUnknownRefs atomic.Uint64
lastSeq, lastOff := lastMmapRef.Unpack()
// Start workers that each process samples for a partition of the series ID space.
var (
wg sync.WaitGroup
n = runtime.GOMAXPROCS(0)
processors = make([]wblSubsetProcessor, n)
dec record.Decoder
shards = make([][]record.RefSample, n)
decodedCh = make(chan interface{}, 10)
decodeErr error
samplesPool = sync.Pool{
New: func() interface{} {
return []record.RefSample{}
},
}
markersPool = sync.Pool{
New: func() interface{} {
return []record.RefMmapMarker{}
},
}
)
defer func() {
// For CorruptionErr ensure to terminate all workers before exiting.
// We also wrap it to identify OOO WBL corruption.
_, ok := err.(*wal.CorruptionErr)
if ok {
err = &errLoadWbl{err: err}
for i := 0; i < n; i++ {
processors[i].closeAndDrain()
}
wg.Wait()
}
}()
wg.Add(n)
for i := 0; i < n; i++ {
processors[i].setup()
go func(wp *wblSubsetProcessor) {
unknown := wp.processWBLSamples(h)
unknownRefs.Add(unknown)
wg.Done()
}(&processors[i])
}
go func() {
defer close(decodedCh)
for r.Next() {
rec := r.Record()
switch dec.Type(rec) {
case record.Samples:
samples := samplesPool.Get().([]record.RefSample)[:0]
samples, err = dec.Samples(rec, samples)
if err != nil {
decodeErr = &wal.CorruptionErr{
Err: errors.Wrap(err, "decode samples"),
Segment: r.Segment(),
Offset: r.Offset(),
}
return
}
decodedCh <- samples
case record.MmapMarkers:
markers := markersPool.Get().([]record.RefMmapMarker)[:0]
markers, err = dec.MmapMarkers(rec, markers)
if err != nil {
decodeErr = &wal.CorruptionErr{
Err: errors.Wrap(err, "decode mmap markers"),
Segment: r.Segment(),
Offset: r.Offset(),
}
return
}
decodedCh <- markers
default:
// Noop.
}
}
}()
// The records are always replayed from the oldest to the newest.
for d := range decodedCh {
switch v := d.(type) {
case []record.RefSample:
samples := v
// We split up the samples into parts of 5000 samples or less.
// With O(300 * #cores) in-flight sample batches, large scrapes could otherwise
// cause thousands of very large in flight buffers occupying large amounts
// of unused memory.
for len(samples) > 0 {
m := 5000
if len(samples) < m {
m = len(samples)
}
for i := 0; i < n; i++ {
shards[i] = processors[i].reuseBuf()
}
for _, sam := range samples[:m] {
if r, ok := multiRef[sam.Ref]; ok {
sam.Ref = r
}
mod := uint64(sam.Ref) % uint64(n)
shards[mod] = append(shards[mod], sam)
}
for i := 0; i < n; i++ {
processors[i].input <- shards[i]
}
samples = samples[m:]
}
//nolint:staticcheck // Ignore SA6002 relax staticcheck verification.
samplesPool.Put(d)
case []record.RefMmapMarker:
markers := v
for _, rm := range markers {
seq, off := rm.MmapRef.Unpack()
if seq > lastSeq || (seq == lastSeq && off > lastOff) {
// This m-map chunk from markers was not present during
// the load of mmapped chunks that happened in the head
// initialization.
continue
}
if r, ok := multiRef[rm.Ref]; ok {
rm.Ref = r
}
ms := h.series.getByID(rm.Ref)
if ms == nil {
mmapMarkerUnknownRefs.Inc()
continue
}
idx := uint64(ms.ref) % uint64(n)
// It is possible that some old sample is still being processed in
// processWBLSamples, which could cause a race below. So we wait for the
// goroutine to empty its input buffer and finish processing all old samples.
processors[idx].waitUntilIdle()
// Lock the subset so we can modify the series object
processors[idx].mx.Lock()
// All samples till now have been m-mapped. Hence clear out the headChunk.
// In case some samples slipped through and went into m-map chunks because of changed
// chunk size parameters, we are not taking care of that here.
// TODO(codesome): see if there is a way to avoid duplicate m-map chunks if
// the size of ooo chunk was reduced between restart.
ms.oooHeadChunk = nil
processors[idx].mx.Unlock()
}
default:
panic(fmt.Errorf("unexpected decodedCh type: %T", d))
}
}
if decodeErr != nil {
return decodeErr
}
// Signal termination to each worker and wait for it to close its output channel.
for i := 0; i < n; i++ {
processors[i].closeAndDrain()
}
wg.Wait()
if r.Err() != nil {
return errors.Wrap(r.Err(), "read records")
}
if unknownRefs.Load() > 0 || mmapMarkerUnknownRefs.Load() > 0 {
level.Warn(h.logger).Log("msg", "Unknown series references for ooo WAL replay", "samples", unknownRefs.Load(), "mmap_markers", mmapMarkerUnknownRefs.Load())
}
return nil
}
type errLoadWbl struct {
err error
}
func (e errLoadWbl) Error() string {
return e.err.Error()
}
// To support errors.Cause().
func (e errLoadWbl) Cause() error {
return e.err
}
// To support errors.Unwrap().
func (e errLoadWbl) Unwrap() error {
return e.err
}
// isErrLoadOOOWal returns true if err is of type *errLoadWbl.
func isErrLoadOOOWal(err error) bool {
_, ok := err.(*errLoadWbl)
return ok
}
type wblSubsetProcessor struct {
mx sync.Mutex // Take this lock while modifying series in the subset.
input chan []record.RefSample
output chan []record.RefSample
}
func (wp *wblSubsetProcessor) setup() {
wp.output = make(chan []record.RefSample, 300)
wp.input = make(chan []record.RefSample, 300)
}
func (wp *wblSubsetProcessor) closeAndDrain() {
close(wp.input)
for range wp.output {
}
}
// If there is a buffer in the output chan, return it for reuse, otherwise return nil.
func (wp *wblSubsetProcessor) reuseBuf() []record.RefSample {
select {
case buf := <-wp.output:
return buf[:0]
default:
}
return nil
}
// processWBLSamples adds the samples it receives to the head and passes
// the buffer received to an output channel for reuse.
func (wp *wblSubsetProcessor) processWBLSamples(h *Head) (unknownRefs uint64) {
defer close(wp.output)
// We don't check for minValidTime for ooo samples.
mint, maxt := int64(math.MaxInt64), int64(math.MinInt64)
for samples := range wp.input {
wp.mx.Lock()
for _, s := range samples {
ms := h.series.getByID(s.Ref)
if ms == nil {
unknownRefs++
continue
}
ok, chunkCreated, _ := ms.insert(s.T, s.V, h.chunkDiskMapper)
if chunkCreated {
h.metrics.chunksCreated.Inc()
h.metrics.chunks.Inc()
}
if ok {
if s.T < mint {
mint = s.T
}
if s.T > maxt {
maxt = s.T
}
}
}
wp.mx.Unlock()
wp.output <- samples
}
h.updateMinOOOMaxOOOTime(mint, maxt)
return unknownRefs
}
func (wp *wblSubsetProcessor) waitUntilIdle() {
select {
case <-wp.output: // Allow output side to drain to avoid deadlock.
default:
}
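// Sending an empty batch acts as a sentinel: once the input channel is
// observed empty below, the worker has picked up everything queued before it.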
wp.input <- []record.RefSample{}
for len(wp.input) != 0 {
time.Sleep(10 * time.Microsecond)
select {
case <-wp.output: // Allow output side to drain to avoid deadlock.
default:
}
}
}
const (
chunkSnapshotRecordTypeSeries uint8 = 1
chunkSnapshotRecordTypeTombstones uint8 = 2

tsdb/ooo_head.go (new file, 159 lines)
@@ -0,0 +1,159 @@
// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"fmt"
"sort"
"github.com/prometheus/prometheus/tsdb/chunkenc"
"github.com/prometheus/prometheus/tsdb/tombstones"
)
// OOOChunk maintains samples in time-ascending order.
// Inserts for timestamps already seen are dropped.
// Samples are stored uncompressed to allow easy sorting.
// Perhaps we can be more efficient later.
type OOOChunk struct {
samples []sample
}
func NewOOOChunk() *OOOChunk {
return &OOOChunk{samples: make([]sample, 0, 4)}
}
// Insert inserts the sample such that order is maintained.
// Returns false if insert was not possible due to the same timestamp already existing.
func (o *OOOChunk) Insert(t int64, v float64) bool {
// Find the index of the first sample with a timestamp >= t.
i := sort.Search(len(o.samples), func(i int) bool { return o.samples[i].t >= t })
if i >= len(o.samples) {
// None found, so append at the end.
o.samples = append(o.samples, sample{t, v})
return true
}
if o.samples[i].t == t {
return false
}
// Expand length by one to make room; use a zero sample, we will overwrite it anyway.
o.samples = append(o.samples, sample{})
copy(o.samples[i+1:], o.samples[i:])
o.samples[i] = sample{t, v}
return true
}
func (o *OOOChunk) NumSamples() int {
return len(o.samples)
}
func (o *OOOChunk) ToXOR() (*chunkenc.XORChunk, error) {
x := chunkenc.NewXORChunk()
app, err := x.Appender()
if err != nil {
return nil, err
}
for _, s := range o.samples {
app.Append(s.t, s.v)
}
return x, nil
}
func (o *OOOChunk) ToXORBetweenTimestamps(mint, maxt int64) (*chunkenc.XORChunk, error) {
x := chunkenc.NewXORChunk()
app, err := x.Appender()
if err != nil {
return nil, err
}
for _, s := range o.samples {
if s.t < mint {
continue
}
if s.t > maxt {
break
}
app.Append(s.t, s.v)
}
return x, nil
}
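
A short usage sketch of the chunk above (illustrative, not part of the file): Insert keeps the samples sorted and reports duplicate timestamps, and ToXOR emits them in ascending time order.

package main

import (
	"fmt"

	"github.com/prometheus/prometheus/tsdb"
)

func main() {
	c := tsdb.NewOOOChunk()
	for _, ts := range []int64{40, 10, 30, 10, 20} {
		// The second insert at t=10 returns false and is dropped.
		fmt.Println(ts, "inserted:", c.Insert(ts, float64(ts)))
	}
	fmt.Println("samples:", c.NumSamples()) // 4

	x, err := c.ToXOR() // compressed copy, iterated in ascending time order
	if err != nil {
		panic(err)
	}
	it := x.Iterator(nil)
	for it.Next() {
		t, v := it.At()
		fmt.Println(t, v)
	}
}
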
var _ BlockReader = &OOORangeHead{}
// OOORangeHead allows querying the Head's out-of-order samples via the
// BlockReader interface implementation.
type OOORangeHead struct {
head *Head
// mint and maxt are tracked because when a query is handled we only want
// the time range of the query, and having preexisting pointers to the first
// and last timestamps helps with that.
mint, maxt int64
}
func NewOOORangeHead(head *Head, mint, maxt int64) *OOORangeHead {
return &OOORangeHead{
head: head,
mint: mint,
maxt: maxt,
}
}
func (oh *OOORangeHead) Index() (IndexReader, error) {
return NewOOOHeadIndexReader(oh.head, oh.mint, oh.maxt), nil
}
func (oh *OOORangeHead) Chunks() (ChunkReader, error) {
return NewOOOHeadChunkReader(oh.head, oh.mint, oh.maxt), nil
}
func (oh *OOORangeHead) Tombstones() (tombstones.Reader, error) {
// As stated in the design doc https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing
// Tombstones are not supported for out of order metrics.
return tombstones.NewMemTombstones(), nil
}
func (oh *OOORangeHead) Meta() BlockMeta {
var id [16]byte
copy(id[:], "____ooo_head____")
return BlockMeta{
MinTime: oh.mint,
MaxTime: oh.maxt,
ULID: id,
Stats: BlockStats{
NumSeries: oh.head.NumSeries(),
},
}
}
// Size returns the size taken by the Head block.
func (oh *OOORangeHead) Size() int64 {
return oh.head.Size()
}
// String returns a human-readable representation of the out-of-order range
// head. It's important to keep this method in order to avoid dumping the
// whole struct when the head is stringified in errors or logs.
func (oh *OOORangeHead) String() string {
return fmt.Sprintf("ooo range head (mint: %d, maxt: %d)", oh.MinTime(), oh.MaxTime())
}
func (oh *OOORangeHead) MinTime() int64 {
return oh.mint
}
func (oh *OOORangeHead) MaxTime() int64 {
return oh.maxt
}
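
For orientation, an illustrative sketch (not code from the PR) of how OOORangeHead plugs into the existing querier machinery; the benchmark changes later in this diff do the equivalent with NewBlockQuerier and NewOOORangeHead:

package tsdb

import "github.com/prometheus/prometheus/storage"

// oooQuerier is a hypothetical helper: wrap the head's OOO data for a time
// range and hand it to the existing block querier as a BlockReader.
func oooQuerier(head *Head, mint, maxt int64) (storage.Querier, error) {
	return NewBlockQuerier(NewOOORangeHead(head, mint, maxt), mint, maxt)
}
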

tsdb/ooo_head_read.go (new file, 433 lines)
@@ -0,0 +1,433 @@
// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"errors"
"math"
"sort"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/tsdb/chunkenc"
"github.com/prometheus/prometheus/tsdb/chunks"
"github.com/prometheus/prometheus/tsdb/index"
"github.com/prometheus/prometheus/tsdb/tombstones"
)
var _ IndexReader = &OOOHeadIndexReader{}
// OOOHeadIndexReader implements IndexReader so ooo samples in the head can be
// accessed.
// It also has a reference to headIndexReader so we can leverage its
// IndexReader implementation for all the methods that remain the same. We
// decided to do this to avoid code duplication.
// The only methods that change are the ones about getting Series and Postings.
type OOOHeadIndexReader struct {
*headIndexReader // A reference to the headIndexReader so we can reuse as many interface implementation as possible.
}
func NewOOOHeadIndexReader(head *Head, mint, maxt int64) *OOOHeadIndexReader {
hr := &headIndexReader{
head: head,
mint: mint,
maxt: maxt,
}
return &OOOHeadIndexReader{hr}
}
func (oh *OOOHeadIndexReader) Series(ref storage.SeriesRef, lbls *labels.Labels, chks *[]chunks.Meta) error {
return oh.series(ref, lbls, chks, 0)
}
// The passed lastMmapRef tells up to which m-map chunk we can consider.
// If it is 0, it means all chunks need to be considered.
// If it is non-0, then the oooHeadChunk must not be considered.
func (oh *OOOHeadIndexReader) series(ref storage.SeriesRef, lbls *labels.Labels, chks *[]chunks.Meta, lastMmapRef chunks.ChunkDiskMapperRef) error {
s := oh.head.series.getByID(chunks.HeadSeriesRef(ref))
if s == nil {
oh.head.metrics.seriesNotFound.Inc()
return storage.ErrNotFound
}
*lbls = append((*lbls)[:0], s.lset...)
if chks == nil {
return nil
}
s.Lock()
defer s.Unlock()
*chks = (*chks)[:0]
tmpChks := make([]chunks.Meta, 0, len(s.oooMmappedChunks))
// We define these markers to track the last chunk reference while we
// fill the chunk meta.
// These markers are useful to give consistent responses to repeated queries
// even if new chunks that might be overlapping or not are added afterwards.
// Also, lastMinT and lastMaxT are initialized to the max int as a sentinel
// value to know they are unset.
var lastChunkRef chunks.ChunkRef
lastMinT, lastMaxT := int64(math.MaxInt64), int64(math.MaxInt64)
addChunk := func(minT, maxT int64, ref chunks.ChunkRef) {
// The first time we get called is for the last included chunk;
// set the markers accordingly.
if lastMinT == int64(math.MaxInt64) {
lastChunkRef = ref
lastMinT = minT
lastMaxT = maxT
}
tmpChks = append(tmpChks, chunks.Meta{
MinTime: minT,
MaxTime: maxT,
Ref: ref,
OOOLastRef: lastChunkRef,
OOOLastMinTime: lastMinT,
OOOLastMaxTime: lastMaxT,
})
}
// Collect all chunks that overlap the query range, in order from most recent to oldest,
// so we can set the correct markers.
if s.oooHeadChunk != nil {
c := s.oooHeadChunk
if c.OverlapsClosedInterval(oh.mint, oh.maxt) && lastMmapRef == 0 {
ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.oooMmappedChunks))))
addChunk(c.minTime, c.maxTime, ref)
}
}
for i := len(s.oooMmappedChunks) - 1; i >= 0; i-- {
c := s.oooMmappedChunks[i]
if c.OverlapsClosedInterval(oh.mint, oh.maxt) && (lastMmapRef == 0 || lastMmapRef.GreaterThanOrEqualTo(c.ref)) {
ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(i)))
addChunk(c.minTime, c.maxTime, ref)
}
}
// There is nothing to do if we did not collect any chunk
if len(tmpChks) == 0 {
return nil
}
// Next we want to sort all the collected chunks by min time so we can find
// those that overlap.
sort.Sort(metaByMinTimeAndMinRef(tmpChks))
// Next we want to iterate the sorted collected chunks and return a single
// chunk Meta for each group of overlapping chunks.
// Example chunks of a series: 5:(100, 200) 6:(500, 600) 7:(150, 250) 8:(550, 650)
// In the example, 5 overlaps with 7 and 6 overlaps with 8, so we only want
// to return chunk Metas for chunks 5 and 6.
*chks = append(*chks, tmpChks[0])
maxTime := tmpChks[0].MaxTime // tracks the maxTime of the previous "to be merged chunk"
for _, c := range tmpChks[1:] {
if c.MinTime > maxTime {
*chks = append(*chks, c)
maxTime = c.MaxTime
} else if c.MaxTime > maxTime {
maxTime = c.MaxTime
(*chks)[len(*chks)-1].MaxTime = c.MaxTime
}
}
return nil
}
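
The loop above is a standard sorted-interval union. A standalone sketch (illustrative types, not the tsdb ones), run on the example from the comment:

package main

import (
	"fmt"
	"sort"
)

type interval struct{ minT, maxT int64 }

// mergeOverlapping mirrors the loop above: after sorting by minT, each
// interval either starts a new output entry or extends the previous one.
func mergeOverlapping(in []interval) []interval {
	if len(in) == 0 {
		return nil
	}
	sort.Slice(in, func(i, j int) bool { return in[i].minT < in[j].minT })
	out := []interval{in[0]}
	for _, c := range in[1:] {
		last := &out[len(out)-1]
		if c.minT > last.maxT {
			out = append(out, c)
		} else if c.maxT > last.maxT {
			last.maxT = c.maxT
		}
	}
	return out
}

func main() {
	// Chunks 5..8 from the comment: 5:(100,200) 6:(500,600) 7:(150,250) 8:(550,650).
	chks := []interval{{100, 200}, {500, 600}, {150, 250}, {550, 650}}
	fmt.Println(mergeOverlapping(chks)) // [{100 250} {500 650}]
}
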
// LabelValues needs to be overridden from the headIndexReader implementation
// due to the check at the beginning where we make sure that the query
// interval overlaps with the head's MinOOOTime and MaxOOOTime.
func (oh *OOOHeadIndexReader) LabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
if oh.maxt < oh.head.MinOOOTime() || oh.mint > oh.head.MaxOOOTime() {
return []string{}, nil
}
if len(matchers) == 0 {
return oh.head.postings.LabelValues(name), nil
}
return labelValuesWithMatchers(oh, name, matchers...)
}
type chunkMetaAndChunkDiskMapperRef struct {
meta chunks.Meta
ref chunks.ChunkDiskMapperRef
origMinT int64
origMaxT int64
}
type byMinTimeAndMinRef []chunkMetaAndChunkDiskMapperRef
func (b byMinTimeAndMinRef) Len() int { return len(b) }
func (b byMinTimeAndMinRef) Less(i, j int) bool {
if b[i].meta.MinTime == b[j].meta.MinTime {
return b[i].meta.Ref < b[j].meta.Ref
}
return b[i].meta.MinTime < b[j].meta.MinTime
}
func (b byMinTimeAndMinRef) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
type metaByMinTimeAndMinRef []chunks.Meta
func (b metaByMinTimeAndMinRef) Len() int { return len(b) }
func (b metaByMinTimeAndMinRef) Less(i, j int) bool {
if b[i].MinTime == b[j].MinTime {
return b[i].Ref < b[j].Ref
}
return b[i].MinTime < b[j].MinTime
}
func (b metaByMinTimeAndMinRef) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
func (oh *OOOHeadIndexReader) Postings(name string, values ...string) (index.Postings, error) {
switch len(values) {
case 0:
return index.EmptyPostings(), nil
case 1:
return oh.head.postings.Get(name, values[0]), nil // TODO(ganesh) Also call GetOOOPostings
default:
// TODO(ganesh) We want to only return postings for out of order series.
res := make([]index.Postings, 0, len(values))
for _, value := range values {
res = append(res, oh.head.postings.Get(name, value)) // TODO(ganesh) Also call GetOOOPostings
}
return index.Merge(res...), nil
}
}
type OOOHeadChunkReader struct {
head *Head
mint, maxt int64
}
func NewOOOHeadChunkReader(head *Head, mint, maxt int64) *OOOHeadChunkReader {
return &OOOHeadChunkReader{
head: head,
mint: mint,
maxt: maxt,
}
}
func (cr OOOHeadChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
sid, _ := chunks.HeadChunkRef(meta.Ref).Unpack()
s := cr.head.series.getByID(sid)
// This means that the series has been garbage collected.
if s == nil {
return nil, storage.ErrNotFound
}
s.Lock()
c, err := s.oooMergedChunk(meta, cr.head.chunkDiskMapper, cr.mint, cr.maxt)
s.Unlock()
if err != nil {
return nil, err
}
// This means that the query range did not overlap with the requested chunk.
if len(c.chunks) == 0 {
return nil, storage.ErrNotFound
}
return c, nil
}
func (cr OOOHeadChunkReader) Close() error {
return nil
}
type OOOCompactionHead struct {
oooIR *OOOHeadIndexReader
lastMmapRef chunks.ChunkDiskMapperRef
lastWBLFile int
postings []storage.SeriesRef
chunkRange int64
mint, maxt int64 // Among all the compactable chunks.
}
// NewOOOCompactionHead does the following:
// 1. M-maps all the in-memory ooo chunks.
// 2. Computes the expected block ranges while iterating through all ooo series and stores them.
// 3. Stores the list of postings having ooo series.
// 4. Cuts a new WBL file for the OOO WBL.
// All of the above together have a bit of CPU and memory overhead, and can impact
// the sample append latency. So call NewOOOCompactionHead only right before compaction.
func NewOOOCompactionHead(head *Head) (*OOOCompactionHead, error) {
newWBLFile, err := head.wbl.NextSegmentSync()
if err != nil {
return nil, err
}
ch := &OOOCompactionHead{
chunkRange: head.chunkRange.Load(),
mint: math.MaxInt64,
maxt: math.MinInt64,
lastWBLFile: newWBLFile,
}
ch.oooIR = NewOOOHeadIndexReader(head, math.MinInt64, math.MaxInt64)
n, v := index.AllPostingsKey()
// TODO: verify this gets only ooo samples.
p, err := ch.oooIR.Postings(n, v)
if err != nil {
return nil, err
}
p = ch.oooIR.SortedPostings(p)
var lastSeq, lastOff int
for p.Next() {
seriesRef := p.At()
ms := head.series.getByID(chunks.HeadSeriesRef(seriesRef))
if ms == nil {
continue
}
// M-map the in-memory chunk and keep track of the last one.
// Also build the block ranges -> series map.
// TODO: consider having a lock specifically for ooo data.
ms.Lock()
mmapRef := ms.mmapCurrentOOOHeadChunk(head.chunkDiskMapper)
if mmapRef == 0 && len(ms.oooMmappedChunks) > 0 {
// Nothing was m-mapped. So take the mmapRef from the existing slice if it exists.
mmapRef = ms.oooMmappedChunks[len(ms.oooMmappedChunks)-1].ref
}
seq, off := mmapRef.Unpack()
if seq > lastSeq || (seq == lastSeq && off > lastOff) {
ch.lastMmapRef, lastSeq, lastOff = mmapRef, seq, off
}
if len(ms.oooMmappedChunks) > 0 {
ch.postings = append(ch.postings, seriesRef)
for _, c := range ms.oooMmappedChunks {
if c.minTime < ch.mint {
ch.mint = c.minTime
}
if c.maxTime > ch.maxt {
ch.maxt = c.maxTime
}
}
}
ms.Unlock()
}
return ch, nil
}
func (ch *OOOCompactionHead) Index() (IndexReader, error) {
return NewOOOCompactionHeadIndexReader(ch), nil
}
func (ch *OOOCompactionHead) Chunks() (ChunkReader, error) {
return NewOOOHeadChunkReader(ch.oooIR.head, ch.oooIR.mint, ch.oooIR.maxt), nil
}
func (ch *OOOCompactionHead) Tombstones() (tombstones.Reader, error) {
return tombstones.NewMemTombstones(), nil
}
func (ch *OOOCompactionHead) Meta() BlockMeta {
var id [16]byte
copy(id[:], "copy(id[:], \"ooo_compact_head\")")
return BlockMeta{
MinTime: ch.mint,
MaxTime: ch.maxt,
ULID: id,
Stats: BlockStats{
NumSeries: uint64(len(ch.postings)),
},
}
}
// CloneForTimeRange clones the OOOCompactionHead such that the IndexReader and ChunkReader
// obtained from it only look at the m-map chunks within the given time ranges while not looking
// beyond the ch.lastMmapRef.
// Only the methods of the BlockReader interface are valid for the cloned OOOCompactionHead.
func (ch *OOOCompactionHead) CloneForTimeRange(mint, maxt int64) *OOOCompactionHead {
return &OOOCompactionHead{
oooIR: NewOOOHeadIndexReader(ch.oooIR.head, mint, maxt),
lastMmapRef: ch.lastMmapRef,
postings: ch.postings,
chunkRange: ch.chunkRange,
mint: ch.mint,
maxt: ch.maxt,
}
}
func (ch *OOOCompactionHead) Size() int64 { return 0 }
func (ch *OOOCompactionHead) MinTime() int64 { return ch.mint }
func (ch *OOOCompactionHead) MaxTime() int64 { return ch.maxt }
func (ch *OOOCompactionHead) ChunkRange() int64 { return ch.chunkRange }
func (ch *OOOCompactionHead) LastMmapRef() chunks.ChunkDiskMapperRef { return ch.lastMmapRef }
func (ch *OOOCompactionHead) LastWBLFile() int { return ch.lastWBLFile }
type OOOCompactionHeadIndexReader struct {
ch *OOOCompactionHead
}
func NewOOOCompactionHeadIndexReader(ch *OOOCompactionHead) IndexReader {
return &OOOCompactionHeadIndexReader{ch: ch}
}
func (ir *OOOCompactionHeadIndexReader) Symbols() index.StringIter {
return ir.ch.oooIR.Symbols()
}
func (ir *OOOCompactionHeadIndexReader) Postings(name string, values ...string) (index.Postings, error) {
n, v := index.AllPostingsKey()
if name != n || len(values) != 1 || values[0] != v {
return nil, errors.New("only AllPostingsKey is supported")
}
return index.NewListPostings(ir.ch.postings), nil
}
func (ir *OOOCompactionHeadIndexReader) SortedPostings(p index.Postings) index.Postings {
// This will already be sorted from the Postings() call above.
return p
}
func (ir *OOOCompactionHeadIndexReader) Series(ref storage.SeriesRef, lset *labels.Labels, chks *[]chunks.Meta) error {
return ir.ch.oooIR.series(ref, lset, chks, ir.ch.lastMmapRef)
}
func (ir *OOOCompactionHeadIndexReader) SortedLabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
return nil, errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) LabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
return nil, errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) PostingsForMatchers(concurrent bool, ms ...*labels.Matcher) (index.Postings, error) {
return nil, errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) LabelNames(matchers ...*labels.Matcher) ([]string, error) {
return nil, errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) LabelValueFor(id storage.SeriesRef, label string) (string, error) {
return "", errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) LabelNamesFor(ids ...storage.SeriesRef) ([]string, error) {
return nil, errors.New("not implemented")
}
func (ir *OOOCompactionHeadIndexReader) Close() error {
return ir.ch.oooIR.Close()
}

tsdb/ooo_head_read_test.go (new file, 1207 lines; diff suppressed because it is too large)

tsdb/ooo_head_test.go (new file, 93 lines)
@@ -0,0 +1,93 @@
// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"testing"
"github.com/stretchr/testify/require"
)
const testMaxSize int = 32
// Formulas chosen to make testing easy:
func valEven(pos int) int { return pos*2 + 2 } // s[0]=2, s[1]=4, s[2]=6, ..., s[31]=64 - Predictable pre-existing values
func valOdd(pos int) int { return pos*2 + 1 } // s[0]=1, s[1]=3, s[2]=5, ..., s[31]=63 - New values will interject at chosen position because they sort before the pre-existing vals.
func samplify(v int) sample { return sample{int64(v), float64(v)} }
func makeEvenSampleSlice(n int) []sample {
s := make([]sample, n)
for i := 0; i < n; i++ {
s[i] = samplify(valEven(i))
}
return s
}
// TestOOOInsert tests the following cases:
// - Number of pre-existing samples anywhere from 0 to testMaxSize-1.
// - Insert new sample before first pre-existing samples, after the last, and anywhere in between.
// - With a chunk initial capacity of testMaxSize/8 and testMaxSize, which lets us test non-full and full chunks, and chunks that need to expand themselves.
// Note: In all samples used, t always equals v in numeric value. When we talk about 'value' we just refer to a value that will be used for both sample.t and sample.v.
func TestOOOInsert(t *testing.T) {
for numPreExisting := 0; numPreExisting <= testMaxSize; numPreExisting++ {
// For example, if we have numPreExisting 2, then:
// chunk.samples indexes filled 0 1
// chunk.samples with these values 2 4 // valEven
// we want to test inserting at index 0 1 2 // insertPos=0..numPreExisting
// we can do this by using values 1, 3 5 // valOdd(insertPos)
for insertPos := 0; insertPos <= numPreExisting; insertPos++ {
chunk := NewOOOChunk()
chunk.samples = makeEvenSampleSlice(numPreExisting)
newSample := samplify(valOdd(insertPos))
chunk.Insert(newSample.t, newSample.v)
var expSamples []sample
// Our expected new samples slice, will be first the original samples.
for i := 0; i < insertPos; i++ {
expSamples = append(expSamples, samplify(valEven(i)))
}
// Then the new sample.
expSamples = append(expSamples, newSample)
// Followed by any original samples that were pushed back by the new one.
for i := insertPos; i < numPreExisting; i++ {
expSamples = append(expSamples, samplify(valEven(i)))
}
require.Equal(t, expSamples, chunk.samples, "numPreExisting %d, insertPos %d", numPreExisting, insertPos)
}
}
}
// TestOOOInsertDuplicate tests the correct behavior when inserting a sample that is a duplicate of any
// pre-existing samples, with between 1 and testMaxSize pre-existing samples and
// with a chunk initial capacity of testMaxSize/8 and testMaxSize, which lets us test non-full and full chunks, and chunks that need to expand themselves.
func TestOOOInsertDuplicate(t *testing.T) {
for num := 1; num <= testMaxSize; num++ {
for dupPos := 0; dupPos < num; dupPos++ {
chunk := NewOOOChunk()
chunk.samples = makeEvenSampleSlice(num)
dupSample := chunk.samples[dupPos]
dupSample.v = 0.123
ok := chunk.Insert(dupSample.t, dupSample.v)
expSamples := makeEvenSampleSlice(num) // We expect no change.
require.False(t, ok)
require.Equal(t, expSamples, chunk.samples, "num %d, dupPos %d", num, dupPos)
}
}
}

tsdb/querier.go

@@ -569,7 +569,7 @@ func (p *populateWithDelGenericSeriesIterator) next() bool {
p.i++
p.currChkMeta = p.chks[p.i]
- p.currChkMeta.Chunk, p.err = p.chunks.Chunk(p.currChkMeta.Ref)
+ p.currChkMeta.Chunk, p.err = p.chunks.Chunk(p.currChkMeta)
if p.err != nil {
p.err = errors.Wrapf(p.err, "cannot populate chunk %d", p.currChkMeta.Ref)
return false
@@ -898,7 +898,7 @@ func newNopChunkReader() ChunkReader {
}
}
- func (cr nopChunkReader) Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error) {
+ func (cr nopChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
return cr.emptyChunk, nil
}

tsdb/querier_bench_test.go

@@ -34,7 +34,7 @@ func BenchmarkQuerier(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
- h, err := NewHead(nil, nil, nil, opts, nil)
+ h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
defer func() {
require.NoError(b, h.Close())
@@ -180,7 +180,7 @@ func BenchmarkQuerierSelect(b *testing.B) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
- h, err := NewHead(nil, nil, nil, opts, nil)
+ h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
defer h.Close()
app := h.Appender(context.Background())

tsdb/querier_test.go

@@ -458,7 +458,7 @@ func TestBlockQuerier_AgainstHeadWithOpenChunks(t *testing.T) {
t.Run("", func(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 2 * time.Hour.Milliseconds()
- h, err := NewHead(nil, nil, nil, opts, nil)
+ h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(t, err)
defer h.Close()
@@ -627,10 +627,10 @@ func createFakeReaderAndNotPopulatedChunks(s ...[]tsdbutil.Sample) (*fakeChunksR
return f, chks
}
- func (r *fakeChunksReader) Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error) {
+ func (r *fakeChunksReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
- chk, ok := r.chks[ref]
+ chk, ok := r.chks[meta.Ref]
if !ok {
- return nil, errors.Errorf("chunk not found at ref %v", ref)
+ return nil, errors.Errorf("chunk not found at ref %v", meta.Ref)
}
return chk, nil
}
@@ -1016,8 +1016,8 @@ func BenchmarkMergedSeriesSet(b *testing.B) {
type mockChunkReader map[chunks.ChunkRef]chunkenc.Chunk
- func (cr mockChunkReader) Chunk(id chunks.ChunkRef) (chunkenc.Chunk, error) {
+ func (cr mockChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
- chk, ok := cr[id]
+ chk, ok := cr[meta.Ref]
if ok {
return chk, nil
}
@@ -1643,7 +1643,7 @@ func TestPostingsForMatchers(t *testing.T) {
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
- h, err := NewHead(nil, nil, nil, opts, nil)
+ h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(t, err)
defer func() {
require.NoError(t, h.Close())
@@ -1944,13 +1944,17 @@ func BenchmarkQueries(b *testing.B) {
},
}
- queryTypes := make(map[string]storage.Querier)
+ type qt struct {
+ typ string
+ querier storage.Querier
+ }
+ var queryTypes []qt // We use a slice instead of map to keep the order of test cases consistent.
defer func() {
for _, q := range queryTypes {
// Can't run a check for error here as some of these will fail as
// queryTypes is using the same slice for the different block queriers
// and would have been closed in the previous iteration.
- q.Close()
+ q.querier.Close()
}
}()
@@ -1991,21 +1995,38 @@ func BenchmarkQueries(b *testing.B) {
qs = append(qs, q)
}
- queryTypes["_1-Block"] = storage.NewMergeQuerier(qs[:1], nil, storage.ChainedSeriesMerge)
- queryTypes["_3-Blocks"] = storage.NewMergeQuerier(qs[0:3], nil, storage.ChainedSeriesMerge)
- queryTypes["_10-Blocks"] = storage.NewMergeQuerier(qs, nil, storage.ChainedSeriesMerge)
+ queryTypes = append(queryTypes, qt{"_1-Block", storage.NewMergeQuerier(qs[:1], nil, storage.ChainedSeriesMerge)})
+ queryTypes = append(queryTypes, qt{"_3-Blocks", storage.NewMergeQuerier(qs[0:3], nil, storage.ChainedSeriesMerge)})
+ queryTypes = append(queryTypes, qt{"_10-Blocks", storage.NewMergeQuerier(qs, nil, storage.ChainedSeriesMerge)})
chunkDir := b.TempDir()
head := createHead(b, nil, series, chunkDir)
- qHead, err := NewBlockQuerier(head, 1, nSamples)
+ qHead, err := NewBlockQuerier(NewRangeHead(head, 1, nSamples), 1, nSamples)
require.NoError(b, err)
- queryTypes["_Head"] = qHead
+ queryTypes = append(queryTypes, qt{"_Head", qHead})
- for qtype, querier := range queryTypes {
- b.Run(title+qtype+"_nSeries:"+strconv.Itoa(nSeries)+"_nSamples:"+strconv.Itoa(int(nSamples)), func(b *testing.B) {
+ for _, oooPercentage := range []int{1, 3, 5, 10} {
+ chunkDir := b.TempDir()
+ totalOOOSamples := oooPercentage * int(nSamples) / 100
+ oooSampleFrequency := int(nSamples) / totalOOOSamples
+ head := createHeadWithOOOSamples(b, nil, series, chunkDir, oooSampleFrequency)
+ qHead, err := NewBlockQuerier(NewRangeHead(head, 1, nSamples), 1, nSamples)
+ require.NoError(b, err)
+ qOOOHead, err := NewBlockQuerier(NewOOORangeHead(head, 1, nSamples), 1, nSamples)
+ require.NoError(b, err)
+ queryTypes = append(queryTypes, qt{
+ fmt.Sprintf("_Head_oooPercent:%d", oooPercentage),
+ storage.NewMergeQuerier([]storage.Querier{qHead, qOOOHead}, nil, storage.ChainedSeriesMerge),
+ })
+ }
+ for _, q := range queryTypes {
+ b.Run(title+q.typ+"_nSeries:"+strconv.Itoa(nSeries)+"_nSamples:"+strconv.Itoa(int(nSamples)), func(b *testing.B) {
expExpansions, err := strconv.Atoi(string(title[len(title)-1]))
require.NoError(b, err)
- benchQuery(b, expExpansions, querier, selectors)
+ benchQuery(b, expExpansions, q.querier, selectors)
})
}
require.NoError(b, head.Close())
@@ -2025,6 +2046,7 @@ func benchQuery(b *testing.B, expExpansions int, q storage.Querier, selectors la
s.Labels()
it := s.Iterator()
for it.Next() {
+ _, _ = it.At()
}
actualExpansions++
}

tsdb/record/record.go

@@ -43,6 +43,8 @@ const (
Tombstones Type = 3
// Exemplars is used to match WAL records of type Exemplars.
Exemplars Type = 4
+ // MmapMarkers is used to match OOO WBL records of type MmapMarkers.
+ MmapMarkers Type = 5
// Metadata is used to match WAL records of type Metadata.
Metadata Type = 6
)
@@ -57,6 +59,8 @@ func (rt Type) String() string {
return "exemplars"
case Tombstones:
return "tombstones"
+ case MmapMarkers:
+ return "mmapmarkers"
case Metadata:
return "metadata"
default:
@@ -157,6 +161,12 @@ type RefExemplar struct {
Labels labels.Labels
}
+ // RefMmapMarker marks that all the samples of the given series until now have been m-mapped to disk.
+ type RefMmapMarker struct {
+ Ref chunks.HeadSeriesRef
+ MmapRef chunks.ChunkDiskMapperRef
+ }
// Decoder decodes series, sample, metadata and tombstone records.
// The zero value is ready to use.
type Decoder struct{}
@@ -168,7 +178,7 @@ func (d *Decoder) Type(rec []byte) Type {
return Unknown
}
switch t := Type(rec[0]); t {
- case Series, Samples, Tombstones, Exemplars, Metadata:
+ case Series, Samples, Tombstones, Exemplars, MmapMarkers, Metadata:
return t
}
return Unknown
@@ -354,6 +364,34 @@ func (d *Decoder) ExemplarsFromBuffer(dec *encoding.Decbuf, exemplars []RefExemp
return exemplars, nil
}
+ func (d *Decoder) MmapMarkers(rec []byte, markers []RefMmapMarker) ([]RefMmapMarker, error) {
+ dec := encoding.Decbuf{B: rec}
+ t := Type(dec.Byte())
+ if t != MmapMarkers {
+ return nil, errors.New("invalid record type")
+ }
+ if dec.Len() == 0 {
+ return markers, nil
+ }
+ for len(dec.B) > 0 && dec.Err() == nil {
+ ref := chunks.HeadSeriesRef(dec.Be64())
+ mmapRef := chunks.ChunkDiskMapperRef(dec.Be64())
+ markers = append(markers, RefMmapMarker{
+ Ref: ref,
+ MmapRef: mmapRef,
+ })
+ }
+ if dec.Err() != nil {
+ return nil, errors.Wrapf(dec.Err(), "decode error after %d mmap markers", len(markers))
+ }
+ if len(dec.B) > 0 {
+ return nil, errors.Errorf("unexpected %d bytes left in entry", len(dec.B))
+ }
+ return markers, nil
+ }
// Encoder encodes series, sample, and tombstones records.
// The zero value is ready to use.
type Encoder struct{}
@@ -467,3 +505,15 @@ func (e *Encoder) EncodeExemplarsIntoBuffer(exemplars []RefExemplar, buf *encodi
EncodeLabels(buf, ex.Labels)
}
}
+ func (e *Encoder) MmapMarkers(markers []RefMmapMarker, b []byte) []byte {
+ buf := encoding.Encbuf{B: b}
+ buf.PutByte(byte(MmapMarkers))
+ for _, s := range markers {
+ buf.PutBE64(uint64(s.Ref))
+ buf.PutBE64(uint64(s.MmapRef))
+ }
+ return buf.Get()
+ }

tsdb/wal/wal.go

@ -40,6 +40,7 @@ const (
DefaultSegmentSize = 128 * 1024 * 1024 // 128 MB DefaultSegmentSize = 128 * 1024 * 1024 // 128 MB
pageSize = 32 * 1024 // 32KB pageSize = 32 * 1024 // 32KB
recordHeaderSize = 7 recordHeaderSize = 7
WblDirName = "wbl"
) )
// The table gets initialized with sync.Once but may still cause a race // The table gets initialized with sync.Once but may still cause a race
@@ -204,32 +205,32 @@ func newWALMetrics(r prometheus.Registerer) *walMetrics {
 	m := &walMetrics{}
 
 	m.fsyncDuration = prometheus.NewSummary(prometheus.SummaryOpts{
-		Name:       "prometheus_tsdb_wal_fsync_duration_seconds",
+		Name:       "fsync_duration_seconds",
 		Help:       "Duration of WAL fsync.",
 		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
 	})
 	m.pageFlushes = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_wal_page_flushes_total",
+		Name: "page_flushes_total",
 		Help: "Total number of page flushes.",
 	})
 	m.pageCompletions = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_wal_completed_pages_total",
+		Name: "completed_pages_total",
 		Help: "Total number of completed pages.",
 	})
 	m.truncateFail = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_wal_truncations_failed_total",
+		Name: "truncations_failed_total",
 		Help: "Total number of WAL truncations that failed.",
 	})
 	m.truncateTotal = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_wal_truncations_total",
+		Name: "truncations_total",
 		Help: "Total number of WAL truncations attempted.",
 	})
 	m.currentSegment = prometheus.NewGauge(prometheus.GaugeOpts{
-		Name: "prometheus_tsdb_wal_segment_current",
+		Name: "segment_current",
 		Help: "WAL segment index that TSDB is currently writing to.",
 	})
 	m.writesFailed = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_wal_writes_failed_total",
+		Name: "writes_failed_total",
 		Help: "Total number of WAL writes that failed.",
 	})
@@ -274,7 +275,11 @@ func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSi
 		stopc:    make(chan chan struct{}),
 		compress: compress,
 	}
-	w.metrics = newWALMetrics(reg)
+	prefix := "prometheus_tsdb_wal_"
+	if filepath.Base(dir) == WblDirName {
+		prefix = "prometheus_tsdb_out_of_order_wal_"
+	}
+	w.metrics = newWALMetrics(prometheus.WrapRegistererWithPrefix(prefix, reg))
 
 	_, last, err := Segments(w.Dir())
 	if err != nil {
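The metric renames above are not a user-facing change: `NewSize` now wraps the registerer with a prefix, so a regular WAL keeps exposing the existing `prometheus_tsdb_wal_*` names, while a WAL living in the `wbl` directory reports under `prometheus_tsdb_out_of_order_wal_*`. A short sketch of how `WrapRegistererWithPrefix` behaves — the registry and counter here are hypothetical, not part of this patch:

	reg := prometheus.NewRegistry()

	// Same pattern as NewSize: the prefix is attached at registration time.
	wrapped := prometheus.WrapRegistererWithPrefix("prometheus_tsdb_out_of_order_wal_", reg)
	wrapped.MustRegister(prometheus.NewCounter(prometheus.CounterOpts{
		Name: "truncations_total", // exposed as prometheus_tsdb_out_of_order_wal_truncations_total
		Help: "Total number of WAL truncations attempted.",
	}))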
@@ -459,36 +464,46 @@ func SegmentName(dir string, i int) string {
 	return filepath.Join(dir, fmt.Sprintf("%08d", i))
 }
 
-// NextSegment creates the next segment and closes the previous one.
-func (w *WAL) NextSegment() error {
+// NextSegment creates the next segment and closes the previous one asynchronously.
+// It returns the file number of the new file.
+func (w *WAL) NextSegment() (int, error) {
 	w.mtx.Lock()
 	defer w.mtx.Unlock()
-	return w.nextSegment()
+	return w.nextSegment(true)
+}
+
+// NextSegmentSync creates the next segment and closes the previous one in sync.
+// It returns the file number of the new file.
+func (w *WAL) NextSegmentSync() (int, error) {
+	w.mtx.Lock()
+	defer w.mtx.Unlock()
+	return w.nextSegment(false)
 }
 
 // nextSegment creates the next segment and closes the previous one.
-func (w *WAL) nextSegment() error {
+// It returns the file number of the new file.
+func (w *WAL) nextSegment(async bool) (int, error) {
 	if w.closed {
-		return errors.New("wal is closed")
+		return 0, errors.New("wal is closed")
 	}
 
 	// Only flush the current page if it actually holds data.
 	if w.page.alloc > 0 {
 		if err := w.flushPage(true); err != nil {
-			return err
+			return 0, err
 		}
 	}
 	next, err := CreateSegment(w.Dir(), w.segment.Index()+1)
 	if err != nil {
-		return errors.Wrap(err, "create new segment file")
+		return 0, errors.Wrap(err, "create new segment file")
 	}
 	prev := w.segment
 	if err := w.setSegment(next); err != nil {
-		return err
+		return 0, err
 	}
 
 	// Don't block further writes by fsyncing the last segment.
-	w.actorc <- func() {
+	f := func() {
 		if err := w.fsync(prev); err != nil {
 			level.Error(w.logger).Log("msg", "sync previous segment", "err", err)
 		}
@@ -496,7 +511,12 @@ func (w *WAL) nextSegment() error {
 			level.Error(w.logger).Log("msg", "close previous segment", "err", err)
 		}
 	}
-	return nil
+	if async {
+		w.actorc <- f
+	} else {
+		f()
+	}
+	return next.Index(), nil
 }
 
 func (w *WAL) setSegment(segment *Segment) error {
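The `async` flag only decides where the fsync-and-close of the previous segment runs: `NextSegment` hands it to the actor goroutine so appends are not blocked, while `NextSegmentSync` runs it inline before returning. A usage sketch, assuming an already-open `*WAL` named `w`:

	// Production path: the previous segment is synced in the background.
	idx, err := w.NextSegment()
	if err != nil {
		// handle error
	}

	// Test path: the previous segment is fully synced and closed on return,
	// which sidesteps platform-dependent timing (e.g. on Windows).
	idx, err = w.NextSegmentSync()
	_ = idx // index of the newly created segment file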
@@ -638,7 +658,7 @@ func (w *WAL) log(rec []byte, final bool) error {
 	left += (pageSize - recordHeaderSize) * (w.pagesPerSegment() - w.donePages - 1) // Free pages in the active segment.
 	if len(rec) > left {
-		if err := w.nextSegment(); err != nil {
+		if _, err := w.nextSegment(true); err != nil {
 			return err
 		}
 	}
@@ -745,6 +765,13 @@ func (w *WAL) fsync(f *Segment) error {
 	return err
 }
 
+// Sync forces a file sync on the current WAL segment. It is meant to be used
+// only in tests, because fsync behaviour differs across operating systems
+// such as Windows and Linux.
+func (w *WAL) Sync() error {
+	return w.fsync(w.segment)
+}
+
 // Close flushes all writes and closes active segment.
 func (w *WAL) Close() (err error) {
 	w.mtx.Lock()
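`Sync` gives tests a way to force the active segment onto disk before reading it back or truncating, instead of relying on implicit fsync timing that varies per OS. A hypothetical test fragment (`rec` stands in for any encoded record):

	require.NoError(t, w.Log(rec))
	// Force the active segment to disk so its bytes are visible to a
	// reader opened on the same file, regardless of OS buffering.
	require.NoError(t, w.Sync())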


@@ -364,14 +364,16 @@ func TestReadCheckpoint(t *testing.T) {
 	err := os.Mkdir(wdir, 0o777)
 	require.NoError(t, err)
 
-	os.Create(SegmentName(wdir, 30))
+	f, err := os.Create(SegmentName(wdir, 30))
+	require.NoError(t, err)
+	require.NoError(t, f.Close())
 
 	enc := record.Encoder{}
 	w, err := NewSize(nil, nil, wdir, 128*pageSize, compress)
 	require.NoError(t, err)
-	defer func() {
+	t.Cleanup(func() {
 		require.NoError(t, w.Close())
-	}()
+	})
 
 	// Write to the initial segment then checkpoint.
 	for i := 0; i < seriesCount; i++ {
@@ -396,8 +398,11 @@
 			require.NoError(t, w.Log(sample))
 		}
 	}
-	Checkpoint(log.NewNopLogger(), w, 30, 31, func(x chunks.HeadSeriesRef) bool { return true }, 0)
-	w.Truncate(32)
+	_, err = w.NextSegmentSync()
+	require.NoError(t, err)
+	_, err = Checkpoint(log.NewNopLogger(), w, 30, 31, func(x chunks.HeadSeriesRef) bool { return true }, 0)
+	require.NoError(t, err)
+	require.NoError(t, w.Truncate(32))
 
 	// Start read after checkpoint, no more data written.
 	_, _, err = Segments(w.Dir())


@@ -2314,7 +2314,7 @@ func (f *fakeDB) Stats(statsByLabelName string) (_ *tsdb.Stats, retErr error) {
 	}()
 	opts := tsdb.DefaultHeadOptions()
 	opts.ChunkRange = 1000
-	h, _ := tsdb.NewHead(nil, nil, nil, opts, nil)
+	h, _ := tsdb.NewHead(nil, nil, nil, nil, opts, nil)
 	return h.Stats(statsByLabelName), nil
 }
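The extra `nil` here is the new write-behind log (WBL) argument that `NewHead` now takes for out-of-order samples; callers that do not enable OOO ingestion simply pass `nil`. A sketch, assuming the updated signature shown in the hunk above:

	opts := tsdb.DefaultHeadOptions()
	opts.ChunkRange = 1000
	// Arguments: registerer, logger, wal, wbl, opts, stats.
	// wbl is nil when out-of-order ingestion is disabled.
	h, err := tsdb.NewHead(nil, nil, nil, nil, opts, nil)
	if err != nil {
		// handle error
	}
	defer h.Close()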