Add out-of-order sample support to the TSDB (#11075)

* Introduce out-of-order TSDB support This implementation is based on this design doc: https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing This commit adds support to accept out-of-order ("OOO") sample into the TSDB up to a configurable time allowance. If OOO is enabled, overlapping querying are automatically enabled. Most of the additions have been borrowed from https://github.com/grafana/mimir-prometheus/ Here is the list ist of the original commits cherry picked from mimir-prometheus into this branch: - 4b2198d7ec - 2836e5513f - 00b379c3a5 - ff0dc75758 - a632c73352 - c6f3d4ab33 - 5e8406a1d4 - abde1e0ba1 - e70e769889 - df59320886 Co-authored-by: Jesus Vazquez <jesus.vazquez@grafana.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * gofumpt files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add license header to missing files Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO tests due to existing chunk disk mapper implementation Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix truncate int overflow Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Add Sync method to the WAL and update tests Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * remove useless sync Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Update minOOOTime after truncating Head * Update minOOOTime after truncating Head Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix lint Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Add a unit test Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Load OutOfOrderTimeWindow only once per appender Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix OOO Head LabelValues and PostingsForMatchers Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix replay of OOO mmap chunks Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Remove unnecessary err check Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Prevent panic with ApplyConfig Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run OOO compaction after restart if there is OOO data from WBL Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Apply Bartek's suggestions Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Refactor OOO compaction Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address comments and TODOs - Added a comment explaining why we need the allow overlapping compaction toggle - Clarified TSDBConfig OutOfOrderTimeWindow doc - Added an owner to all the TODOs in the code Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Run go format Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix remaining review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix tests Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Change wbl reference when truncating ooo in TestHeadMinOOOTimeUpdate Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> * Fix TestWBLAndMmapReplay test failure on windows Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Address most of the feedback Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Refactor the block meta for out of order Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix windows error Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> * Fix review comments Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Jesus Vazquez <jesus.vazquez@grafana.com> Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com> Signed-off-by: Ganesh Vernekar 15064823+codesome@users.noreply.github.com Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com> Co-authored-by: Ganesh Vernekar <ganeshvern@gmail.com> Co-authored-by: Dieter Plaetinck <dieter@grafana.com> Co-authored-by: Oleg Zaytsev <mail@olegzaytsev.com> Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
2025-03-05 20:59:13 -08:00 · 2022-09-20 19:05:50 +02:00 · 2022-09-20 19:05:50 +02:00 · c1b669bf9b
parent af6167df58
commit c1b669bf9b
38 changed files with 6655 additions and 380 deletions
--- a/cmd/prometheus/main.go
+++ b/cmd/prometheus/main.go
@ -463,6 +463,9 @@ func main() {
 		}
 		cfg.tsdb.MaxExemplars = int64(cfgFile.StorageConfig.ExemplarsConfig.MaxExemplars)
 	}
+	if cfgFile.StorageConfig.TSDBConfig != nil {
+		cfg.tsdb.OutOfOrderTimeWindow = cfgFile.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
+	}

 	// Now that the validity of the config is established, set the config
 	// success metrics accordingly, although the config isn't really loaded
@ -1537,6 +1540,7 @@ type tsdbOptions struct {
 	StripeSize                     int
 	MinBlockDuration               model.Duration
 	MaxBlockDuration               model.Duration
+	OutOfOrderTimeWindow           int64
 	EnableExemplarStorage          bool
 	MaxExemplars                   int64
 	EnableMemorySnapshotOnShutdown bool
@ -1549,7 +1553,8 @@ func (opts tsdbOptions) ToTSDBOptions() tsdb.Options {
 		RetentionDuration:              int64(time.Duration(opts.RetentionDuration) / time.Millisecond),
 		MaxBytes:                       int64(opts.MaxBytes),
 		NoLockfile:                     opts.NoLockfile,
-		AllowOverlappingBlocks:         opts.AllowOverlappingBlocks,
+		AllowOverlappingCompaction:     opts.AllowOverlappingBlocks,
+		AllowOverlappingQueries:        opts.AllowOverlappingBlocks,
 		WALCompression:                 opts.WALCompression,
 		HeadChunksWriteQueueSize:       opts.HeadChunksWriteQueueSize,
 		StripeSize:                     opts.StripeSize,
@ -1558,6 +1563,7 @@ func (opts tsdbOptions) ToTSDBOptions() tsdb.Options {
 		EnableExemplarStorage:          opts.EnableExemplarStorage,
 		MaxExemplars:                   opts.MaxExemplars,
 		EnableMemorySnapshotOnShutdown: opts.EnableMemorySnapshotOnShutdown,
+		OutOfOrderTimeWindow:           opts.OutOfOrderTimeWindow,
 	}
 }

--- a/cmd/promtool/rules_test.go
+++ b/cmd/promtool/rules_test.go
@ -117,7 +117,8 @@ func TestBackfillRuleIntegration(t *testing.T) {
 				}

 				opts := tsdb.DefaultOptions()
-				opts.AllowOverlappingBlocks = true
+				opts.AllowOverlappingQueries = true
+				opts.AllowOverlappingCompaction = true
 				db, err := tsdb.Open(tmpDir, nil, nil, opts, nil)
 				require.NoError(t, err)

@ -245,7 +246,8 @@ func TestBackfillLabels(t *testing.T) {
 	}

 	opts := tsdb.DefaultOptions()
-	opts.AllowOverlappingBlocks = true
+	opts.AllowOverlappingQueries = true
+	opts.AllowOverlappingCompaction = true
 	db, err := tsdb.Open(tmpDir, nil, nil, opts, nil)
 	require.NoError(t, err)

--- a/cmd/promtool/tsdb.go
+++ b/cmd/promtool/tsdb.go
@ -597,7 +597,7 @@ func analyzeCompaction(block tsdb.BlockReader, indexr tsdb.IndexReader) (err err

 		for _, chk := range chks {
 			// Load the actual data of the chunk.
-			chk, err := chunkr.Chunk(chk.Ref)
+			chk, err := chunkr.Chunk(chk)
 			if err != nil {
 				return err
 			}
--- a/config/config.go
+++ b/config/config.go
@ -501,9 +501,37 @@ func (c *ScrapeConfig) MarshalYAML() (interface{}, error) {

 // StorageConfig configures runtime reloadable configuration options.
 type StorageConfig struct {
+	TSDBConfig      *TSDBConfig      `yaml:"tsdb,omitempty"`
 	ExemplarsConfig *ExemplarsConfig `yaml:"exemplars,omitempty"`
 }

+// TSDBConfig configures runtime reloadable configuration options.
+type TSDBConfig struct {
+	// OutOfOrderTimeWindow sets how long back in time an out-of-order sample can be inserted
+	// into the TSDB. This flag is typically set while unmarshaling the configuration file and translating
+	// OutOfOrderTimeWindowFlag's duration. The unit of this flag is expected to be the same as any
+	// other timestamp in the TSDB.
+	OutOfOrderTimeWindow int64
+
+	// OutOfOrderTimeWindowFlag holds the parsed duration from the config file.
+	// During unmarshall, this is converted into milliseconds and stored in OutOfOrderTimeWindow.
+	// This should not be used directly and must be converted into OutOfOrderTimeWindow.
+	OutOfOrderTimeWindowFlag model.Duration `yaml:"out_of_order_time_window,omitempty"`
+}
+
+// UnmarshalYAML implements the yaml.Unmarshaler interface.
+func (t *TSDBConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	*t = TSDBConfig{}
+	type plain TSDBConfig
+	if err := unmarshal((*plain)(t)); err != nil {
+		return err
+	}
+
+	t.OutOfOrderTimeWindow = time.Duration(t.OutOfOrderTimeWindowFlag).Milliseconds()
+
+	return nil
+}
+
 type TracingClientType string

 const (
--- a/storage/interface.go
+++ b/storage/interface.go
@ -27,10 +27,15 @@ import (

 // The errors exposed.
 var (
-	ErrNotFound                    = errors.New("not found")
-	ErrOutOfOrderSample            = errors.New("out of order sample")
+	ErrNotFound = errors.New("not found")
+	// ErrOutOfOrderSample is when out of order support is disabled and the sample is out of order.
+	ErrOutOfOrderSample = errors.New("out of order sample")
+	// ErrOutOfBounds is when out of order support is disabled and the sample is older than the min valid time for the append.
+	ErrOutOfBounds = errors.New("out of bounds")
+	// ErrTooOldSample is when out of order support is enabled but the sample is outside the time window allowed.
+	ErrTooOldSample = errors.New("too old sample")
+	// ErrDuplicateSampleForTimestamp is when the sample has same timestamp but different value.
 	ErrDuplicateSampleForTimestamp = errors.New("duplicate sample for timestamp")
-	ErrOutOfBounds                 = errors.New("out of bounds")
 	ErrOutOfOrderExemplar          = errors.New("out of order exemplar")
 	ErrDuplicateExemplar           = errors.New("duplicate exemplar")
 	ErrExemplarLabelLength         = fmt.Errorf("label length for exemplar exceeds maximum of %d UTF-8 characters", exemplar.ExemplarMaxLabelSetLength)
--- a/storage/merge.go
+++ b/storage/merge.go
@ -717,3 +717,56 @@ func (h *chunkIteratorHeap) Pop() interface{} {
 	*h = old[0 : n-1]
 	return x
 }
+
+// NewConcatenatingChunkSeriesMerger returns a VerticalChunkSeriesMergeFunc that simply concatenates the
+// chunks from the series. The resultant stream of chunks for a series might be overlapping and unsorted.
+func NewConcatenatingChunkSeriesMerger() VerticalChunkSeriesMergeFunc {
+	return func(series ...ChunkSeries) ChunkSeries {
+		if len(series) == 0 {
+			return nil
+		}
+		return &ChunkSeriesEntry{
+			Lset: series[0].Labels(),
+			ChunkIteratorFn: func() chunks.Iterator {
+				iterators := make([]chunks.Iterator, 0, len(series))
+				for _, s := range series {
+					iterators = append(iterators, s.Iterator())
+				}
+				return &concatenatingChunkIterator{
+					iterators: iterators,
+				}
+			},
+		}
+	}
+}
+
+type concatenatingChunkIterator struct {
+	iterators []chunks.Iterator
+	idx       int
+
+	curr chunks.Meta
+}
+
+func (c *concatenatingChunkIterator) At() chunks.Meta {
+	return c.curr
+}
+
+func (c *concatenatingChunkIterator) Next() bool {
+	if c.idx >= len(c.iterators) {
+		return false
+	}
+	if c.iterators[c.idx].Next() {
+		c.curr = c.iterators[c.idx].At()
+		return true
+	}
+	c.idx++
+	return c.Next()
+}
+
+func (c *concatenatingChunkIterator) Err() error {
+	errs := tsdb_errors.NewMulti()
+	for _, iter := range c.iterators {
+		errs.Add(iter.Err())
+	}
+	return errs.Err()
+}
--- a/storage/merge_test.go
+++ b/storage/merge_test.go
@ -499,6 +499,140 @@ func TestCompactingChunkSeriesMerger(t *testing.T) {
 	}
 }

+func TestConcatenatingChunkSeriesMerger(t *testing.T) {
+	m := NewConcatenatingChunkSeriesMerger()
+
+	for _, tc := range []struct {
+		name     string
+		input    []ChunkSeries
+		expected ChunkSeries
+	}{
+		{
+			name: "single empty series",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
+		},
+		{
+			name: "single series",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}}),
+		},
+		{
+			name: "two empty series",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), nil, nil),
+		},
+		{
+			name: "two non overlapping",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{5, 5}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{5, 5}}, []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}),
+		},
+		{
+			name: "two overlapping",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{8, 8}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}}, []tsdbutil.Sample{sample{3, 3}, sample{8, 8}},
+				[]tsdbutil.Sample{sample{7, 7}, sample{9, 9}}, []tsdbutil.Sample{sample{10, 10}},
+			),
+		},
+		{
+			name: "two duplicated",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{5, 5}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}},
+				[]tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{5, 5}},
+			),
+		},
+		{
+			name: "three overlapping",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{6, 6}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{0, 0}, sample{4, 4}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}},
+				[]tsdbutil.Sample{sample{2, 2}, sample{3, 3}, sample{6, 6}},
+				[]tsdbutil.Sample{sample{0, 0}, sample{4, 4}},
+			),
+		},
+		{
+			name: "three in chained overlap",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{4, 4}, sample{6, 66}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{6, 6}, sample{10, 10}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				[]tsdbutil.Sample{sample{1, 1}, sample{2, 2}, sample{3, 3}, sample{5, 5}},
+				[]tsdbutil.Sample{sample{4, 4}, sample{6, 66}},
+				[]tsdbutil.Sample{sample{6, 6}, sample{10, 10}},
+			),
+		},
+		{
+			name: "three in chained overlap complex",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{0, 0}, sample{5, 5}}, []tsdbutil.Sample{sample{10, 10}, sample{15, 15}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{2, 2}, sample{20, 20}}, []tsdbutil.Sample{sample{25, 25}, sample{30, 30}}),
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{sample{18, 18}, sample{26, 26}}, []tsdbutil.Sample{sample{31, 31}, sample{35, 35}}),
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				[]tsdbutil.Sample{sample{0, 0}, sample{5, 5}}, []tsdbutil.Sample{sample{10, 10}, sample{15, 15}},
+				[]tsdbutil.Sample{sample{2, 2}, sample{20, 20}}, []tsdbutil.Sample{sample{25, 25}, sample{30, 30}},
+				[]tsdbutil.Sample{sample{18, 18}, sample{26, 26}}, []tsdbutil.Sample{sample{31, 31}, sample{35, 35}},
+			),
+		},
+		{
+			name: "110 overlapping",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(0, 110)), // [0 - 110)
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(60, 50)), // [60 - 110)
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				tsdbutil.GenerateSamples(0, 110),
+				tsdbutil.GenerateSamples(60, 50),
+			),
+		},
+		{
+			name: "150 overlapping samples, simply concatenated and no splits",
+			input: []ChunkSeries{
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(0, 90)),  // [0 - 90)
+				NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), tsdbutil.GenerateSamples(60, 90)), // [90 - 150)
+			},
+			expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"),
+				tsdbutil.GenerateSamples(0, 90),
+				tsdbutil.GenerateSamples(60, 90),
+			),
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			merged := m(tc.input...)
+			require.Equal(t, tc.expected.Labels(), merged.Labels())
+			actChks, actErr := ExpandChunks(merged.Iterator())
+			expChks, expErr := ExpandChunks(tc.expected.Iterator())
+
+			require.Equal(t, expErr, actErr)
+			require.Equal(t, expChks, actChks)
+		})
+	}
+}
+
 type mockQuerier struct {
 	LabelQuerier

--- a/tsdb/agent/db.go
+++ b/tsdb/agent/db.go
@ -567,8 +567,7 @@ func (db *DB) truncate(mint int64) error {

 	// Start a new segment so low ingestion volume instances don't have more WAL
 	// than needed.
-	err = db.wal.NextSegment()
-	if err != nil {
+	if _, err := db.wal.NextSegment(); err != nil {
 		return errors.Wrap(err, "next segment")
 	}

--- a/tsdb/block.go
+++ b/tsdb/block.go
@ -116,7 +116,7 @@ type ChunkWriter interface {
 // ChunkReader provides reading access of serialized time series data.
 type ChunkReader interface {
 	// Chunk returns the series data chunk with the given reference.
-	Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error)
+	Chunk(meta chunks.Meta) (chunkenc.Chunk, error)

 	// Close releases all underlying resources of the reader.
 	Close() error
@ -189,12 +189,39 @@ type BlockMetaCompaction struct {
 	// this block.
 	Parents []BlockDesc `json:"parents,omitempty"`
 	Failed  bool        `json:"failed,omitempty"`
+	// Additional information about the compaction, for example, block created from out-of-order chunks.
+	Hints []string `json:"hints,omitempty"`
+}
+
+func (bm *BlockMetaCompaction) SetOutOfOrder() {
+	if bm.containsHint(CompactionHintFromOutOfOrder) {
+		return
+	}
+	bm.Hints = append(bm.Hints, CompactionHintFromOutOfOrder)
+	sort.Strings(bm.Hints)
+}
+
+func (bm *BlockMetaCompaction) FromOutOfOrder() bool {
+	return bm.containsHint(CompactionHintFromOutOfOrder)
+}
+
+func (bm *BlockMetaCompaction) containsHint(hint string) bool {
+	for _, h := range bm.Hints {
+		if h == hint {
+			return true
+		}
+	}
+	return false
 }

 const (
 	indexFilename = "index"
 	metaFilename  = "meta.json"
 	metaVersion1  = 1
+
+	// CompactionHintFromOutOfOrder is a hint noting that the block
+	// was created from out-of-order chunks.
+	CompactionHintFromOutOfOrder = "from-out-of-order"
 )

 func chunkDir(dir string) string { return filepath.Join(dir, "chunks") }
--- a/tsdb/block_test.go
+++ b/tsdb/block_test.go
@ -27,6 +27,7 @@ import (
 	"testing"

 	"github.com/go-kit/log"
+	prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
 	"github.com/stretchr/testify/require"

 	"github.com/prometheus/prometheus/model/labels"
@ -487,7 +488,7 @@ func createBlockFromHead(tb testing.TB, dir string, head *Head) string {
 func createHead(tb testing.TB, w *wal.WAL, series []storage.Series, chunkDir string) *Head {
 	opts := DefaultHeadOptions()
 	opts.ChunkDirRoot = chunkDir
-	head, err := NewHead(nil, nil, w, opts, nil)
+	head, err := NewHead(nil, nil, w, nil, opts, nil)
 	require.NoError(tb, err)

 	app := head.Appender(context.Background())
@ -506,6 +507,66 @@ func createHead(tb testing.TB, w *wal.WAL, series []storage.Series, chunkDir str
 	return head
 }

+func createHeadWithOOOSamples(tb testing.TB, w *wal.WAL, series []storage.Series, chunkDir string, oooSampleFrequency int) *Head {
+	opts := DefaultHeadOptions()
+	opts.ChunkDirRoot = chunkDir
+	opts.OutOfOrderTimeWindow.Store(10000000000)
+	head, err := NewHead(nil, nil, w, nil, opts, nil)
+	require.NoError(tb, err)
+
+	oooSampleLabels := make([]labels.Labels, 0, len(series))
+	oooSamples := make([]tsdbutil.SampleSlice, 0, len(series))
+
+	totalSamples := 0
+	app := head.Appender(context.Background())
+	for _, s := range series {
+		ref := storage.SeriesRef(0)
+		it := s.Iterator()
+		lset := s.Labels()
+		os := tsdbutil.SampleSlice{}
+		count := 0
+		for it.Next() {
+			totalSamples++
+			count++
+			t, v := it.At()
+			if count%oooSampleFrequency == 0 {
+				os = append(os, sample{t: t, v: v})
+				continue
+			}
+			ref, err = app.Append(ref, lset, t, v)
+			require.NoError(tb, err)
+		}
+		require.NoError(tb, it.Err())
+		if len(os) > 0 {
+			oooSampleLabels = append(oooSampleLabels, lset)
+			oooSamples = append(oooSamples, os)
+		}
+	}
+	require.NoError(tb, app.Commit())
+
+	oooSamplesAppended := 0
+	require.Equal(tb, float64(0), prom_testutil.ToFloat64(head.metrics.outOfOrderSamplesAppended))
+
+	app = head.Appender(context.Background())
+	for i, lset := range oooSampleLabels {
+		ref := storage.SeriesRef(0)
+		for _, sample := range oooSamples[i] {
+			ref, err = app.Append(ref, lset, sample.T(), sample.V())
+			require.NoError(tb, err)
+			oooSamplesAppended++
+		}
+	}
+	require.NoError(tb, app.Commit())
+
+	actOOOAppended := prom_testutil.ToFloat64(head.metrics.outOfOrderSamplesAppended)
+	require.GreaterOrEqual(tb, actOOOAppended, float64(oooSamplesAppended-len(series)))
+	require.LessOrEqual(tb, actOOOAppended, float64(oooSamplesAppended))
+
+	require.Equal(tb, float64(totalSamples), prom_testutil.ToFloat64(head.metrics.samplesAppended))
+
+	return head
+}
+
 const (
 	defaultLabelName  = "labelName"
 	defaultLabelValue = "labelValue"
--- a/tsdb/blockwriter.go
+++ b/tsdb/blockwriter.go
@ -39,7 +39,7 @@ type BlockWriter struct {
 }

 // ErrNoSeriesAppended is returned if the series count is zero while flushing blocks.
-var ErrNoSeriesAppended error = errors.New("no series appended, aborting")
+var ErrNoSeriesAppended = errors.New("no series appended, aborting")

 // NewBlockWriter create a new block writer.
 //
@ -71,7 +71,7 @@ func (w *BlockWriter) initHead() error {
 	opts := DefaultHeadOptions()
 	opts.ChunkRange = w.blockSize
 	opts.ChunkDirRoot = w.chunkDir
-	h, err := NewHead(nil, w.logger, nil, opts, NewHeadStats())
+	h, err := NewHead(nil, w.logger, nil, nil, opts, NewHeadStats())
 	if err != nil {
 		return errors.Wrap(err, "tsdb.NewHead")
 	}
--- a/tsdb/chunkenc/chunk.go
+++ b/tsdb/chunkenc/chunk.go
@ -39,6 +39,21 @@ const (
 	EncXOR
 )

+// Chunk encodings for out-of-order chunks.
+// These encodings must be only used by the Head block for its internal bookkeeping.
+const (
+	OutOfOrderMask = 0b10000000
+	EncOOOXOR      = EncXOR | OutOfOrderMask
+)
+
+func IsOutOfOrderChunk(e Encoding) bool {
+	return (e & OutOfOrderMask) != 0
+}
+
+func IsValidEncoding(e Encoding) bool {
+	return e == EncXOR || e == EncOOOXOR
+}
+
 // Chunk holds a sequence of sample pairs that can be iterated over and appended to.
 type Chunk interface {
 	// Bytes returns the underlying byte slice of the chunk.
@ -155,7 +170,7 @@ func NewPool() Pool {

 func (p *pool) Get(e Encoding, b []byte) (Chunk, error) {
 	switch e {
-	case EncXOR:
+	case EncXOR, EncOOOXOR:
 		c := p.xor.Get().(*XORChunk)
 		c.b.stream = b
 		c.b.count = 0
@ -166,7 +181,7 @@ func (p *pool) Get(e Encoding, b []byte) (Chunk, error) {

 func (p *pool) Put(c Chunk) error {
 	switch c.Encoding() {
-	case EncXOR:
+	case EncXOR, EncOOOXOR:
 		xc, ok := c.(*XORChunk)
 		// This may happen often with wrapped chunks. Nothing we can really do about
 		// it but returning an error would cause a lot of allocations again. Thus,
@ -188,7 +203,7 @@ func (p *pool) Put(c Chunk) error {
 // bytes.
 func FromData(e Encoding, d []byte) (Chunk, error) {
 	switch e {
-	case EncXOR:
+	case EncXOR, EncOOOXOR:
 		return &XORChunk{b: bstream{count: 0, stream: d}}, nil
 	}
 	return nil, errors.Errorf("invalid chunk encoding %q", e)
--- a/tsdb/chunkenc/xor.go
+++ b/tsdb/chunkenc/xor.go
@ -457,3 +457,12 @@ func (it *xorIterator) readValue() bool {
 	it.numRead++
 	return true
 }
+
+// OOOXORChunk holds a XORChunk and overrides the Encoding() method.
+type OOOXORChunk struct {
+	*XORChunk
+}
+
+func (c *OOOXORChunk) Encoding() Encoding {
+	return EncOOOXOR
+}
--- a/tsdb/chunks/chunks.go
+++ b/tsdb/chunks/chunks.go
@ -121,6 +121,15 @@ type Meta struct {
 	// Time range the data covers.
 	// When MaxTime == math.MaxInt64 the chunk is still open and being appended to.
 	MinTime, MaxTime int64
+
+	// OOOLastRef, OOOLastMinTime and OOOLastMaxTime are kept as markers for
+	// overlapping chunks.
+	// These fields point to the last created out of order Chunk (the head) that existed
+	// when Series() was called and was overlapping.
+	// Series() and Chunk() method responses should be consistent for the same
+	// query even if new data is added in between the calls.
+	OOOLastRef                     ChunkRef
+	OOOLastMinTime, OOOLastMaxTime int64
 }

 // Iterator iterates over the chunks of a single time series.
@ -556,8 +565,8 @@ func (s *Reader) Size() int64 {
 }

 // Chunk returns a chunk from a given reference.
-func (s *Reader) Chunk(ref ChunkRef) (chunkenc.Chunk, error) {
-	sgmIndex, chkStart := BlockChunkRef(ref).Unpack()
+func (s *Reader) Chunk(meta Meta) (chunkenc.Chunk, error) {
+	sgmIndex, chkStart := BlockChunkRef(meta.Ref).Unpack()

 	if sgmIndex >= len(s.bs) {
 		return nil, errors.Errorf("segment index %d out of range", sgmIndex)
--- a/tsdb/chunks/chunks_test.go
+++ b/tsdb/chunks/chunks_test.go
@ -23,6 +23,6 @@ func TestReaderWithInvalidBuffer(t *testing.T) {
 	b := realByteSlice([]byte{0x81, 0x81, 0x81, 0x81, 0x81, 0x81})
 	r := &Reader{bs: []ByteSlice{b}}

-	_, err := r.Chunk(0)
+	_, err := r.Chunk(Meta{Ref: 0})
 	require.Error(t, err)
 }
--- a/tsdb/chunks/head_chunks.go
+++ b/tsdb/chunks/head_chunks.go
@ -87,6 +87,18 @@ func (ref ChunkDiskMapperRef) Unpack() (seq, offset int) {
 	return seq, offset
 }

+func (ref ChunkDiskMapperRef) GreaterThanOrEqualTo(r ChunkDiskMapperRef) bool {
+	s1, o1 := ref.Unpack()
+	s2, o2 := r.Unpack()
+	return s1 > s2 || (s1 == s2 && o1 >= o2)
+}
+
+func (ref ChunkDiskMapperRef) GreaterThan(r ChunkDiskMapperRef) bool {
+	s1, o1 := ref.Unpack()
+	s2, o2 := r.Unpack()
+	return s1 > s2 || (s1 == s2 && o1 > o2)
+}
+
 // CorruptionErr is an error that's returned when corruption is encountered.
 type CorruptionErr struct {
 	Dir       string
@ -736,7 +748,7 @@ func (cdm *ChunkDiskMapper) Chunk(ref ChunkDiskMapperRef) (chunkenc.Chunk, error
 // and runs the provided function with information about each chunk. It returns on the first error encountered.
 // NOTE: This method needs to be called at least once after creating ChunkDiskMapper
 // to set the maxt of all the file.
-func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16) error) (err error) {
+func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding) error) (err error) {
 	cdm.writePathMtx.Lock()
 	defer cdm.writePathMtx.Unlock()

@ -799,7 +811,8 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
 				break
 			}

-			idx += ChunkEncodingSize // Skip encoding.
+			chkEnc := chunkenc.Encoding(mmapFile.byteSlice.Range(idx, idx+ChunkEncodingSize)[0])
+			idx += ChunkEncodingSize
 			dataLen, n := binary.Uvarint(mmapFile.byteSlice.Range(idx, idx+MaxChunkLengthFieldSize))
 			idx += n

@ -834,7 +847,7 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
 				mmapFile.maxt = maxt
 			}

-			if err := f(seriesRef, chunkRef, mint, maxt, numSamples); err != nil {
+			if err := f(seriesRef, chunkRef, mint, maxt, numSamples, chkEnc); err != nil {
 				if cerr, ok := err.(*CorruptionErr); ok {
 					cerr.Dir = cdm.dir.Name()
 					cerr.FileIndex = segID
@ -857,12 +870,8 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
 	return nil
 }

-// Truncate deletes the head chunk files which are strictly below the mint.
-// mint should be in milliseconds.
-func (cdm *ChunkDiskMapper) Truncate(mint int64) error {
-	if !cdm.fileMaxtSet {
-		return errors.New("maxt of the files are not set")
-	}
+// Truncate deletes the head chunk files whose file number is less than given fileNo.
+func (cdm *ChunkDiskMapper) Truncate(fileNo uint32) error {
 	cdm.readPathMtx.RLock()

 	// Sort the file indices, else if files deletion fails in between,
@ -875,12 +884,10 @@ func (cdm *ChunkDiskMapper) Truncate(mint int64) error {

 	var removedFiles []int
 	for _, seq := range chkFileIndices {
-		if seq == cdm.curFileSequence || cdm.mmappedChunkFiles[seq].maxt >= mint {
+		if seq == cdm.curFileSequence || uint32(seq) >= fileNo {
 			break
 		}
-		if cdm.mmappedChunkFiles[seq].maxt < mint {
-			removedFiles = append(removedFiles, seq)
-		}
+		removedFiles = append(removedFiles, seq)
 	}
 	cdm.readPathMtx.RUnlock()

--- a/tsdb/chunks/head_chunks_test.go
+++ b/tsdb/chunks/head_chunks_test.go
@ -58,6 +58,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) {
 		mint, maxt int64
 		numSamples uint16
 		chunk      chunkenc.Chunk
+		isOOO      bool
 	}
 	expectedData := []expectedDataType{}

@ -67,7 +68,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) {
 	for hrw.curFileSequence < 3 || hrw.chkWriter.Buffered() == 0 {
 		addChunks := func(numChunks int) {
 			for i := 0; i < numChunks; i++ {
-				seriesRef, chkRef, mint, maxt, chunk := createChunk(t, totalChunks, hrw)
+				seriesRef, chkRef, mint, maxt, chunk, isOOO := createChunk(t, totalChunks, hrw)
 				totalChunks++
 				expectedData = append(expectedData, expectedDataType{
 					seriesRef:  seriesRef,
@ -76,6 +77,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) {
 					chunkRef:   chkRef,
 					chunk:      chunk,
 					numSamples: uint16(chunk.NumSamples()),
+					isOOO:      isOOO,
 				})

 				if hrw.curFileSequence != 1 {
@ -147,7 +149,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) {
 	hrw = createChunkDiskMapper(t, dir)

 	idx := 0
-	require.NoError(t, hrw.IterateAllChunks(func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16) error {
+	require.NoError(t, hrw.IterateAllChunks(func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding) error {
 		t.Helper()

 		expData := expectedData[idx]
@ -156,6 +158,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) {
 		require.Equal(t, expData.maxt, maxt)
 		require.Equal(t, expData.maxt, maxt)
 		require.Equal(t, expData.numSamples, numSamples)
+		require.Equal(t, expData.isOOO, chunkenc.IsOutOfOrderChunk(encoding))

 		actChunk, err := hrw.Chunk(expData.chunkRef)
 		require.NoError(t, err)
@ -178,9 +181,7 @@ func TestChunkDiskMapper_Truncate(t *testing.T) {
 	}()

 	timeRange := 0
-	fileTimeStep := 100
-	var thirdFileMinT, sixthFileMinT int64
-	addChunk := func() int {
+	addChunk := func() {
 		t.Helper()

 		step := 100
@ -194,8 +195,6 @@ func TestChunkDiskMapper_Truncate(t *testing.T) {
 		<-awaitCb
 		require.NoError(t, err)
 		timeRange += step
-
-		return mint
 	}

 	verifyFiles := func(remainingFiles []int) {
@ -216,17 +215,12 @@ func TestChunkDiskMapper_Truncate(t *testing.T) {
 	// Create segments 1 to 7.
 	for i := 1; i <= 7; i++ {
 		hrw.CutNewFile()
-		mint := int64(addChunk())
-		if i == 3 {
-			thirdFileMinT = mint
-		} else if i == 6 {
-			sixthFileMinT = mint
-		}
+		addChunk()
 	}
 	verifyFiles([]int{1, 2, 3, 4, 5, 6, 7})

 	// Truncating files.
-	require.NoError(t, hrw.Truncate(thirdFileMinT))
+	require.NoError(t, hrw.Truncate(3))

 	// Add a chunk to trigger cutting of new file.
 	addChunk()
@ -245,11 +239,11 @@ func TestChunkDiskMapper_Truncate(t *testing.T) {
 	verifyFiles([]int{3, 4, 5, 6, 7, 8, 9})

 	// Truncating files after restart.
-	require.NoError(t, hrw.Truncate(sixthFileMinT))
+	require.NoError(t, hrw.Truncate(6))
 	verifyFiles([]int{6, 7, 8, 9})

 	// Truncating a second time without adding a chunk shouldn't create a new file.
-	require.NoError(t, hrw.Truncate(sixthFileMinT+1))
+	require.NoError(t, hrw.Truncate(6))
 	verifyFiles([]int{6, 7, 8, 9})

 	// Add a chunk to trigger cutting of new file.
@ -257,8 +251,12 @@ func TestChunkDiskMapper_Truncate(t *testing.T) {

 	verifyFiles([]int{6, 7, 8, 9, 10})

+	// Truncation by file number.
+	require.NoError(t, hrw.Truncate(8))
+	verifyFiles([]int{8, 9, 10})
+
 	// Truncating till current time should not delete the current active file.
-	require.NoError(t, hrw.Truncate(int64(timeRange+(2*fileTimeStep))))
+	require.NoError(t, hrw.Truncate(10))

 	// Add a chunk to trigger cutting of new file.
 	addChunk()
@ -335,8 +333,7 @@ func TestChunkDiskMapper_Truncate_PreservesFileSequence(t *testing.T) {

 	// Truncating files till 2. It should not delete anything after 3 (inclusive)
 	// though files 4 and 6 are empty.
-	file2Maxt := hrw.mmappedChunkFiles[2].maxt
-	require.NoError(t, hrw.Truncate(file2Maxt+1))
+	require.NoError(t, hrw.Truncate(3))
 	verifyFiles([]int{3, 4, 5, 6})

 	// Add chunk, so file 6 is not empty anymore.
@ -344,8 +341,7 @@ func TestChunkDiskMapper_Truncate_PreservesFileSequence(t *testing.T) {
 	verifyFiles([]int{3, 4, 5, 6})

 	// Truncating till file 3 should also delete file 4, because it is empty.
-	file3Maxt := hrw.mmappedChunkFiles[3].maxt
-	require.NoError(t, hrw.Truncate(file3Maxt+1))
+	require.NoError(t, hrw.Truncate(5))
 	addChunk()
 	verifyFiles([]int{5, 6, 7})

@ -381,7 +377,7 @@ func TestHeadReadWriter_TruncateAfterFailedIterateChunks(t *testing.T) {
 	hrw = createChunkDiskMapper(t, dir)

 	// Forcefully failing IterateAllChunks.
-	require.Error(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16) error {
+	require.Error(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16, _ chunkenc.Encoding) error {
 		return errors.New("random error")
 	}))

@ -471,7 +467,9 @@ func createChunkDiskMapper(t *testing.T, dir string) *ChunkDiskMapper {
 	hrw, err := NewChunkDiskMapper(nil, dir, chunkenc.NewPool(), DefaultWriteBufferSize, writeQueueSize)
 	require.NoError(t, err)
 	require.False(t, hrw.fileMaxtSet)
-	require.NoError(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16) error { return nil }))
+	require.NoError(t, hrw.IterateAllChunks(func(_ HeadSeriesRef, _ ChunkDiskMapperRef, _, _ int64, _ uint16, _ chunkenc.Encoding) error {
+		return nil
+	}))
 	require.True(t, hrw.fileMaxtSet)

 	return hrw
@ -488,13 +486,17 @@ func randomChunk(t *testing.T) chunkenc.Chunk {
 	return chunk
 }

-func createChunk(t *testing.T, idx int, hrw *ChunkDiskMapper) (seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, chunk chunkenc.Chunk) {
+func createChunk(t *testing.T, idx int, hrw *ChunkDiskMapper) (seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, chunk chunkenc.Chunk, isOOO bool) {
 	var err error
 	seriesRef = HeadSeriesRef(rand.Int63())
 	mint = int64((idx)*1000 + 1)
 	maxt = int64((idx + 1) * 1000)
 	chunk = randomChunk(t)
 	awaitCb := make(chan struct{})
+	if rand.Intn(2) == 0 {
+		isOOO = true
+		chunk = &chunkenc.OOOXORChunk{XORChunk: chunk.(*chunkenc.XORChunk)}
+	}
 	chunkRef = hrw.WriteChunk(seriesRef, mint, maxt, chunk, func(cbErr error) {
 		require.NoError(t, err)
 		close(awaitCb)
--- a/tsdb/compact_test.go
+++ b/tsdb/compact_test.go
@ -1080,7 +1080,7 @@ func BenchmarkCompactionFromHead(b *testing.B) {
 			opts := DefaultHeadOptions()
 			opts.ChunkRange = 1000
 			opts.ChunkDirRoot = chunkDir
-			h, err := NewHead(nil, nil, nil, opts, nil)
+			h, err := NewHead(nil, nil, nil, nil, opts, nil)
 			require.NoError(b, err)
 			for ln := 0; ln < labelNames; ln++ {
 				app := h.Appender(context.Background())
--- a/tsdb/db.go
+++ b/tsdb/db.go
@ -33,6 +33,7 @@ import (
 	"github.com/oklog/ulid"
 	"github.com/pkg/errors"
 	"github.com/prometheus/client_golang/prometheus"
+	"go.uber.org/atomic"
 	"golang.org/x/sync/errgroup"

 	"github.com/prometheus/prometheus/config"
@ -69,18 +70,19 @@ var ErrNotReady = errors.New("TSDB not ready")
 // millisecond precision timestamps.
 func DefaultOptions() *Options {
 	return &Options{
-		WALSegmentSize:            wal.DefaultSegmentSize,
-		MaxBlockChunkSegmentSize:  chunks.DefaultChunkSegmentSize,
-		RetentionDuration:         int64(15 * 24 * time.Hour / time.Millisecond),
-		MinBlockDuration:          DefaultBlockDuration,
-		MaxBlockDuration:          DefaultBlockDuration,
-		NoLockfile:                false,
-		AllowOverlappingBlocks:    false,
-		WALCompression:            false,
-		StripeSize:                DefaultStripeSize,
-		HeadChunksWriteBufferSize: chunks.DefaultWriteBufferSize,
-		IsolationDisabled:         defaultIsolationDisabled,
-		HeadChunksWriteQueueSize:  chunks.DefaultWriteQueueSize,
+		WALSegmentSize:             wal.DefaultSegmentSize,
+		MaxBlockChunkSegmentSize:   chunks.DefaultChunkSegmentSize,
+		RetentionDuration:          int64(15 * 24 * time.Hour / time.Millisecond),
+		MinBlockDuration:           DefaultBlockDuration,
+		MaxBlockDuration:           DefaultBlockDuration,
+		NoLockfile:                 false,
+		AllowOverlappingCompaction: false,
+		AllowOverlappingQueries:    false,
+		WALCompression:             false,
+		StripeSize:                 DefaultStripeSize,
+		HeadChunksWriteBufferSize:  chunks.DefaultWriteBufferSize,
+		IsolationDisabled:          defaultIsolationDisabled,
+		OutOfOrderCapMax:           DefaultOutOfOrderCapMax,
 	}
 }

@ -112,9 +114,19 @@ type Options struct {
 	// NoLockfile disables creation and consideration of a lock file.
 	NoLockfile bool

-	// Overlapping blocks are allowed if AllowOverlappingBlocks is true.
-	// This in-turn enables vertical compaction and vertical query merge.
-	AllowOverlappingBlocks bool
+	// Querying on overlapping blocks are allowed if AllowOverlappingQueries is true.
+	// Since querying is a required operation for TSDB, if there are going to be
+	// overlapping blocks, then this should be set to true.
+	// NOTE: Do not use this directly in DB. Use it via DB.AllowOverlappingQueries().
+	AllowOverlappingQueries bool
+
+	// Compaction of overlapping blocks are allowed if AllowOverlappingCompaction is true.
+	// This is an optional flag for overlapping blocks.
+	// The reason why this flag exists is because there are various users of the TSDB
+	// that do not want vertical compaction happening on ingest time. Instead,
+	// they'd rather keep overlapping blocks and let another component do the overlapping compaction later.
+	// For Prometheus, this will always be enabled if overlapping queries is enabled.
+	AllowOverlappingCompaction bool

 	// WALCompression will turn on Snappy compression for records on the WAL.
 	WALCompression bool
@ -160,6 +172,15 @@ type Options struct {

 	// Disables isolation between reads and in-flight appends.
 	IsolationDisabled bool
+
+	// OutOfOrderTimeWindow specifies how much out of order is allowed, if any.
+	// This can change during run-time, so this value from here should only be used
+	// while initialising.
+	OutOfOrderTimeWindow int64
+
+	// OutOfOrderCapMax is maximum capacity for OOO chunks (in samples).
+	// If it is <=0, the default value is assumed.
+	OutOfOrderCapMax int64
 }

 type BlocksToDeleteFunc func(blocks []*Block) map[ulid.ULID]struct{}
@ -197,6 +218,13 @@ type DB struct {

 	// Cancel a running compaction when a shutdown is initiated.
 	compactCancel context.CancelFunc
+
+	// oooWasEnabled is true if out of order support was enabled at least one time
+	// during the time TSDB was up. In which case we need to keep supporting
+	// out-of-order compaction and vertical queries.
+	oooWasEnabled atomic.Bool
+
+	registerer prometheus.Registerer
 }

 type dbMetrics struct {
@ -372,9 +400,17 @@ func (db *DBReadOnly) FlushWAL(dir string) (returnErr error) {
 	if err != nil {
 		return err
 	}
+	var wbl *wal.WAL
+	wblDir := filepath.Join(db.dir, wal.WblDirName)
+	if _, err := os.Stat(wblDir); !os.IsNotExist(err) {
+		wbl, err = wal.Open(db.logger, wblDir)
+		if err != nil {
+			return err
+		}
+	}
 	opts := DefaultHeadOptions()
 	opts.ChunkDirRoot = db.dir
-	head, err := NewHead(nil, db.logger, w, opts, NewHeadStats())
+	head, err := NewHead(nil, db.logger, w, wbl, opts, NewHeadStats())
 	if err != nil {
 		return err
 	}
@ -430,7 +466,7 @@ func (db *DBReadOnly) loadDataAsQueryable(maxt int64) (storage.SampleAndChunkQue

 	opts := DefaultHeadOptions()
 	opts.ChunkDirRoot = db.dir
-	head, err := NewHead(nil, db.logger, nil, opts, NewHeadStats())
+	head, err := NewHead(nil, db.logger, nil, nil, opts, NewHeadStats())
 	if err != nil {
 		return nil, err
 	}
@ -448,9 +484,17 @@ func (db *DBReadOnly) loadDataAsQueryable(maxt int64) (storage.SampleAndChunkQue
 		if err != nil {
 			return nil, err
 		}
+		var wbl *wal.WAL
+		wblDir := filepath.Join(db.dir, wal.WblDirName)
+		if _, err := os.Stat(wblDir); !os.IsNotExist(err) {
+			wbl, err = wal.Open(db.logger, wblDir)
+			if err != nil {
+				return nil, err
+			}
+		}
 		opts := DefaultHeadOptions()
 		opts.ChunkDirRoot = db.dir
-		head, err = NewHead(nil, db.logger, w, opts, NewHeadStats())
+		head, err = NewHead(nil, db.logger, w, wbl, opts, NewHeadStats())
 		if err != nil {
 			return nil, err
 		}
@ -598,6 +642,15 @@ func validateOpts(opts *Options, rngs []int64) (*Options, []int64) {
 	if opts.MinBlockDuration > opts.MaxBlockDuration {
 		opts.MaxBlockDuration = opts.MinBlockDuration
 	}
+	if opts.OutOfOrderTimeWindow > 0 {
+		opts.AllowOverlappingQueries = true
+	}
+	if opts.OutOfOrderCapMax <= 0 {
+		opts.OutOfOrderCapMax = DefaultOutOfOrderCapMax
+	}
+	if opts.OutOfOrderTimeWindow < 0 {
+		opts.OutOfOrderTimeWindow = 0
+	}

 	if len(rngs) == 0 {
 		// Start with smallest block duration and create exponential buckets until the exceed the
@ -634,6 +687,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 	}

 	walDir := filepath.Join(dir, "wal")
+	wblDir := filepath.Join(dir, wal.WblDirName)

 	// Migrate old WAL if one exists.
 	if err := MigrateWAL(l, walDir); err != nil {
@ -656,6 +710,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 		autoCompact:    true,
 		chunkPool:      chunkenc.NewPool(),
 		blocksToDelete: opts.BlocksToDelete,
+		registerer:     r,
 	}
 	defer func() {
 		// Close files if startup fails somewhere.
@ -694,7 +749,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 	}
 	db.compactCancel = cancel

-	var wlog *wal.WAL
+	var wlog, wblog *wal.WAL
 	segmentSize := wal.DefaultSegmentSize
 	// Wal is enabled.
 	if opts.WALSegmentSize >= 0 {
@ -706,8 +761,19 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 		if err != nil {
 			return nil, err
 		}
+		// Check if there is a WBL on disk, in which case we should replay that data.
+		wblSize, err := fileutil.DirSize(wblDir)
+		if err != nil && !os.IsNotExist(err) {
+			return nil, err
+		}
+		if opts.OutOfOrderTimeWindow > 0 || wblSize > 0 {
+			wblog, err = wal.NewSize(l, r, wblDir, segmentSize, opts.WALCompression)
+			if err != nil {
+				return nil, err
+			}
+		}
 	}
-
+	db.oooWasEnabled.Store(opts.OutOfOrderTimeWindow > 0)
 	headOpts := DefaultHeadOptions()
 	headOpts.ChunkRange = rngs[0]
 	headOpts.ChunkDirRoot = dir
@ -719,11 +785,13 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 	headOpts.EnableExemplarStorage = opts.EnableExemplarStorage
 	headOpts.MaxExemplars.Store(opts.MaxExemplars)
 	headOpts.EnableMemorySnapshotOnShutdown = opts.EnableMemorySnapshotOnShutdown
+	headOpts.OutOfOrderTimeWindow.Store(opts.OutOfOrderTimeWindow)
+	headOpts.OutOfOrderCapMax.Store(opts.OutOfOrderCapMax)
 	if opts.IsolationDisabled {
 		// We only override this flag if isolation is disabled at DB level. We use the default otherwise.
 		headOpts.IsolationDisabled = opts.IsolationDisabled
 	}
-	db.head, err = NewHead(r, l, wlog, headOpts, stats.Head)
+	db.head, err = NewHead(r, l, wlog, wblog, headOpts, stats.Head)
 	if err != nil {
 		return nil, err
 	}
@ -741,20 +809,36 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
 	}
 	// Set the min valid time for the ingested samples
 	// to be no lower than the maxt of the last block.
-	blocks := db.Blocks()
 	minValidTime := int64(math.MinInt64)
-	if len(blocks) > 0 {
-		minValidTime = blocks[len(blocks)-1].Meta().MaxTime
+	// We do not consider blocks created from out-of-order samples for Head's minValidTime
+	// since minValidTime is only for the in-order data and we do not want to discard unnecessary
+	// samples from the Head.
+	inOrderMaxTime, ok := db.inOrderBlocksMaxTime()
+	if ok {
+		minValidTime = inOrderMaxTime
 	}

 	if initErr := db.head.Init(minValidTime); initErr != nil {
 		db.head.metrics.walCorruptionsTotal.Inc()
-		level.Warn(db.logger).Log("msg", "Encountered WAL read error, attempting repair", "err", initErr)
-		if err := wlog.Repair(initErr); err != nil {
-			return nil, errors.Wrap(err, "repair corrupted WAL")
+		isOOOErr := isErrLoadOOOWal(initErr)
+		if isOOOErr {
+			level.Warn(db.logger).Log("msg", "Encountered OOO WAL read error, attempting repair", "err", initErr)
+			if err := wblog.Repair(initErr); err != nil {
+				return nil, errors.Wrap(err, "repair corrupted OOO WAL")
+			}
+		} else {
+			level.Warn(db.logger).Log("msg", "Encountered WAL read error, attempting repair", "err", initErr)
+			if err := wlog.Repair(initErr); err != nil {
+				return nil, errors.Wrap(err, "repair corrupted WAL")
+			}
 		}
 	}

+	if db.head.MinOOOTime() != int64(math.MaxInt64) {
+		// Some OOO data was replayed from the disk that needs compaction and cleanup.
+		db.oooWasEnabled.Store(true)
+	}
+
 	go db.run()

 	return db, nil
@ -846,8 +930,58 @@ func (db *DB) Appender(ctx context.Context) storage.Appender {
 	return dbAppender{db: db, Appender: db.head.Appender(ctx)}
 }

+// ApplyConfig applies a new config to the DB.
+// Behaviour of 'OutOfOrderTimeWindow' is as follows:
+// OOO enabled = oooTimeWindow > 0. OOO disabled = oooTimeWindow is 0.
+// 1) Before: OOO disabled, Now: OOO enabled =>
+//   - A new WBL is created for the head block.
+//   - OOO compaction is enabled.
+//   - Overlapping queries are enabled.
+//
+// 2) Before: OOO enabled, Now: OOO enabled =>
+//   - Only the time window is updated.
+//
+// 3) Before: OOO enabled, Now: OOO disabled =>
+//   - Time Window set to 0. So no new OOO samples will be allowed.
+//   - OOO WBL will stay and will be eventually cleaned up.
+//   - OOO Compaction and overlapping queries will remain enabled until a restart or until all OOO samples are compacted.
+//
+// 4) Before: OOO disabled, Now: OOO disabled => no-op.
 func (db *DB) ApplyConfig(conf *config.Config) error {
-	return db.head.ApplyConfig(conf)
+	oooTimeWindow := int64(0)
+	if conf.StorageConfig.TSDBConfig != nil {
+		oooTimeWindow = conf.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
+	}
+	if oooTimeWindow < 0 {
+		oooTimeWindow = 0
+	}
+
+	// Create WBL if it was not present and if OOO is enabled with WAL enabled.
+	var wblog *wal.WAL
+	var err error
+	if db.head.wbl != nil {
+		// The existing WBL from the disk might have been replayed while OOO was disabled.
+		wblog = db.head.wbl
+	} else if !db.oooWasEnabled.Load() && oooTimeWindow > 0 && db.opts.WALSegmentSize >= 0 {
+		segmentSize := wal.DefaultSegmentSize
+		// Wal is set to a custom size.
+		if db.opts.WALSegmentSize > 0 {
+			segmentSize = db.opts.WALSegmentSize
+		}
+		oooWalDir := filepath.Join(db.dir, wal.WblDirName)
+		wblog, err = wal.NewSize(db.logger, db.registerer, oooWalDir, segmentSize, db.opts.WALCompression)
+		if err != nil {
+			return err
+		}
+	}
+
+	db.opts.OutOfOrderTimeWindow = oooTimeWindow
+	db.head.ApplyConfig(conf, wblog)
+
+	if !db.oooWasEnabled.Load() {
+		db.oooWasEnabled.Store(oooTimeWindow > 0)
+	}
+	return nil
 }

 // dbAppender wraps the DB's head appender and triggers compactions on commit
@ -946,6 +1080,14 @@ func (db *DB) Compact() (returnErr error) {
 			"block_range", db.head.chunkRange.Load(),
 		)
 	}
+
+	if lastBlockMaxt != math.MinInt64 {
+		// The head was compacted, so we compact OOO head as well.
+		if err := db.compactOOOHead(); err != nil {
+			return errors.Wrap(err, "compact ooo head")
+		}
+	}
+
 	return db.compactBlocks()
 }

@ -964,6 +1106,102 @@ func (db *DB) CompactHead(head *RangeHead) error {
 	return nil
 }

+// CompactOOOHead compacts the OOO Head.
+func (db *DB) CompactOOOHead() error {
+	db.cmtx.Lock()
+	defer db.cmtx.Unlock()
+
+	return db.compactOOOHead()
+}
+
+func (db *DB) compactOOOHead() error {
+	if !db.oooWasEnabled.Load() {
+		return nil
+	}
+	oooHead, err := NewOOOCompactionHead(db.head)
+	if err != nil {
+		return errors.Wrap(err, "get ooo compaction head")
+	}
+
+	ulids, err := db.compactOOO(db.dir, oooHead)
+	if err != nil {
+		return errors.Wrap(err, "compact ooo head")
+	}
+	if err := db.reloadBlocks(); err != nil {
+		errs := tsdb_errors.NewMulti(err)
+		for _, uid := range ulids {
+			if errRemoveAll := os.RemoveAll(filepath.Join(db.dir, uid.String())); errRemoveAll != nil {
+				errs.Add(errRemoveAll)
+			}
+		}
+		return errors.Wrap(errs.Err(), "reloadBlocks blocks after failed compact ooo head")
+	}
+
+	lastWBLFile, minOOOMmapRef := oooHead.LastWBLFile(), oooHead.LastMmapRef()
+	if lastWBLFile != 0 || minOOOMmapRef != 0 {
+		if err := db.head.truncateOOO(lastWBLFile, minOOOMmapRef); err != nil {
+			return errors.Wrap(err, "truncate ooo wbl")
+		}
+	}
+
+	return nil
+}
+
+// compactOOO creates a new block per possible block range in the compactor's directory from the OOO Head given.
+// Each ULID in the result corresponds to a block in a unique time range.
+func (db *DB) compactOOO(dest string, oooHead *OOOCompactionHead) (_ []ulid.ULID, err error) {
+	start := time.Now()
+
+	blockSize := oooHead.ChunkRange()
+	oooHeadMint, oooHeadMaxt := oooHead.MinTime(), oooHead.MaxTime()
+	ulids := make([]ulid.ULID, 0)
+	defer func() {
+		if err != nil {
+			// Best effort removal of created block on any error.
+			for _, uid := range ulids {
+				_ = os.RemoveAll(filepath.Join(db.dir, uid.String()))
+			}
+		}
+	}()
+
+	for t := blockSize * (oooHeadMint / blockSize); t <= oooHeadMaxt; t = t + blockSize {
+		mint, maxt := t, t+blockSize
+		// Block intervals are half-open: [b.MinTime, b.MaxTime). Block intervals are always +1 than the total samples it includes.
+		uid, err := db.compactor.Write(dest, oooHead.CloneForTimeRange(mint, maxt-1), mint, maxt, nil)
+		if err != nil {
+			return nil, err
+		}
+		if uid.Compare(ulid.ULID{}) != 0 {
+			ulids = append(ulids, uid)
+			blockDir := filepath.Join(dest, uid.String())
+			meta, _, err := readMetaFile(blockDir)
+			if err != nil {
+				return ulids, errors.Wrap(err, "read meta")
+			}
+			meta.Compaction.SetOutOfOrder()
+			_, err = writeMetaFile(db.logger, blockDir, meta)
+			if err != nil {
+				return ulids, errors.Wrap(err, "write meta")
+			}
+		}
+	}
+
+	if len(ulids) == 0 {
+		level.Info(db.logger).Log(
+			"msg", "compact ooo head resulted in no blocks",
+			"duration", time.Since(start),
+		)
+		return nil, nil
+	}
+
+	level.Info(db.logger).Log(
+		"msg", "out-of-order compaction completed",
+		"duration", time.Since(start),
+		"ulids", fmt.Sprintf("%v", ulids),
+	)
+	return ulids, nil
+}
+
 // compactHead compacts the given RangeHead.
 // The compaction mutex should be held before calling this method.
 func (db *DB) compactHead(head *RangeHead) error {
@ -1038,10 +1276,11 @@ func (db *DB) reload() error {
 	if err := db.reloadBlocks(); err != nil {
 		return errors.Wrap(err, "reloadBlocks")
 	}
-	if len(db.blocks) == 0 {
+	maxt, ok := db.inOrderBlocksMaxTime()
+	if !ok {
 		return nil
 	}
-	if err := db.head.Truncate(db.blocks[len(db.blocks)-1].MaxTime()); err != nil {
+	if err := db.head.Truncate(maxt); err != nil {
 		return errors.Wrap(err, "head truncate")
 	}
 	return nil
@ -1121,7 +1360,7 @@ func (db *DB) reloadBlocks() (err error) {
 	sort.Slice(toLoad, func(i, j int) bool {
 		return toLoad[i].Meta().MinTime < toLoad[j].Meta().MinTime
 	})
-	if !db.opts.AllowOverlappingBlocks {
+	if !db.AllowOverlappingQueries() {
 		if err := validateBlockSequence(toLoad); err != nil {
 			return errors.Wrap(err, "invalid block sequence")
 		}
@ -1151,6 +1390,10 @@ func (db *DB) reloadBlocks() (err error) {
 	return nil
 }

+func (db *DB) AllowOverlappingQueries() bool {
+	return db.opts.AllowOverlappingQueries || db.oooWasEnabled.Load()
+}
+
 func openBlocks(l log.Logger, dir string, loaded []*Block, chunkPool chunkenc.Pool) (blocks []*Block, corrupted map[ulid.ULID]error, err error) {
 	bDirs, err := blockDirs(dir)
 	if err != nil {
@ -1428,6 +1671,21 @@ func (db *DB) Blocks() []*Block {
 	return db.blocks
 }

+// inOrderBlocksMaxTime returns the max time among the blocks that were not totally created
+// out of out-of-order data. If the returned boolean is true, it means there is at least
+// one such block.
+func (db *DB) inOrderBlocksMaxTime() (maxt int64, ok bool) {
+	maxt, ok = int64(math.MinInt64), false
+	// If blocks are overlapping, last block might not have the max time. So check all blocks.
+	for _, b := range db.Blocks() {
+		if !b.meta.Compaction.FromOutOfOrder() && b.meta.MaxTime > maxt {
+			ok = true
+			maxt = b.meta.MaxTime
+		}
+	}
+	return maxt, ok
+}
+
 // Head returns the databases's head.
 func (db *DB) Head() *Head {
 	return db.head
@ -1526,13 +1784,13 @@ func (db *DB) Querier(_ context.Context, mint, maxt int64) (storage.Querier, err
 			blocks = append(blocks, b)
 		}
 	}
-	var headQuerier storage.Querier
+	var inOrderHeadQuerier storage.Querier
 	if maxt >= db.head.MinTime() {
 		rh := NewRangeHead(db.head, mint, maxt)
 		var err error
-		headQuerier, err = NewBlockQuerier(rh, mint, maxt)
+		inOrderHeadQuerier, err = NewBlockQuerier(rh, mint, maxt)
 		if err != nil {
-			return nil, errors.Wrapf(err, "open querier for head %s", rh)
+			return nil, errors.Wrapf(err, "open block querier for head %s", rh)
 		}

 		// Getting the querier above registers itself in the queue that the truncation waits on.
@ -1540,20 +1798,30 @@ func (db *DB) Querier(_ context.Context, mint, maxt int64) (storage.Querier, err
 		// won't run into a race later since any truncation that comes after will wait on this querier if it overlaps.
 		shouldClose, getNew, newMint := db.head.IsQuerierCollidingWithTruncation(mint, maxt)
 		if shouldClose {
-			if err := headQuerier.Close(); err != nil {
-				return nil, errors.Wrapf(err, "closing head querier %s", rh)
+			if err := inOrderHeadQuerier.Close(); err != nil {
+				return nil, errors.Wrapf(err, "closing head block querier %s", rh)
 			}
-			headQuerier = nil
+			inOrderHeadQuerier = nil
 		}
 		if getNew {
 			rh := NewRangeHead(db.head, newMint, maxt)
-			headQuerier, err = NewBlockQuerier(rh, newMint, maxt)
+			inOrderHeadQuerier, err = NewBlockQuerier(rh, newMint, maxt)
 			if err != nil {
-				return nil, errors.Wrapf(err, "open querier for head while getting new querier %s", rh)
+				return nil, errors.Wrapf(err, "open block querier for head while getting new querier %s", rh)
 			}
 		}
 	}

+	var outOfOrderHeadQuerier storage.Querier
+	if overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime()) {
+		rh := NewOOORangeHead(db.head, mint, maxt)
+		var err error
+		outOfOrderHeadQuerier, err = NewBlockQuerier(rh, mint, maxt)
+		if err != nil {
+			return nil, errors.Wrapf(err, "open block querier for ooo head %s", rh)
+		}
+	}
+
 	blockQueriers := make([]storage.Querier, 0, len(blocks))
 	for _, b := range blocks {
 		q, err := NewBlockQuerier(b, mint, maxt)
@ -1568,14 +1836,18 @@ func (db *DB) Querier(_ context.Context, mint, maxt int64) (storage.Querier, err
 		}
 		return nil, errors.Wrapf(err, "open querier for block %s", b)
 	}
-	if headQuerier != nil {
-		blockQueriers = append(blockQueriers, headQuerier)
+	if inOrderHeadQuerier != nil {
+		blockQueriers = append(blockQueriers, inOrderHeadQuerier)
+	}
+	if outOfOrderHeadQuerier != nil {
+		blockQueriers = append(blockQueriers, outOfOrderHeadQuerier)
 	}
 	return storage.NewMergeQuerier(blockQueriers, nil, storage.ChainedSeriesMerge), nil
 }

-// ChunkQuerier returns a new chunk querier over the data partition for the given time range.
-func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
+// blockQueriersForRange returns individual block chunk queriers from the persistent blocks, in-order head block, and the
+// out-of-order head block, overlapping with the given time range.
+func (db *DB) blockChunkQuerierForRange(mint, maxt int64) ([]storage.ChunkQuerier, error) {
 	var blocks []BlockReader

 	db.mtx.RLock()
@ -1586,11 +1858,11 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu
 			blocks = append(blocks, b)
 		}
 	}
-	var headQuerier storage.ChunkQuerier
+	var inOrderHeadQuerier storage.ChunkQuerier
 	if maxt >= db.head.MinTime() {
 		rh := NewRangeHead(db.head, mint, maxt)
 		var err error
-		headQuerier, err = NewBlockChunkQuerier(rh, mint, maxt)
+		inOrderHeadQuerier, err = NewBlockChunkQuerier(rh, mint, maxt)
 		if err != nil {
 			return nil, errors.Wrapf(err, "open querier for head %s", rh)
 		}
@ -1600,20 +1872,30 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu
 		// won't run into a race later since any truncation that comes after will wait on this querier if it overlaps.
 		shouldClose, getNew, newMint := db.head.IsQuerierCollidingWithTruncation(mint, maxt)
 		if shouldClose {
-			if err := headQuerier.Close(); err != nil {
+			if err := inOrderHeadQuerier.Close(); err != nil {
 				return nil, errors.Wrapf(err, "closing head querier %s", rh)
 			}
-			headQuerier = nil
+			inOrderHeadQuerier = nil
 		}
 		if getNew {
 			rh := NewRangeHead(db.head, newMint, maxt)
-			headQuerier, err = NewBlockChunkQuerier(rh, newMint, maxt)
+			inOrderHeadQuerier, err = NewBlockChunkQuerier(rh, newMint, maxt)
 			if err != nil {
 				return nil, errors.Wrapf(err, "open querier for head while getting new querier %s", rh)
 			}
 		}
 	}

+	var outOfOrderHeadQuerier storage.ChunkQuerier
+	if overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime()) {
+		rh := NewOOORangeHead(db.head, mint, maxt)
+		var err error
+		outOfOrderHeadQuerier, err = NewBlockChunkQuerier(rh, mint, maxt)
+		if err != nil {
+			return nil, errors.Wrapf(err, "open block chunk querier for ooo head %s", rh)
+		}
+	}
+
 	blockQueriers := make([]storage.ChunkQuerier, 0, len(blocks))
 	for _, b := range blocks {
 		q, err := NewBlockChunkQuerier(b, mint, maxt)
@ -1628,10 +1910,22 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu
 		}
 		return nil, errors.Wrapf(err, "open querier for block %s", b)
 	}
-	if headQuerier != nil {
-		blockQueriers = append(blockQueriers, headQuerier)
+	if inOrderHeadQuerier != nil {
+		blockQueriers = append(blockQueriers, inOrderHeadQuerier)
+	}
+	if outOfOrderHeadQuerier != nil {
+		blockQueriers = append(blockQueriers, outOfOrderHeadQuerier)
 	}

+	return blockQueriers, nil
+}
+
+// ChunkQuerier returns a new chunk querier over the data partition for the given time range.
+func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
+	blockQueriers, err := db.blockChunkQuerierForRange(mint, maxt)
+	if err != nil {
+		return nil, err
+	}
 	return storage.NewMergeChunkQuerier(blockQueriers, nil, storage.NewCompactingChunkSeriesMerger(storage.ChainedSeriesMerge)), nil
 }

--- a/tsdb/db_test.go
+++ b/tsdb/db_test.go
--- a/tsdb/head.go
+++ b/tsdb/head.go
@ -25,9 +25,10 @@ import (
 	"github.com/go-kit/log/level"
 	"github.com/oklog/ulid"
 	"github.com/pkg/errors"
-	"github.com/prometheus/client_golang/prometheus"
 	"go.uber.org/atomic"

+	"github.com/prometheus/client_golang/prometheus"
+
 	"github.com/prometheus/prometheus/config"
 	"github.com/prometheus/prometheus/model/exemplar"
 	"github.com/prometheus/prometheus/model/labels"
@ -62,15 +63,19 @@ var (
 type Head struct {
 	chunkRange               atomic.Int64
 	numSeries                atomic.Uint64
-	minTime, maxTime         atomic.Int64 // Current min and max of the samples included in the head.
+	minOOOTime, maxOOOTime   atomic.Int64 // TODO(jesusvazquez) These should be updated after garbage collection.
+	minTime, maxTime         atomic.Int64 // Current min and max of the samples included in the head. TODO(jesusvazquez) Ensure these are properly tracked.
 	minValidTime             atomic.Int64 // Mint allowed to be added to the head. It shouldn't be lower than the maxt of the last persisted block.
 	lastWALTruncationTime    atomic.Int64
 	lastMemoryTruncationTime atomic.Int64
 	lastSeriesID             atomic.Uint64
+	// All the ooo m-map chunks should be after this. This is used to truncate old ooo m-map chunks.
+	// This should be typecasted to chunks.ChunkDiskMapperRef after loading.
+	minOOOMmapRef atomic.Uint64

 	metrics         *headMetrics
 	opts            *HeadOptions
-	wal             *wal.WAL
+	wal, wbl        *wal.WAL
 	exemplarMetrics *ExemplarMetrics
 	exemplars       ExemplarStorage
 	logger          log.Logger
@ -87,6 +92,7 @@ type Head struct {
 	deletedMtx sync.Mutex
 	deleted    map[chunks.HeadSeriesRef]int // Deleted series, and what WAL segment they must be kept until.

+	// TODO(codesome): Extend MemPostings to return only OOOPostings, Set OOOStatus, ... Like an additional map of ooo postings.
 	postings *index.MemPostings // Postings lists for terms.

 	tombstones *tombstones.MemTombstones
@ -130,6 +136,8 @@ type HeadOptions struct {
 	ChunkPool            chunkenc.Pool
 	ChunkWriteBufferSize int
 	ChunkWriteQueueSize  int
+	OutOfOrderTimeWindow atomic.Int64
+	OutOfOrderCapMax     atomic.Int64

 	// StripeSize sets the number of entries in the hash map, it must be a power of 2.
 	// A larger StripeSize will allocate more memory up-front, but will increase performance when handling a large number of series.
@ -142,8 +150,13 @@ type HeadOptions struct {
 	IsolationDisabled bool
 }

+const (
+	// DefaultOutOfOrderCapMax is the default maximum size of an in-memory out-of-order chunk.
+	DefaultOutOfOrderCapMax int64 = 32
+)
+
 func DefaultHeadOptions() *HeadOptions {
-	return &HeadOptions{
+	ho := &HeadOptions{
 		ChunkRange:           DefaultBlockDuration,
 		ChunkDirRoot:         "",
 		ChunkPool:            chunkenc.NewPool(),
@ -153,6 +166,8 @@ func DefaultHeadOptions() *HeadOptions {
 		SeriesCallback:       &noopSeriesLifecycleCallback{},
 		IsolationDisabled:    defaultIsolationDisabled,
 	}
+	ho.OutOfOrderCapMax.Store(DefaultOutOfOrderCapMax)
+	return ho
 }

 // SeriesLifecycleCallback specifies a list of callbacks that will be called during a lifecycle of a series.
@ -171,11 +186,23 @@ type SeriesLifecycleCallback interface {
 }

 // NewHead opens the head block in dir.
-func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, opts *HeadOptions, stats *HeadStats) (*Head, error) {
+func NewHead(r prometheus.Registerer, l log.Logger, wal, wbl *wal.WAL, opts *HeadOptions, stats *HeadStats) (*Head, error) {
 	var err error
 	if l == nil {
 		l = log.NewNopLogger()
 	}
+
+	if opts.OutOfOrderTimeWindow.Load() < 0 {
+		opts.OutOfOrderTimeWindow.Store(0)
+	}
+
+	// Time window can be set on runtime. So the capMin and capMax should be valid
+	// even if ooo is not enabled yet.
+	capMax := opts.OutOfOrderCapMax.Load()
+	if capMax <= 0 || capMax > 255 {
+		return nil, errors.Errorf("OOOCapMax of %d is invalid. must be > 0 and <= 255", capMax)
+	}
+
 	if opts.ChunkRange < 1 {
 		return nil, errors.Errorf("invalid chunk range %d", opts.ChunkRange)
 	}
@ -193,6 +220,7 @@ func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, opts *HeadOpti

 	h := &Head{
 		wal:    wal,
+		wbl:    wbl,
 		logger: l,
 		opts:   opts,
 		memChunkPool: sync.Pool{
@ -254,35 +282,40 @@ func (h *Head) resetInMemoryState() error {
 	h.chunkRange.Store(h.opts.ChunkRange)
 	h.minTime.Store(math.MaxInt64)
 	h.maxTime.Store(math.MinInt64)
+	h.minOOOTime.Store(math.MaxInt64)
+	h.maxOOOTime.Store(math.MinInt64)
 	h.lastWALTruncationTime.Store(math.MinInt64)
 	h.lastMemoryTruncationTime.Store(math.MinInt64)
 	return nil
 }

 type headMetrics struct {
-	activeAppenders          prometheus.Gauge
-	series                   prometheus.GaugeFunc
-	seriesCreated            prometheus.Counter
-	seriesRemoved            prometheus.Counter
-	seriesNotFound           prometheus.Counter
-	chunks                   prometheus.Gauge
-	chunksCreated            prometheus.Counter
-	chunksRemoved            prometheus.Counter
-	gcDuration               prometheus.Summary
-	samplesAppended          prometheus.Counter
-	outOfBoundSamples        prometheus.Counter
-	outOfOrderSamples        prometheus.Counter
-	walTruncateDuration      prometheus.Summary
-	walCorruptionsTotal      prometheus.Counter
-	walTotalReplayDuration   prometheus.Gauge
-	headTruncateFail         prometheus.Counter
-	headTruncateTotal        prometheus.Counter
-	checkpointDeleteFail     prometheus.Counter
-	checkpointDeleteTotal    prometheus.Counter
-	checkpointCreationFail   prometheus.Counter
-	checkpointCreationTotal  prometheus.Counter
-	mmapChunkCorruptionTotal prometheus.Counter
-	snapshotReplayErrorTotal prometheus.Counter // Will be either 0 or 1.
+	activeAppenders           prometheus.Gauge
+	series                    prometheus.GaugeFunc
+	seriesCreated             prometheus.Counter
+	seriesRemoved             prometheus.Counter
+	seriesNotFound            prometheus.Counter
+	chunks                    prometheus.Gauge
+	chunksCreated             prometheus.Counter
+	chunksRemoved             prometheus.Counter
+	gcDuration                prometheus.Summary
+	samplesAppended           prometheus.Counter
+	outOfOrderSamplesAppended prometheus.Counter
+	outOfBoundSamples         prometheus.Counter
+	outOfOrderSamples         prometheus.Counter
+	tooOldSamples             prometheus.Counter
+	walTruncateDuration       prometheus.Summary
+	walCorruptionsTotal       prometheus.Counter
+	dataTotalReplayDuration   prometheus.Gauge
+	headTruncateFail          prometheus.Counter
+	headTruncateTotal         prometheus.Counter
+	checkpointDeleteFail      prometheus.Counter
+	checkpointDeleteTotal     prometheus.Counter
+	checkpointCreationFail    prometheus.Counter
+	checkpointCreationTotal   prometheus.Counter
+	mmapChunkCorruptionTotal  prometheus.Counter
+	snapshotReplayErrorTotal  prometheus.Counter // Will be either 0 or 1.
+	oooHistogram              prometheus.Histogram
 }

 func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
@ -333,7 +366,7 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
 			Name: "prometheus_tsdb_wal_corruptions_total",
 			Help: "Total number of WAL corruptions.",
 		}),
-		walTotalReplayDuration: prometheus.NewGauge(prometheus.GaugeOpts{
+		dataTotalReplayDuration: prometheus.NewGauge(prometheus.GaugeOpts{
 			Name: "prometheus_tsdb_data_replay_duration_seconds",
 			Help: "Time taken to replay the data on disk.",
 		}),
@ -341,13 +374,21 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
 			Name: "prometheus_tsdb_head_samples_appended_total",
 			Help: "Total number of appended samples.",
 		}),
+		outOfOrderSamplesAppended: prometheus.NewCounter(prometheus.CounterOpts{
+			Name: "prometheus_tsdb_head_out_of_order_samples_appended_total",
+			Help: "Total number of appended out of order samples.",
+		}),
 		outOfBoundSamples: prometheus.NewCounter(prometheus.CounterOpts{
 			Name: "prometheus_tsdb_out_of_bound_samples_total",
-			Help: "Total number of out of bound samples ingestion failed attempts.",
+			Help: "Total number of out of bound samples ingestion failed attempts with out of order support disabled.",
 		}),
 		outOfOrderSamples: prometheus.NewCounter(prometheus.CounterOpts{
 			Name: "prometheus_tsdb_out_of_order_samples_total",
-			Help: "Total number of out of order samples ingestion failed attempts.",
+			Help: "Total number of out of order samples ingestion failed attempts due to out of order being disabled.",
+		}),
+		tooOldSamples: prometheus.NewCounter(prometheus.CounterOpts{
+			Name: "prometheus_tsdb_too_old_samples_total",
+			Help: "Total number of out of order samples ingestion failed attempts with out of support enabled, but sample outside of time window.",
 		}),
 		headTruncateFail: prometheus.NewCounter(prometheus.CounterOpts{
 			Name: "prometheus_tsdb_head_truncations_failed_total",
@ -381,6 +422,19 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
 			Name: "prometheus_tsdb_snapshot_replay_error_total",
 			Help: "Total number snapshot replays that failed.",
 		}),
+		oooHistogram: prometheus.NewHistogram(prometheus.HistogramOpts{
+			Name: "prometheus_tsdb_sample_ooo_delta",
+			Help: "Delta in seconds by which a sample is considered out of order (reported regardless of OOO time window and whether sample is accepted or not).",
+			Buckets: []float64{
+				60 * 10,      // 10 min
+				60 * 30,      // 30 min
+				60 * 60,      // 60 min
+				60 * 60 * 2,  // 2h
+				60 * 60 * 3,  // 3h
+				60 * 60 * 6,  // 6h
+				60 * 60 * 12, // 12h
+			},
+		}),
 	}

 	if r != nil {
@ -396,10 +450,12 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
 			m.gcDuration,
 			m.walTruncateDuration,
 			m.walCorruptionsTotal,
-			m.walTotalReplayDuration,
+			m.dataTotalReplayDuration,
 			m.samplesAppended,
+			m.outOfOrderSamplesAppended,
 			m.outOfBoundSamples,
 			m.outOfOrderSamples,
+			m.tooOldSamples,
 			m.headTruncateFail,
 			m.headTruncateTotal,
 			m.checkpointDeleteFail,
@ -517,8 +573,9 @@ func (h *Head) Init(minValidTime int64) error {
 	}

 	mmapChunkReplayStart := time.Now()
-	mmappedChunks, err := h.loadMmappedChunks(refSeries)
+	mmappedChunks, oooMmappedChunks, lastMmapRef, err := h.loadMmappedChunks(refSeries)
 	if err != nil {
+		// TODO(codesome): clear out all m-map chunks here for refSeries.
 		level.Error(h.logger).Log("msg", "Loading on-disk chunks failed", "err", err)
 		if _, ok := errors.Cause(err).(*chunks.CorruptionErr); ok {
 			h.metrics.mmapChunkCorruptionTotal.Inc()
@ -529,7 +586,7 @@ func (h *Head) Init(minValidTime int64) error {

 		// If this fails, data will be recovered from WAL.
 		// Hence we wont lose any data (given WAL is not corrupt).
-		mmappedChunks, err = h.removeCorruptedMmappedChunks(err)
+		mmappedChunks, oooMmappedChunks, lastMmapRef, err = h.removeCorruptedMmappedChunks(err)
 		if err != nil {
 			return err
 		}
@ -572,7 +629,7 @@ func (h *Head) Init(minValidTime int64) error {

 		// A corrupted checkpoint is a hard error for now and requires user
 		// intervention. There's likely little data that can be recovered anyway.
-		if err := h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks); err != nil {
+		if err := h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks, oooMmappedChunks); err != nil {
 			return errors.Wrap(err, "backfill checkpoint")
 		}
 		h.updateWALReplayStatusRead(startFrom)
@ -605,7 +662,7 @@ func (h *Head) Init(minValidTime int64) error {
 		if err != nil {
 			return errors.Wrapf(err, "segment reader (offset=%d)", offset)
 		}
-		err = h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks)
+		err = h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks, oooMmappedChunks)
 		if err := sr.Close(); err != nil {
 			level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
 		}
@ -615,26 +672,94 @@ func (h *Head) Init(minValidTime int64) error {
 		level.Info(h.logger).Log("msg", "WAL segment loaded", "segment", i, "maxSegment", endAt)
 		h.updateWALReplayStatusRead(i)
 	}
+	walReplayDuration := time.Since(walReplayStart)

-	walReplayDuration := time.Since(start)
-	h.metrics.walTotalReplayDuration.Set(walReplayDuration.Seconds())
+	wblReplayStart := time.Now()
+	if h.wbl != nil {
+		// Replay OOO WAL.
+		startFrom, endAt, e = wal.Segments(h.wbl.Dir())
+		if e != nil {
+			return errors.Wrap(e, "finding OOO WAL segments")
+		}
+		h.startWALReplayStatus(startFrom, endAt)
+
+		for i := startFrom; i <= endAt; i++ {
+			s, err := wal.OpenReadSegment(wal.SegmentName(h.wbl.Dir(), i))
+			if err != nil {
+				return errors.Wrap(err, fmt.Sprintf("open WBL segment: %d", i))
+			}
+
+			sr := wal.NewSegmentBufReader(s)
+			err = h.loadWBL(wal.NewReader(sr), multiRef, lastMmapRef)
+			if err := sr.Close(); err != nil {
+				level.Warn(h.logger).Log("msg", "Error while closing the wbl segments reader", "err", err)
+			}
+			if err != nil {
+				return err
+			}
+			level.Info(h.logger).Log("msg", "WBL segment loaded", "segment", i, "maxSegment", endAt)
+			h.updateWALReplayStatusRead(i)
+		}
+	}
+
+	wblReplayDuration := time.Since(wblReplayStart)
+
+	totalReplayDuration := time.Since(start)
+	h.metrics.dataTotalReplayDuration.Set(totalReplayDuration.Seconds())
 	level.Info(h.logger).Log(
 		"msg", "WAL replay completed",
 		"checkpoint_replay_duration", checkpointReplayDuration.String(),
-		"wal_replay_duration", time.Since(walReplayStart).String(),
-		"total_replay_duration", walReplayDuration.String(),
+		"wal_replay_duration", walReplayDuration.String(),
+		"wbl_replay_duration", wblReplayDuration.String(),
+		"total_replay_duration", totalReplayDuration.String(),
 	)

 	return nil
 }

-func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries) (map[chunks.HeadSeriesRef][]*mmappedChunk, error) {
+func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries) (map[chunks.HeadSeriesRef][]*mmappedChunk, map[chunks.HeadSeriesRef][]*mmappedChunk, chunks.ChunkDiskMapperRef, error) {
 	mmappedChunks := map[chunks.HeadSeriesRef][]*mmappedChunk{}
-	if err := h.chunkDiskMapper.IterateAllChunks(func(seriesRef chunks.HeadSeriesRef, chunkRef chunks.ChunkDiskMapperRef, mint, maxt int64, numSamples uint16) error {
-		if maxt < h.minValidTime.Load() {
+	oooMmappedChunks := map[chunks.HeadSeriesRef][]*mmappedChunk{}
+	var lastRef, secondLastRef chunks.ChunkDiskMapperRef
+	if err := h.chunkDiskMapper.IterateAllChunks(func(seriesRef chunks.HeadSeriesRef, chunkRef chunks.ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding) error {
+		secondLastRef = lastRef
+		lastRef = chunkRef
+		isOOO := chunkenc.IsOutOfOrderChunk(encoding)
+		if !isOOO && maxt < h.minValidTime.Load() {
 			return nil
 		}
+
+		// We ignore any chunk that doesn't have a valid encoding
+		if !chunkenc.IsValidEncoding(encoding) {
+			return nil
+		}
+
 		ms, ok := refSeries[seriesRef]
+
+		if isOOO {
+			if !ok {
+				oooMmappedChunks[seriesRef] = append(oooMmappedChunks[seriesRef], &mmappedChunk{
+					ref:        chunkRef,
+					minTime:    mint,
+					maxTime:    maxt,
+					numSamples: numSamples,
+				})
+				return nil
+			}
+
+			h.metrics.chunks.Inc()
+			h.metrics.chunksCreated.Inc()
+
+			ms.oooMmappedChunks = append(ms.oooMmappedChunks, &mmappedChunk{
+				ref:        chunkRef,
+				minTime:    mint,
+				maxTime:    maxt,
+				numSamples: numSamples,
+			})
+
+			return nil
+		}
+
 		if !ok {
 			slice := mmappedChunks[seriesRef]
 			if len(slice) > 0 && slice[len(slice)-1].maxTime >= mint {
@ -677,45 +802,57 @@ func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries)
 		}
 		return nil
 	}); err != nil {
-		return nil, errors.Wrap(err, "iterate on on-disk chunks")
+		// secondLastRef because the lastRef caused an error.
+		return nil, nil, secondLastRef, errors.Wrap(err, "iterate on on-disk chunks")
 	}
-	return mmappedChunks, nil
+	return mmappedChunks, oooMmappedChunks, lastRef, nil
 }

 // removeCorruptedMmappedChunks attempts to delete the corrupted mmapped chunks and if it fails, it clears all the previously
 // loaded mmapped chunks.
-func (h *Head) removeCorruptedMmappedChunks(err error) (map[chunks.HeadSeriesRef][]*mmappedChunk, error) {
+func (h *Head) removeCorruptedMmappedChunks(err error) (map[chunks.HeadSeriesRef][]*mmappedChunk, map[chunks.HeadSeriesRef][]*mmappedChunk, chunks.ChunkDiskMapperRef, error) {
+	level.Info(h.logger).Log("msg", "Deleting mmapped chunk files")
 	// We never want to preserve the in-memory series from snapshots if we are repairing m-map chunks.
 	if err := h.resetInMemoryState(); err != nil {
-		return nil, err
+		return map[chunks.HeadSeriesRef][]*mmappedChunk{}, map[chunks.HeadSeriesRef][]*mmappedChunk{}, 0, err
 	}

 	level.Info(h.logger).Log("msg", "Deleting mmapped chunk files")

 	if err := h.chunkDiskMapper.DeleteCorrupted(err); err != nil {
 		level.Info(h.logger).Log("msg", "Deletion of corrupted mmap chunk files failed, discarding chunk files completely", "err", err)
-		if err := h.chunkDiskMapper.Truncate(math.MaxInt64); err != nil {
+		if err := h.chunkDiskMapper.Truncate(math.MaxUint32); err != nil {
 			level.Error(h.logger).Log("msg", "Deletion of all mmap chunk files failed", "err", err)
 		}
-		return map[chunks.HeadSeriesRef][]*mmappedChunk{}, nil
+		return map[chunks.HeadSeriesRef][]*mmappedChunk{}, map[chunks.HeadSeriesRef][]*mmappedChunk{}, 0, nil
 	}

 	level.Info(h.logger).Log("msg", "Deletion of mmap chunk files successful, reattempting m-mapping the on-disk chunks")
-	mmappedChunks, err := h.loadMmappedChunks(make(map[chunks.HeadSeriesRef]*memSeries))
+	mmappedChunks, oooMmappedChunks, lastRef, err := h.loadMmappedChunks(make(map[chunks.HeadSeriesRef]*memSeries))
 	if err != nil {
 		level.Error(h.logger).Log("msg", "Loading on-disk chunks failed, discarding chunk files completely", "err", err)
-		if err := h.chunkDiskMapper.Truncate(math.MaxInt64); err != nil {
+		if err := h.chunkDiskMapper.Truncate(math.MaxUint32); err != nil {
 			level.Error(h.logger).Log("msg", "Deletion of all mmap chunk files failed after failed loading", "err", err)
 		}
 		mmappedChunks = map[chunks.HeadSeriesRef][]*mmappedChunk{}
 	}

-	return mmappedChunks, nil
+	return mmappedChunks, oooMmappedChunks, lastRef, nil
 }

-func (h *Head) ApplyConfig(cfg *config.Config) error {
+func (h *Head) ApplyConfig(cfg *config.Config, wbl *wal.WAL) {
+	oooTimeWindow := int64(0)
+	if cfg.StorageConfig.TSDBConfig != nil {
+		oooTimeWindow = cfg.StorageConfig.TSDBConfig.OutOfOrderTimeWindow
+	}
+	if oooTimeWindow < 0 {
+		oooTimeWindow = 0
+	}
+
+	h.SetOutOfOrderTimeWindow(oooTimeWindow, wbl)
+
 	if !h.opts.EnableExemplarStorage {
-		return nil
+		return
 	}

 	// Head uses opts.MaxExemplars in combination with opts.EnableExemplarStorage
@ -726,12 +863,21 @@ func (h *Head) ApplyConfig(cfg *config.Config) error {
 	newSize := h.opts.MaxExemplars.Load()

 	if prevSize == newSize {
-		return nil
+		return
 	}

 	migrated := h.exemplars.(*CircularExemplarStorage).Resize(newSize)
 	level.Info(h.logger).Log("msg", "Exemplar storage resized", "from", prevSize, "to", newSize, "migrated", migrated)
-	return nil
+}
+
+// SetOutOfOrderTimeWindow updates the out of order related parameters.
+// If the Head already has a WBL set, then the wbl will be ignored.
+func (h *Head) SetOutOfOrderTimeWindow(oooTimeWindow int64, wbl *wal.WAL) {
+	if oooTimeWindow > 0 && h.wbl == nil {
+		h.wbl = wbl
+	}
+
+	h.opts.OutOfOrderTimeWindow.Store(oooTimeWindow)
 }

 // PostingsCardinalityStats returns top 10 highest cardinality stats By label and value names.
@ -773,6 +919,27 @@ func (h *Head) updateMinMaxTime(mint, maxt int64) {
 	}
 }

+func (h *Head) updateMinOOOMaxOOOTime(mint, maxt int64) {
+	for {
+		lt := h.MinOOOTime()
+		if mint >= lt {
+			break
+		}
+		if h.minOOOTime.CompareAndSwap(lt, mint) {
+			break
+		}
+	}
+	for {
+		ht := h.MaxOOOTime()
+		if maxt <= ht {
+			break
+		}
+		if h.maxOOOTime.CompareAndSwap(ht, maxt) {
+			break
+		}
+	}
+}
+
 // SetMinValidTime sets the minimum timestamp the head can ingest.
 func (h *Head) SetMinValidTime(minValidTime int64) {
 	h.minValidTime.Store(minValidTime)
@ -838,30 +1005,7 @@ func (h *Head) truncateMemory(mint int64) (err error) {
 	}

 	h.metrics.headTruncateTotal.Inc()
-	start := time.Now()
-
-	actualMint := h.gc()
-	level.Info(h.logger).Log("msg", "Head GC completed", "duration", time.Since(start))
-	h.metrics.gcDuration.Observe(time.Since(start).Seconds())
-	if actualMint > h.minTime.Load() {
-		// The actual mint of the Head is higher than the one asked to truncate.
-		appendableMinValidTime := h.appendableMinValidTime()
-		if actualMint < appendableMinValidTime {
-			h.minTime.Store(actualMint)
-			h.minValidTime.Store(actualMint)
-		} else {
-			// The actual min time is in the appendable window.
-			// So we set the mint to the appendableMinValidTime.
-			h.minTime.Store(appendableMinValidTime)
-			h.minValidTime.Store(appendableMinValidTime)
-		}
-	}
-
-	// Truncate the chunk m-mapper.
-	if err := h.chunkDiskMapper.Truncate(mint); err != nil {
-		return errors.Wrap(err, "truncate chunks.HeadReadWriter")
-	}
-	return nil
+	return h.truncateSeriesAndChunkDiskMapper("truncateMemory")
 }

 // WaitForPendingReadersInTimeRange waits for queries overlapping with given range to finish querying.
@ -950,7 +1094,7 @@ func (h *Head) truncateWAL(mint int64) error {
 	}
 	// Start a new segment, so low ingestion volume TSDB don't have more WAL than
 	// needed.
-	if err := h.wal.NextSegment(); err != nil {
+	if _, err := h.wal.NextSegment(); err != nil {
 		return errors.Wrap(err, "next segment")
 	}
 	last-- // Never consider last segment for checkpoint.
@ -1016,6 +1160,59 @@ func (h *Head) truncateWAL(mint int64) error {
 	return nil
 }

+// truncateOOO
+//   - truncates the OOO WBL files whose index is strictly less than lastWBLFile.
+//   - garbage collects all the m-map chunks from the memory that are less than or equal to minOOOMmapRef
+//     and then deletes the series that do not have any data anymore.
+func (h *Head) truncateOOO(lastWBLFile int, minOOOMmapRef chunks.ChunkDiskMapperRef) error {
+	curMinOOOMmapRef := chunks.ChunkDiskMapperRef(h.minOOOMmapRef.Load())
+	if minOOOMmapRef.GreaterThan(curMinOOOMmapRef) {
+		h.minOOOMmapRef.Store(uint64(minOOOMmapRef))
+		if err := h.truncateSeriesAndChunkDiskMapper("truncateOOO"); err != nil {
+			return err
+		}
+	}
+
+	return h.wbl.Truncate(lastWBLFile)
+}
+
+// truncateSeriesAndChunkDiskMapper is a helper function for truncateMemory and truncateOOO.
+// It runs GC on the Head and truncates the ChunkDiskMapper accordingly.
+func (h *Head) truncateSeriesAndChunkDiskMapper(caller string) error {
+	start := time.Now()
+	headMaxt := h.MaxTime()
+	actualMint, minOOOTime, minMmapFile := h.gc()
+	level.Info(h.logger).Log("msg", "Head GC completed", "caller", caller, "duration", time.Since(start))
+	h.metrics.gcDuration.Observe(time.Since(start).Seconds())
+
+	if actualMint > h.minTime.Load() {
+		// The actual mint of the head is higher than the one asked to truncate.
+		appendableMinValidTime := h.appendableMinValidTime()
+		if actualMint < appendableMinValidTime {
+			h.minTime.Store(actualMint)
+			h.minValidTime.Store(actualMint)
+		} else {
+			// The actual min time is in the appendable window.
+			// So we set the mint to the appendableMinValidTime.
+			h.minTime.Store(appendableMinValidTime)
+			h.minValidTime.Store(appendableMinValidTime)
+		}
+	}
+	if headMaxt-h.opts.OutOfOrderTimeWindow.Load() < minOOOTime {
+		// The allowed OOO window is lower than the min OOO time seen during GC.
+		// So it is possible that some OOO sample was inserted that was less that minOOOTime.
+		// So we play safe and set it to the min that was possible.
+		minOOOTime = headMaxt - h.opts.OutOfOrderTimeWindow.Load()
+	}
+	h.minOOOTime.Store(minOOOTime)
+
+	// Truncate the chunk m-mapper.
+	if err := h.chunkDiskMapper.Truncate(uint32(minMmapFile)); err != nil {
+		return errors.Wrap(err, "truncate chunks.HeadReadWriter by file number")
+	}
+	return nil
+}
+
 type Stats struct {
 	NumSeries         uint64
 	MinTime, MaxTime  int64
@ -1149,14 +1346,20 @@ func (h *Head) Delete(mint, maxt int64, ms ...*labels.Matcher) error {
 }

 // gc removes data before the minimum timestamp from the head.
-// It returns the actual min times of the chunks present in the Head.
-func (h *Head) gc() int64 {
+// It returns
+// * The actual min times of the chunks present in the Head.
+// * The min OOO time seen during the GC.
+// * Min mmap file number seen in the series (in-order and out-of-order) after gc'ing the series.
+func (h *Head) gc() (actualInOrderMint, minOOOTime int64, minMmapFile int) {
 	// Only data strictly lower than this timestamp must be deleted.
 	mint := h.MinTime()
+	// Only ooo m-map chunks strictly lower than or equal to this ref
+	// must be deleted.
+	minOOOMmapRef := chunks.ChunkDiskMapperRef(h.minOOOMmapRef.Load())

 	// Drop old chunks and remember series IDs and hashes if they can be
 	// deleted entirely.
-	deleted, chunksRemoved, actualMint := h.series.gc(mint)
+	deleted, chunksRemoved, actualInOrderMint, minOOOTime, minMmapFile := h.series.gc(mint, minOOOMmapRef)
 	seriesRemoved := len(deleted)

 	h.metrics.seriesRemoved.Add(float64(seriesRemoved))
@ -1186,7 +1389,7 @@ func (h *Head) gc() int64 {
 		h.deletedMtx.Unlock()
 	}

-	return actualMint
+	return actualInOrderMint, minOOOTime, minMmapFile
 }

 // Tombstones returns a new reader over the head's tombstones
@ -1224,6 +1427,18 @@ func (h *Head) MaxTime() int64 {
 	return h.maxTime.Load()
 }

+// MinOOOTime returns the lowest time bound on visible data in the out of order
+// head.
+func (h *Head) MinOOOTime() int64 {
+	return h.minOOOTime.Load()
+}
+
+// MaxOOOTime returns the highest timestamp on visible data in the out of order
+// head.
+func (h *Head) MaxOOOTime() int64 {
+	return h.maxOOOTime.Load()
+}
+
 // compactable returns whether the head has a compactable range.
 // The head has a compactable range when the head time range is 1.5 times the chunk range.
 // The 0.5 acts as a buffer of the appendable window.
@ -1241,6 +1456,9 @@ func (h *Head) Close() error {
 	if h.wal != nil {
 		errs.Add(h.wal.Close())
 	}
+	if h.wbl != nil {
+		errs.Add(h.wbl.Close())
+	}
 	if errs.Err() == nil && h.opts.EnableMemorySnapshotOnShutdown {
 		errs.Add(h.performChunkSnapshot())
 	}
@ -1271,7 +1489,7 @@ func (h *Head) getOrCreate(hash uint64, lset labels.Labels) (*memSeries, bool, e

 func (h *Head) getOrCreateWithID(id chunks.HeadSeriesRef, hash uint64, lset labels.Labels) (*memSeries, bool, error) {
 	s, created, err := h.series.getOrSet(hash, lset, func() *memSeries {
-		return newMemSeries(lset, id, h.chunkRange.Load(), h.opts.IsolationDisabled)
+		return newMemSeries(lset, id, h.chunkRange.Load(), h.opts.OutOfOrderCapMax.Load(), h.opts.IsolationDisabled)
 	})
 	if err != nil {
 		return nil, false, err
@ -1333,7 +1551,7 @@ const (
 )

 // stripeSeries holds series by HeadSeriesRef ("ID") and also by hash of their labels.
-// ID-based lookups via (getByID()) are preferred over getByHash() for performance reasons.
+// ID-based lookups via getByID() are preferred over getByHash() for performance reasons.
 // It locks modulo ranges of IDs and hashes to reduce lock contention.
 // The locks are padded to not be on the same cache line. Filling the padded space
 // with the maps was profiled to be slower – likely due to the additional pointer
@ -1375,13 +1593,16 @@ func newStripeSeries(stripeSize int, seriesCallback SeriesLifecycleCallback) *st
 // note: returning map[chunks.HeadSeriesRef]struct{} would be more accurate,
 // but the returned map goes into postings.Delete() which expects a map[storage.SeriesRef]struct
 // and there's no easy way to cast maps.
-func (s *stripeSeries) gc(mint int64) (map[storage.SeriesRef]struct{}, int, int64) {
+// minMmapFile is the min mmap file number seen in the series (in-order and out-of-order) after gc'ing the series.
+func (s *stripeSeries) gc(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) (_ map[storage.SeriesRef]struct{}, _ int, _, _ int64, minMmapFile int) {
 	var (
 		deleted                  = map[storage.SeriesRef]struct{}{}
 		deletedForCallback       = []labels.Labels{}
 		rmChunks                 = 0
 		actualMint         int64 = math.MaxInt64
+		minOOOTime         int64 = math.MaxInt64
 	)
+	minMmapFile = math.MaxInt32
 	// Run through all series and truncate old chunks. Mark those with no
 	// chunks left as deleted and store their ID.
 	for i := 0; i < s.size; i++ {
@ -1390,9 +1611,32 @@ func (s *stripeSeries) gc(mint int64) (map[storage.SeriesRef]struct{}, int, int6
 		for hash, all := range s.hashes[i] {
 			for _, series := range all {
 				series.Lock()
-				rmChunks += series.truncateChunksBefore(mint)
+				rmChunks += series.truncateChunksBefore(mint, minOOOMmapRef)

-				if len(series.mmappedChunks) > 0 || series.headChunk != nil || series.pendingCommit {
+				if len(series.mmappedChunks) > 0 {
+					seq, _ := series.mmappedChunks[0].ref.Unpack()
+					if seq < minMmapFile {
+						minMmapFile = seq
+					}
+				}
+				if len(series.oooMmappedChunks) > 0 {
+					seq, _ := series.oooMmappedChunks[0].ref.Unpack()
+					if seq < minMmapFile {
+						minMmapFile = seq
+					}
+					for _, ch := range series.oooMmappedChunks {
+						if ch.minTime < minOOOTime {
+							minOOOTime = ch.minTime
+						}
+					}
+				}
+				if series.oooHeadChunk != nil {
+					if series.oooHeadChunk.minTime < minOOOTime {
+						minOOOTime = series.oooHeadChunk.minTime
+					}
+				}
+				if len(series.mmappedChunks) > 0 || len(series.oooMmappedChunks) > 0 ||
+					series.headChunk != nil || series.oooHeadChunk != nil || series.pendingCommit {
 					seriesMint := series.minTime()
 					if seriesMint < actualMint {
 						actualMint = seriesMint
@ -1435,7 +1679,7 @@ func (s *stripeSeries) gc(mint int64) (map[storage.SeriesRef]struct{}, int, int6
 		actualMint = mint
 	}

-	return deleted, rmChunks, actualMint
+	return deleted, rmChunks, actualMint, minOOOTime, minMmapFile
 }

 func (s *stripeSeries) getByID(id chunks.HeadSeriesRef) *memSeries {
@ -1528,11 +1772,16 @@ type memSeries struct {
 	//
 	// pN is the pointer to the mmappedChunk referered to by HeadChunkID=N
 	mmappedChunks []*mmappedChunk
+	headChunk     *memChunk          // Most recent chunk in memory that's still being built.
+	firstChunkID  chunks.HeadChunkID // HeadChunkID for mmappedChunks[0]

-	mmMaxTime    int64     // Max time of any mmapped chunk, only used during WAL replay.
-	headChunk    *memChunk // Most recent chunk in memory that's still being built.
-	chunkRange   int64
-	firstChunkID chunks.HeadChunkID // HeadChunkID for mmappedChunks[0]
+	oooMmappedChunks []*mmappedChunk    // Immutable chunks on disk containing OOO samples.
+	oooHeadChunk     *oooHeadChunk      // Most recent chunk for ooo samples in memory that's still being built.
+	firstOOOChunkID  chunks.HeadChunkID // HeadOOOChunkID for oooMmappedChunks[0]
+
+	mmMaxTime  int64 // Max time of any mmapped chunk, only used during WAL replay.
+	chunkRange int64
+	oooCapMax  uint8

 	nextAt int64 // Timestamp at which to cut the next chunk.

@ -1551,12 +1800,13 @@ type memSeries struct {
 	pendingCommit bool // Whether there are samples waiting to be committed to this series.
 }

-func newMemSeries(lset labels.Labels, id chunks.HeadSeriesRef, chunkRange int64, isolationDisabled bool) *memSeries {
+func newMemSeries(lset labels.Labels, id chunks.HeadSeriesRef, chunkRange, oooCapMax int64, isolationDisabled bool) *memSeries {
 	s := &memSeries{
 		lset:       lset,
 		ref:        id,
 		chunkRange: chunkRange,
 		nextAt:     math.MinInt64,
+		oooCapMax:  uint8(oooCapMax),
 	}
 	if !isolationDisabled {
 		s.txs = newTxRing(4)
@ -1575,6 +1825,7 @@ func (s *memSeries) minTime() int64 {
 }

 func (s *memSeries) maxTime() int64 {
+	// The highest timestamps will always be in the regular (non-OOO) chunks, even if OOO is enabled.
 	c := s.head()
 	if c != nil {
 		return c.maxTime
@ -1588,26 +1839,39 @@ func (s *memSeries) maxTime() int64 {
 // truncateChunksBefore removes all chunks from the series that
 // have no timestamp at or after mint.
 // Chunk IDs remain unchanged.
-func (s *memSeries) truncateChunksBefore(mint int64) (removed int) {
+func (s *memSeries) truncateChunksBefore(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) int {
+	var removedInOrder int
 	if s.headChunk != nil && s.headChunk.maxTime < mint {
 		// If head chunk is truncated, we can truncate all mmapped chunks.
-		removed = 1 + len(s.mmappedChunks)
-		s.firstChunkID += chunks.HeadChunkID(removed)
+		removedInOrder = 1 + len(s.mmappedChunks)
+		s.firstChunkID += chunks.HeadChunkID(removedInOrder)
 		s.headChunk = nil
 		s.mmappedChunks = nil
-		return removed
 	}
 	if len(s.mmappedChunks) > 0 {
 		for i, c := range s.mmappedChunks {
 			if c.maxTime >= mint {
 				break
 			}
-			removed = i + 1
+			removedInOrder = i + 1
 		}
-		s.mmappedChunks = append(s.mmappedChunks[:0], s.mmappedChunks[removed:]...)
-		s.firstChunkID += chunks.HeadChunkID(removed)
+		s.mmappedChunks = append(s.mmappedChunks[:0], s.mmappedChunks[removedInOrder:]...)
+		s.firstChunkID += chunks.HeadChunkID(removedInOrder)
 	}
-	return removed
+
+	var removedOOO int
+	if len(s.oooMmappedChunks) > 0 {
+		for i, c := range s.oooMmappedChunks {
+			if c.ref.GreaterThan(minOOOMmapRef) {
+				break
+			}
+			removedOOO = i + 1
+		}
+		s.oooMmappedChunks = append(s.oooMmappedChunks[:0], s.oooMmappedChunks[removedOOO:]...)
+		s.firstOOOChunkID += chunks.HeadChunkID(removedOOO)
+	}
+
+	return removedInOrder + removedOOO
 }

 // cleanupAppendIDsBelow cleans up older appendIDs. Has to be called after
@ -1627,6 +1891,16 @@ type memChunk struct {
 	minTime, maxTime int64
 }

+type oooHeadChunk struct {
+	chunk            *OOOChunk
+	minTime, maxTime int64 // can probably be removed and pulled out of the chunk instead
+}
+
+// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
+func (mc *oooHeadChunk) OverlapsClosedInterval(mint, maxt int64) bool {
+	return overlapsClosedInterval(mc.minTime, mc.maxTime, mint, maxt)
+}
+
 // OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
 func (mc *memChunk) OverlapsClosedInterval(mint, maxt int64) bool {
 	return overlapsClosedInterval(mc.minTime, mc.maxTime, mint, maxt)
@ -1655,12 +1929,15 @@ func (noopSeriesLifecycleCallback) PostCreation(labels.Labels)      {}
 func (noopSeriesLifecycleCallback) PostDeletion(...labels.Labels)   {}

 func (h *Head) Size() int64 {
-	var walSize int64
+	var walSize, wblSize int64
 	if h.wal != nil {
 		walSize, _ = h.wal.Size()
 	}
+	if h.wbl != nil {
+		wblSize, _ = h.wbl.Size()
+	}
 	cdmSize, _ := h.chunkDiskMapper.Size()
-	return walSize + cdmSize
+	return walSize + wblSize + cdmSize
 }

 func (h *RangeHead) Size() int64 {
--- a/tsdb/head_append.go
+++ b/tsdb/head_append.go
@ -137,6 +137,8 @@ func (h *Head) appender() *headAppender {
 		minValidTime:          h.appendableMinValidTime(),
 		mint:                  math.MaxInt64,
 		maxt:                  math.MinInt64,
+		headMaxt:              h.MaxTime(),
+		oooTimeWindow:         h.opts.OutOfOrderTimeWindow.Load(),
 		samples:               h.getAppendBuffer(),
 		sampleSeries:          h.getSeriesBuffer(),
 		exemplars:             exemplarsBuf,
@ -252,9 +254,11 @@ type exemplarWithSeriesRef struct {
 }

 type headAppender struct {
-	head         *Head
-	minValidTime int64 // No samples below this timestamp are allowed.
-	mint, maxt   int64
+	head          *Head
+	minValidTime  int64 // No samples below this timestamp are allowed.
+	mint, maxt    int64
+	headMaxt      int64 // We track it here to not take the lock for every sample appended.
+	oooTimeWindow int64 // Use the same for the entire append, and don't load the atomic for each sample.

 	series         []record.RefSeries      // New series held by this appender.
 	metadata       []record.RefMetadata    // New metadata held by this appender.
@ -268,7 +272,9 @@ type headAppender struct {
 }

 func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64, v float64) (storage.SeriesRef, error) {
-	if t < a.minValidTime {
+	// For OOO inserts, this restriction is irrelevant and will be checked later once we confirm the sample is an in-order append.
+	// If OOO inserts are disabled, we may as well as check this as early as we can and avoid more work.
+	if a.oooTimeWindow == 0 && t < a.minValidTime {
 		a.head.metrics.outOfBoundSamples.Inc()
 		return 0, storage.ErrOutOfBounds
 	}
@ -300,15 +306,25 @@ func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64
 	}

 	s.Lock()
-	if err := s.appendable(t, v); err != nil {
-		s.Unlock()
-		if err == storage.ErrOutOfOrderSample {
+	// TODO(codesome): If we definitely know at this point that the sample is ooo, then optimise
+	// to skip that sample from the WAL and write only in the WBL.
+	_, delta, err := s.appendable(t, v, a.headMaxt, a.minValidTime, a.oooTimeWindow)
+	if err == nil {
+		s.pendingCommit = true
+	}
+	s.Unlock()
+	if delta > 0 {
+		a.head.metrics.oooHistogram.Observe(float64(delta))
+	}
+	if err != nil {
+		switch err {
+		case storage.ErrOutOfOrderSample:
 			a.head.metrics.outOfOrderSamples.Inc()
+		case storage.ErrTooOldSample:
+			a.head.metrics.tooOldSamples.Inc()
 		}
 		return 0, err
 	}
-	s.pendingCommit = true
-	s.Unlock()

 	if t < a.mint {
 		a.mint = t
@ -326,25 +342,46 @@ func (a *headAppender) Append(ref storage.SeriesRef, lset labels.Labels, t int64
 	return storage.SeriesRef(s.ref), nil
 }

-// appendable checks whether the given sample is valid for appending to the series.
-func (s *memSeries) appendable(t int64, v float64) error {
-	c := s.head()
-	if c == nil {
-		return nil
+// appendable checks whether the given sample is valid for appending to the series. (if we return false and no error)
+// The sample belongs to the out of order chunk if we return true and no error.
+// An error signifies the sample cannot be handled.
+func (s *memSeries) appendable(t int64, v float64, headMaxt, minValidTime, oooTimeWindow int64) (isOOO bool, oooDelta int64, err error) {
+	// Check if we can append in the in-order chunk.
+	if t >= minValidTime {
+		if s.head() == nil {
+			// The series has no sample and was freshly created.
+			return false, 0, nil
+		}
+		msMaxt := s.maxTime()
+		if t > msMaxt {
+			return false, 0, nil
+		}
+		if t == msMaxt {
+			// We are allowing exact duplicates as we can encounter them in valid cases
+			// like federation and erroring out at that time would be extremely noisy.
+			// This only checks against the latest in-order sample.
+			// The OOO headchunk has its own method to detect these duplicates.
+			if math.Float64bits(s.sampleBuf[3].v) != math.Float64bits(v) {
+				return false, 0, storage.ErrDuplicateSampleForTimestamp
+			}
+			// Sample is identical (ts + value) with most current (highest ts) sample in sampleBuf.
+			return false, 0, nil
+		}
 	}

-	if t > c.maxTime {
-		return nil
+	// The sample cannot go in the in-order chunk. Check if it can go in the out-of-order chunk.
+	if oooTimeWindow > 0 && t >= headMaxt-oooTimeWindow {
+		return true, headMaxt - t, nil
 	}
-	if t < c.maxTime {
-		return storage.ErrOutOfOrderSample
+
+	// The sample cannot go in both in-order and out-of-order chunk.
+	if oooTimeWindow > 0 {
+		return true, headMaxt - t, storage.ErrTooOldSample
 	}
-	// We are allowing exact duplicates as we can encounter them in valid cases
-	// like federation and erroring out at that time would be extremely noisy.
-	if math.Float64bits(s.sampleBuf[3].v) != math.Float64bits(v) {
-		return storage.ErrDuplicateSampleForTimestamp
+	if t < minValidTime {
+		return false, headMaxt - t, storage.ErrOutOfBounds
 	}
-	return nil
+	return false, headMaxt - t, storage.ErrOutOfOrderSample
 }

 // AppendExemplar for headAppender assumes the series ref already exists, and so it doesn't
@ -487,6 +524,7 @@ func exemplarsForEncoding(es []exemplarWithSeriesRef) []record.RefExemplar {
 }

 // Commit writes to the WAL and adds the data to the Head.
+// TODO(codesome): Refactor this method to reduce indentation and make it more readable.
 func (a *headAppender) Commit() (err error) {
 	if a.closed {
 		return ErrAppenderClosed
@ -517,24 +555,143 @@ func (a *headAppender) Commit() (err error) {
 	defer a.head.putMetadataBuffer(a.metadata)
 	defer a.head.iso.closeAppend(a.appendID)

-	total := len(a.samples)
-	var series *memSeries
+	var (
+		samplesAppended = len(a.samples)
+		oooAccepted     int   // number of samples out of order but accepted: with ooo enabled and within time window
+		oooRejected     int   // number of samples rejected due to: out of order but OOO support disabled.
+		tooOldRejected  int   // number of samples rejected due to: that are out of order but too old (OOO support enabled, but outside time window)
+		oobRejected     int   // number of samples rejected due to: out of bounds: with t < minValidTime (OOO support disabled)
+		inOrderMint     int64 = math.MaxInt64
+		inOrderMaxt     int64 = math.MinInt64
+		ooomint         int64 = math.MaxInt64
+		ooomaxt         int64 = math.MinInt64
+		wblSamples      []record.RefSample
+		oooMmapMarkers  map[chunks.HeadSeriesRef]chunks.ChunkDiskMapperRef
+		oooRecords      [][]byte
+		series          *memSeries
+		enc             record.Encoder
+	)
+	defer func() {
+		for i := range oooRecords {
+			a.head.putBytesBuffer(oooRecords[i][:0])
+		}
+	}()
+	collectOOORecords := func() {
+		if a.head.wbl == nil {
+			// WBL is not enabled. So no need to collect.
+			wblSamples = nil
+			oooMmapMarkers = nil
+			return
+		}
+		// The m-map happens before adding a new sample. So we collect
+		// the m-map markers first, and then samples.
+		// WBL Graphically:
+		//   WBL Before this Commit(): [old samples before this commit for chunk 1]
+		//   WBL After this Commit():  [old samples before this commit for chunk 1][new samples in this commit for chunk 1]mmapmarker1[samples for chunk 2]mmapmarker2[samples for chunk 3]
+		if oooMmapMarkers != nil {
+			markers := make([]record.RefMmapMarker, 0, len(oooMmapMarkers))
+			for ref, mmapRef := range oooMmapMarkers {
+				markers = append(markers, record.RefMmapMarker{
+					Ref:     ref,
+					MmapRef: mmapRef,
+				})
+			}
+			r := enc.MmapMarkers(markers, a.head.getBytesBuffer())
+			oooRecords = append(oooRecords, r)
+		}
+
+		if len(wblSamples) > 0 {
+			r := enc.Samples(wblSamples, a.head.getBytesBuffer())
+			oooRecords = append(oooRecords, r)
+		}
+
+		wblSamples = nil
+		oooMmapMarkers = nil
+	}
 	for i, s := range a.samples {
 		series = a.sampleSeries[i]
 		series.Lock()
-		ok, chunkCreated := series.append(s.T, s.V, a.appendID, a.head.chunkDiskMapper)
-		series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
-		series.pendingCommit = false
-		series.Unlock()

-		if !ok {
-			total--
-			a.head.metrics.outOfOrderSamples.Inc()
+		oooSample, _, err := series.appendable(s.T, s.V, a.headMaxt, a.minValidTime, a.oooTimeWindow)
+		switch err {
+		case storage.ErrOutOfOrderSample:
+			samplesAppended--
+			oooRejected++
+		case storage.ErrOutOfBounds:
+			samplesAppended--
+			oobRejected++
+		case storage.ErrTooOldSample:
+			samplesAppended--
+			tooOldRejected++
+		case nil:
+			// Do nothing.
+		default:
+			samplesAppended--
 		}
+
+		var ok, chunkCreated bool
+
+		if err == nil && oooSample {
+			// Sample is OOO and OOO handling is enabled
+			// and the delta is within the OOO tolerance.
+			var mmapRef chunks.ChunkDiskMapperRef
+			ok, chunkCreated, mmapRef = series.insert(s.T, s.V, a.head.chunkDiskMapper)
+			if chunkCreated {
+				r, ok := oooMmapMarkers[series.ref]
+				if !ok || r != 0 {
+					// !ok means there are no markers collected for these samples yet. So we first flush the samples
+					// before setting this m-map marker.
+
+					// r != 0 means we have already m-mapped a chunk for this series in the same Commit().
+					// Hence, before we m-map again, we should add the samples and m-map markers
+					// seen till now to the WBL records.
+					collectOOORecords()
+				}
+
+				if oooMmapMarkers == nil {
+					oooMmapMarkers = make(map[chunks.HeadSeriesRef]chunks.ChunkDiskMapperRef)
+				}
+				oooMmapMarkers[series.ref] = mmapRef
+			}
+			if ok {
+				wblSamples = append(wblSamples, s)
+				if s.T < ooomint {
+					ooomint = s.T
+				}
+				if s.T > ooomaxt {
+					ooomaxt = s.T
+				}
+				oooAccepted++
+			} else {
+				// Sample is an exact duplicate of the last sample.
+				// NOTE: We can only detect updates if they clash with a sample in the OOOHeadChunk,
+				// not with samples in already flushed OOO chunks.
+				// TODO(codesome): Add error reporting? It depends on addressing https://github.com/prometheus/prometheus/discussions/10305.
+				samplesAppended--
+			}
+		} else if err == nil {
+			ok, chunkCreated = series.append(s.T, s.V, a.appendID, a.head.chunkDiskMapper)
+			if ok {
+				if s.T < inOrderMint {
+					inOrderMint = s.T
+				}
+				if s.T > inOrderMaxt {
+					inOrderMaxt = s.T
+				}
+			} else {
+				// The sample is an exact duplicate, and should be silently dropped.
+				samplesAppended--
+			}
+		}
+
 		if chunkCreated {
 			a.head.metrics.chunks.Inc()
 			a.head.metrics.chunksCreated.Inc()
 		}
+
+		series.cleanupAppendIDsBelow(a.cleanupAppendIDsBelow)
+		series.pendingCommit = false
+		series.Unlock()
 	}

 	for i, m := range a.metadata {
@ -544,12 +701,48 @@ func (a *headAppender) Commit() (err error) {
 		series.Unlock()
 	}

-	a.head.metrics.samplesAppended.Add(float64(total))
-	a.head.updateMinMaxTime(a.mint, a.maxt)
+	a.head.metrics.outOfOrderSamples.Add(float64(oooRejected))
+	a.head.metrics.outOfBoundSamples.Add(float64(oobRejected))
+	a.head.metrics.tooOldSamples.Add(float64(tooOldRejected))
+	a.head.metrics.samplesAppended.Add(float64(samplesAppended))
+	a.head.metrics.outOfOrderSamplesAppended.Add(float64(oooAccepted))
+	a.head.updateMinMaxTime(inOrderMint, inOrderMaxt)
+	a.head.updateMinOOOMaxOOOTime(ooomint, ooomaxt)

+	collectOOORecords()
+	if a.head.wbl != nil {
+		if err := a.head.wbl.Log(oooRecords...); err != nil {
+			// TODO(codesome): Currently WBL logging of ooo samples is best effort here since we cannot try logging
+			// until we have found what samples become OOO. We can try having a metric for this failure.
+			// Returning the error here is not correct because we have already put the samples into the memory,
+			// hence the append/insert was a success.
+			level.Error(a.head.logger).Log("msg", "Failed to log out of order samples into the WAL", "err", err)
+		}
+	}
 	return nil
 }

+// insert is like append, except it inserts. Used for OOO samples.
+func (s *memSeries) insert(t int64, v float64, chunkDiskMapper *chunks.ChunkDiskMapper) (inserted, chunkCreated bool, mmapRef chunks.ChunkDiskMapperRef) {
+	c := s.oooHeadChunk
+	if c == nil || c.chunk.NumSamples() == int(s.oooCapMax) {
+		// Note: If no new samples come in then we rely on compaction to clean up stale in-memory OOO chunks.
+		c, mmapRef = s.cutNewOOOHeadChunk(t, chunkDiskMapper)
+		chunkCreated = true
+	}
+
+	ok := c.chunk.Insert(t, v)
+	if ok {
+		if chunkCreated || t < c.minTime {
+			c.minTime = t
+		}
+		if chunkCreated || t > c.maxTime {
+			c.maxTime = t
+		}
+	}
+	return ok, chunkCreated, mmapRef
+}
+
 // append adds the sample (t, v) to the series. The caller also has to provide
 // the appendID for isolation. (The appendID can be zero, which results in no
 // isolation for this append.)
@ -567,7 +760,7 @@ func (s *memSeries) append(t int64, v float64, appendID uint64, chunkDiskMapper
 			// Out of order sample. Sample timestamp is already in the mmapped chunks, so ignore it.
 			return false, false
 		}
-		// There is no chunk in this series yet, create the first chunk for the sample.
+		// There is no head chunk in this series yet, create the first chunk for the sample.
 		c = s.cutNewHeadChunk(t, chunkDiskMapper)
 		chunkCreated = true
 	}
@ -651,6 +844,36 @@ func (s *memSeries) cutNewHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDis
 	return s.headChunk
 }

+func (s *memSeries) cutNewOOOHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDiskMapper) (*oooHeadChunk, chunks.ChunkDiskMapperRef) {
+	ref := s.mmapCurrentOOOHeadChunk(chunkDiskMapper)
+
+	s.oooHeadChunk = &oooHeadChunk{
+		chunk:   NewOOOChunk(),
+		minTime: mint,
+		maxTime: math.MinInt64,
+	}
+
+	return s.oooHeadChunk, ref
+}
+
+func (s *memSeries) mmapCurrentOOOHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper) chunks.ChunkDiskMapperRef {
+	if s.oooHeadChunk == nil {
+		// There is no head chunk, so nothing to m-map here.
+		return 0
+	}
+	xor, _ := s.oooHeadChunk.chunk.ToXOR() // Encode to XorChunk which is more compact and implements all of the needed functionality.
+	oooXor := &chunkenc.OOOXORChunk{XORChunk: xor}
+	chunkRef := chunkDiskMapper.WriteChunk(s.ref, s.oooHeadChunk.minTime, s.oooHeadChunk.maxTime, oooXor, handleChunkWriteError)
+	s.oooMmappedChunks = append(s.oooMmappedChunks, &mmappedChunk{
+		ref:        chunkRef,
+		numSamples: uint16(xor.NumSamples()),
+		minTime:    s.oooHeadChunk.minTime,
+		maxTime:    s.oooHeadChunk.maxTime,
+	})
+	s.oooHeadChunk = nil
+	return chunkRef
+}
+
 func (s *memSeries) mmapCurrentHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper) {
 	if s.headChunk == nil {
 		// There is no head chunk, so nothing to m-map here.
--- a/tsdb/head_bench_test.go
+++ b/tsdb/head_bench_test.go
@ -30,7 +30,7 @@ func BenchmarkHeadStripeSeriesCreate(b *testing.B) {
 	opts := DefaultHeadOptions()
 	opts.ChunkRange = 1000
 	opts.ChunkDirRoot = chunkDir
-	h, err := NewHead(nil, nil, nil, opts, nil)
+	h, err := NewHead(nil, nil, nil, nil, opts, nil)
 	require.NoError(b, err)
 	defer h.Close()

@ -45,7 +45,7 @@ func BenchmarkHeadStripeSeriesCreateParallel(b *testing.B) {
 	opts := DefaultHeadOptions()
 	opts.ChunkRange = 1000
 	opts.ChunkDirRoot = chunkDir
-	h, err := NewHead(nil, nil, nil, opts, nil)
+	h, err := NewHead(nil, nil, nil, nil, opts, nil)
 	require.NoError(b, err)
 	defer h.Close()

@ -69,7 +69,7 @@ func BenchmarkHeadStripeSeriesCreate_PreCreationFailure(b *testing.B) {
 	// Mock the PreCreation() callback to fail on each series.
 	opts.SeriesCallback = failingSeriesLifecycleCallback{}

-	h, err := NewHead(nil, nil, nil, opts, nil)
+	h, err := NewHead(nil, nil, nil, nil, opts, nil)
 	require.NoError(b, err)
 	defer h.Close()

--- a/tsdb/head_read.go
+++ b/tsdb/head_read.go
@ -183,11 +183,20 @@ func (h *headIndexReader) Series(ref storage.SeriesRef, lbls *labels.Labels, chk
 	return nil
 }

-// headChunkID returns the HeadChunkID corresponding to .mmappedChunks[pos]
+// headChunkID returns the HeadChunkID referred to by the given position.
+// * 0 <= pos < len(s.mmappedChunks) refer to s.mmappedChunks[pos]
+// * pos == len(s.mmappedChunks) refers to s.headChunk
 func (s *memSeries) headChunkID(pos int) chunks.HeadChunkID {
 	return chunks.HeadChunkID(pos) + s.firstChunkID
 }

+// oooHeadChunkID returns the HeadChunkID referred to by the given position.
+// * 0 <= pos < len(s.oooMmappedChunks) refer to s.oooMmappedChunks[pos]
+// * pos == len(s.oooMmappedChunks) refers to s.oooHeadChunk
+func (s *memSeries) oooHeadChunkID(pos int) chunks.HeadChunkID {
+	return chunks.HeadChunkID(pos) + s.firstOOOChunkID
+}
+
 // LabelValueFor returns label value for the given label name in the series referred to by ID.
 func (h *headIndexReader) LabelValueFor(id storage.SeriesRef, label string) (string, error) {
 	memSeries := h.head.series.getByID(chunks.HeadSeriesRef(id))
@ -258,8 +267,8 @@ func (h *headChunkReader) Close() error {
 }

 // Chunk returns the chunk for the reference number.
-func (h *headChunkReader) Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error) {
-	sid, cid := chunks.HeadChunkRef(ref).Unpack()
+func (h *headChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
+	sid, cid := chunks.HeadChunkRef(meta.Ref).Unpack()

 	s := h.head.series.getByID(sid)
 	// This means that the series has been garbage collected.
@ -330,6 +339,260 @@ func (s *memSeries) chunk(id chunks.HeadChunkID, chunkDiskMapper *chunks.ChunkDi
 	return mc, true, nil
 }

+// oooMergedChunk returns the requested chunk based on the given chunks.Meta
+// reference from memory or by m-mapping it from the disk. The returned chunk
+// might be a merge of all the overlapping chunks, if any, amongst all the
+// chunks in the OOOHead.
+// This function is not thread safe unless the caller holds a lock.
+func (s *memSeries) oooMergedChunk(meta chunks.Meta, cdm *chunks.ChunkDiskMapper, mint, maxt int64) (chunk *mergedOOOChunks, err error) {
+	_, cid := chunks.HeadChunkRef(meta.Ref).Unpack()
+
+	// ix represents the index of chunk in the s.mmappedChunks slice. The chunk meta's are
+	// incremented by 1 when new chunk is created, hence (meta - firstChunkID) gives the slice index.
+	// The max index for the s.mmappedChunks slice can be len(s.mmappedChunks)-1, hence if the ix
+	// is len(s.mmappedChunks), it represents the next chunk, which is the head chunk.
+	ix := int(cid) - int(s.firstOOOChunkID)
+	if ix < 0 || ix > len(s.oooMmappedChunks) {
+		return nil, storage.ErrNotFound
+	}
+
+	if ix == len(s.oooMmappedChunks) {
+		if s.oooHeadChunk == nil {
+			return nil, errors.New("invalid ooo head chunk")
+		}
+	}
+
+	// We create a temporary slice of chunk metas to hold the information of all
+	// possible chunks that may overlap with the requested chunk.
+	tmpChks := make([]chunkMetaAndChunkDiskMapperRef, 0, len(s.oooMmappedChunks))
+
+	oooHeadRef := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.oooMmappedChunks))))
+	if s.oooHeadChunk != nil && s.oooHeadChunk.OverlapsClosedInterval(mint, maxt) {
+		// We only want to append the head chunk if this chunk existed when
+		// Series() was called. This brings consistency in case new data
+		// is added in between Series() and Chunk() calls.
+		if oooHeadRef == meta.OOOLastRef {
+			tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{
+				meta: chunks.Meta{
+					// Ignoring samples added before and after the last known min and max time for this chunk.
+					MinTime: meta.OOOLastMinTime,
+					MaxTime: meta.OOOLastMaxTime,
+					Ref:     oooHeadRef,
+				},
+			})
+		}
+	}
+
+	for i, c := range s.oooMmappedChunks {
+		chunkRef := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(i)))
+		// We can skip chunks that came in later than the last known OOOLastRef.
+		if chunkRef > meta.OOOLastRef {
+			break
+		}
+
+		if chunkRef == meta.OOOLastRef {
+			tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{
+				meta: chunks.Meta{
+					MinTime: meta.OOOLastMinTime,
+					MaxTime: meta.OOOLastMaxTime,
+					Ref:     chunkRef,
+				},
+				ref:      c.ref,
+				origMinT: c.minTime,
+				origMaxT: c.maxTime,
+			})
+		} else if c.OverlapsClosedInterval(mint, maxt) {
+			tmpChks = append(tmpChks, chunkMetaAndChunkDiskMapperRef{
+				meta: chunks.Meta{
+					MinTime: c.minTime,
+					MaxTime: c.maxTime,
+					Ref:     chunkRef,
+				},
+				ref: c.ref,
+			})
+		}
+	}
+
+	// Next we want to sort all the collected chunks by min time so we can find
+	// those that overlap and stop when we know the rest don't.
+	sort.Sort(byMinTimeAndMinRef(tmpChks))
+
+	mc := &mergedOOOChunks{}
+	absoluteMax := int64(math.MinInt64)
+	for _, c := range tmpChks {
+		if c.meta.Ref != meta.Ref && (len(mc.chunks) == 0 || c.meta.MinTime > absoluteMax) {
+			continue
+		}
+		if c.meta.Ref == oooHeadRef {
+			var xor *chunkenc.XORChunk
+			// If head chunk min and max time match the meta OOO markers
+			// that means that the chunk has not expanded so we can append
+			// it as it is.
+			if s.oooHeadChunk.minTime == meta.OOOLastMinTime && s.oooHeadChunk.maxTime == meta.OOOLastMaxTime {
+				xor, err = s.oooHeadChunk.chunk.ToXOR() // TODO(jesus.vazquez) (This is an optimization idea that has no priority and might not be that useful) See if we could use a copy of the underlying slice. That would leave the more expensive ToXOR() function only for the usecase where Bytes() is called.
+			} else {
+				// We need to remove samples that are outside of the markers
+				xor, err = s.oooHeadChunk.chunk.ToXORBetweenTimestamps(meta.OOOLastMinTime, meta.OOOLastMaxTime)
+			}
+			if err != nil {
+				return nil, errors.Wrap(err, "failed to convert ooo head chunk to xor chunk")
+			}
+			c.meta.Chunk = xor
+		} else {
+			chk, err := cdm.Chunk(c.ref)
+			if err != nil {
+				if _, ok := err.(*chunks.CorruptionErr); ok {
+					return nil, errors.Wrap(err, "invalid ooo mmapped chunk")
+				}
+				return nil, err
+			}
+			if c.meta.Ref == meta.OOOLastRef &&
+				(c.origMinT != meta.OOOLastMinTime || c.origMaxT != meta.OOOLastMaxTime) {
+				// The head expanded and was memory mapped so now we need to
+				// wrap the chunk within a chunk that doesnt allows us to iterate
+				// through samples out of the OOOLastMinT and OOOLastMaxT
+				// markers.
+				c.meta.Chunk = boundedChunk{chk, meta.OOOLastMinTime, meta.OOOLastMaxTime}
+			} else {
+				c.meta.Chunk = chk
+			}
+		}
+		mc.chunks = append(mc.chunks, c.meta)
+		if c.meta.MaxTime > absoluteMax {
+			absoluteMax = c.meta.MaxTime
+		}
+	}
+
+	return mc, nil
+}
+
+var _ chunkenc.Chunk = &mergedOOOChunks{}
+
+// mergedOOOChunks holds the list of overlapping chunks. This struct satisfies
+// chunkenc.Chunk.
+type mergedOOOChunks struct {
+	chunks []chunks.Meta
+}
+
+// Bytes is a very expensive method because its calling the iterator of all the
+// chunks in the mergedOOOChunk and building a new chunk with the samples.
+func (o mergedOOOChunks) Bytes() []byte {
+	xc := chunkenc.NewXORChunk()
+	app, err := xc.Appender()
+	if err != nil {
+		panic(err)
+	}
+	it := o.Iterator(nil)
+	for it.Next() {
+		t, v := it.At()
+		app.Append(t, v)
+	}
+
+	return xc.Bytes()
+}
+
+func (o mergedOOOChunks) Encoding() chunkenc.Encoding {
+	return chunkenc.EncXOR
+}
+
+func (o mergedOOOChunks) Appender() (chunkenc.Appender, error) {
+	return nil, errors.New("can't append to mergedOOOChunks")
+}
+
+func (o mergedOOOChunks) Iterator(iterator chunkenc.Iterator) chunkenc.Iterator {
+	iterators := make([]chunkenc.Iterator, 0, len(o.chunks))
+	for _, c := range o.chunks {
+		iterators = append(iterators, c.Chunk.Iterator(nil))
+	}
+	return storage.NewChainSampleIterator(iterators)
+}
+
+func (o mergedOOOChunks) NumSamples() int {
+	samples := 0
+	for _, c := range o.chunks {
+		samples += c.Chunk.NumSamples()
+	}
+	return samples
+}
+
+func (o mergedOOOChunks) Compact() {}
+
+var _ chunkenc.Chunk = &boundedChunk{}
+
+// boundedChunk is an implementation of chunkenc.Chunk that uses a
+// boundedIterator that only iterates through samples which timestamps are
+// >= minT and <= maxT
+type boundedChunk struct {
+	chunkenc.Chunk
+	minT int64
+	maxT int64
+}
+
+func (b boundedChunk) Bytes() []byte {
+	xor := chunkenc.NewXORChunk()
+	a, _ := xor.Appender()
+	it := b.Iterator(nil)
+	for it.Next() {
+		t, v := it.At()
+		a.Append(t, v)
+	}
+	return xor.Bytes()
+}
+
+func (b boundedChunk) Iterator(iterator chunkenc.Iterator) chunkenc.Iterator {
+	it := b.Chunk.Iterator(iterator)
+	if it == nil {
+		panic("iterator shouldn't be nil")
+	}
+	return boundedIterator{it, b.minT, b.maxT}
+}
+
+var _ chunkenc.Iterator = &boundedIterator{}
+
+// boundedIterator is an implementation of Iterator that only iterates through
+// samples which timestamps are >= minT and <= maxT
+type boundedIterator struct {
+	chunkenc.Iterator
+	minT int64
+	maxT int64
+}
+
+// Next the first time its called it will advance as many positions as necessary
+// until its able to find a sample within the bounds minT and maxT.
+// If there are samples within bounds it will advance one by one amongst them.
+// If there are no samples within bounds it will return false.
+func (b boundedIterator) Next() bool {
+	for b.Iterator.Next() {
+		t, _ := b.Iterator.At()
+		if t < b.minT {
+			continue
+		} else if t > b.maxT {
+			return false
+		}
+		return true
+	}
+	return false
+}
+
+func (b boundedIterator) Seek(t int64) bool {
+	if t < b.minT {
+		// We must seek at least up to b.minT if it is asked for something before that.
+		ok := b.Iterator.Seek(b.minT)
+		if !ok {
+			return false
+		}
+		t, _ := b.Iterator.At()
+		return t <= b.maxT
+	}
+	if t > b.maxT {
+		// We seek anyway so that the subsequent Next() calls will also return false.
+		b.Iterator.Seek(t)
+		return false
+	}
+	return b.Iterator.Seek(t)
+}
+
+// safeChunk makes sure that the chunk can be accessed without a race condition
 type safeChunk struct {
 	chunkenc.Chunk
 	s               *memSeries
--- a/tsdb/head_read_test.go
+++ b/tsdb/head_read_test.go
@ -0,0 +1,178 @@
+// Copyright 2021 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/prometheus/prometheus/tsdb/chunkenc"
+)
+
+func TestBoundedChunk(t *testing.T) {
+	tests := []struct {
+		name           string
+		inputChunk     chunkenc.Chunk
+		inputMinT      int64
+		inputMaxT      int64
+		initialSeek    int64
+		seekIsASuccess bool
+		expSamples     []sample
+	}{
+		{
+			name:       "if there are no samples it returns nothing",
+			inputChunk: newTestChunk(0),
+			expSamples: nil,
+		},
+		{
+			name:       "bounds represent a single sample",
+			inputChunk: newTestChunk(10),
+			expSamples: []sample{
+				{0, 0},
+			},
+		},
+		{
+			name:       "if there are bounds set only samples within them are returned",
+			inputChunk: newTestChunk(10),
+			inputMinT:  1,
+			inputMaxT:  8,
+			expSamples: []sample{
+				{1, 1},
+				{2, 2},
+				{3, 3},
+				{4, 4},
+				{5, 5},
+				{6, 6},
+				{7, 7},
+				{8, 8},
+			},
+		},
+		{
+			name:       "if bounds set and only maxt is less than actual maxt",
+			inputChunk: newTestChunk(10),
+			inputMinT:  0,
+			inputMaxT:  5,
+			expSamples: []sample{
+				{0, 0},
+				{1, 1},
+				{2, 2},
+				{3, 3},
+				{4, 4},
+				{5, 5},
+			},
+		},
+		{
+			name:       "if bounds set and only mint is more than actual mint",
+			inputChunk: newTestChunk(10),
+			inputMinT:  5,
+			inputMaxT:  9,
+			expSamples: []sample{
+				{5, 5},
+				{6, 6},
+				{7, 7},
+				{8, 8},
+				{9, 9},
+			},
+		},
+		{
+			name:           "if there are bounds set with seek before mint",
+			inputChunk:     newTestChunk(10),
+			inputMinT:      3,
+			inputMaxT:      7,
+			initialSeek:    1,
+			seekIsASuccess: true,
+			expSamples: []sample{
+				{3, 3},
+				{4, 4},
+				{5, 5},
+				{6, 6},
+				{7, 7},
+			},
+		},
+		{
+			name:           "if there are bounds set with seek between mint and maxt",
+			inputChunk:     newTestChunk(10),
+			inputMinT:      3,
+			inputMaxT:      7,
+			initialSeek:    5,
+			seekIsASuccess: true,
+			expSamples: []sample{
+				{5, 5},
+				{6, 6},
+				{7, 7},
+			},
+		},
+		{
+			name:           "if there are bounds set with seek after maxt",
+			inputChunk:     newTestChunk(10),
+			inputMinT:      3,
+			inputMaxT:      7,
+			initialSeek:    8,
+			seekIsASuccess: false,
+		},
+	}
+	for _, tc := range tests {
+		t.Run(fmt.Sprintf("name=%s", tc.name), func(t *testing.T) {
+			chunk := boundedChunk{tc.inputChunk, tc.inputMinT, tc.inputMaxT}
+
+			// Testing Bytes()
+			expChunk := chunkenc.NewXORChunk()
+			if tc.inputChunk.NumSamples() > 0 {
+				app, err := expChunk.Appender()
+				require.NoError(t, err)
+				for ts := tc.inputMinT; ts <= tc.inputMaxT; ts++ {
+					app.Append(ts, float64(ts))
+				}
+			}
+			require.Equal(t, expChunk.Bytes(), chunk.Bytes())
+
+			var samples []sample
+			it := chunk.Iterator(nil)
+
+			if tc.initialSeek != 0 {
+				// Testing Seek()
+				ok := it.Seek(tc.initialSeek)
+				require.Equal(t, tc.seekIsASuccess, ok)
+				if ok {
+					t, v := it.At()
+					samples = append(samples, sample{t, v})
+				}
+			}
+
+			// Testing Next()
+			for it.Next() {
+				t, v := it.At()
+				samples = append(samples, sample{t, v})
+			}
+
+			// it.Next() should keep returning false.
+			for i := 0; i < 10; i++ {
+				require.False(t, it.Next())
+			}
+
+			require.Equal(t, tc.expSamples, samples)
+		})
+	}
+}
+
+func newTestChunk(numSamples int) chunkenc.Chunk {
+	xor := chunkenc.NewXORChunk()
+	a, _ := xor.Appender()
+	for i := 0; i < numSamples; i++ {
+		a.Append(int64(i), float64(i))
+	}
+	return xor
+}
--- a/tsdb/head_test.go
+++ b/tsdb/head_test.go
@ -49,7 +49,7 @@ import (
 	"github.com/prometheus/prometheus/tsdb/wal"
 )

-func newTestHead(t testing.TB, chunkRange int64, compressWAL bool) (*Head, *wal.WAL) {
+func newTestHead(t testing.TB, chunkRange int64, compressWAL, oooEnabled bool) (*Head, *wal.WAL) {
 	dir := t.TempDir()
 	wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, compressWAL)
 	require.NoError(t, err)
@ -59,18 +59,23 @@ func newTestHead(t testing.TB, chunkRange int64, compressWAL bool) (*Head, *wal.
 	opts.ChunkDirRoot = dir
 	opts.EnableExemplarStorage = true
 	opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars)
+	if oooEnabled {
+		opts.OutOfOrderTimeWindow.Store(10 * time.Minute.Milliseconds())
+	}

-	h, err := NewHead(nil, nil, wlog, opts, nil)
+	h, err := NewHead(nil, nil, wlog, nil, opts, nil)
 	require.NoError(t, err)

-	require.NoError(t, h.chunkDiskMapper.IterateAllChunks(func(_ chunks.HeadSeriesRef, _ chunks.ChunkDiskMapperRef, _, _ int64, _ uint16) error { return nil }))
+	require.NoError(t, h.chunkDiskMapper.IterateAllChunks(func(_ chunks.HeadSeriesRef, _ chunks.ChunkDiskMapperRef, _, _ int64, _ uint16, _ chunkenc.Encoding) error {
+		return nil
+	}))

 	return h, wlog
 }

 func BenchmarkCreateSeries(b *testing.B) {
 	series := genSeries(b.N, 10, 0, 0)
-	h, _ := newTestHead(b, 10000, false)
+	h, _ := newTestHead(b, 10000, false, false)
 	defer func() {
 		require.NoError(b, h.Close())
 	}()
@ -224,7 +229,7 @@ func BenchmarkLoadWAL(b *testing.B) {
 						require.NoError(b, err)
 						for k := 0; k < c.batches*c.seriesPerBatch; k++ {
 							// Create one mmapped chunk per series, with one sample at the given time.
-							s := newMemSeries(labels.Labels{}, chunks.HeadSeriesRef(k)*101, c.mmappedChunkT, defaultIsolationDisabled)
+							s := newMemSeries(labels.Labels{}, chunks.HeadSeriesRef(k)*101, c.mmappedChunkT, 1, defaultIsolationDisabled)
 							s.append(c.mmappedChunkT, 42, 0, chunkDiskMapper)
 							s.mmapCurrentHeadChunk(chunkDiskMapper)
 						}
@ -255,7 +260,7 @@ func BenchmarkLoadWAL(b *testing.B) {
 						opts := DefaultHeadOptions()
 						opts.ChunkRange = 1000
 						opts.ChunkDirRoot = w.Dir()
-						h, err := NewHead(nil, nil, w, opts, nil)
+						h, err := NewHead(nil, nil, w, nil, opts, nil)
 						require.NoError(b, err)
 						h.Init(0)
 					}
@ -271,7 +276,7 @@ func BenchmarkLoadWAL(b *testing.B) {
 // While appending the samples to the head it concurrently queries them from multiple go routines and verifies that the
 // returned results are correct.
 func TestHead_HighConcurrencyReadAndWrite(t *testing.T) {
-	head, _ := newTestHead(t, DefaultBlockDuration, false)
+	head, _ := newTestHead(t, DefaultBlockDuration, false, false)
 	defer func() {
 		require.NoError(t, head.Close())
 	}()
@ -487,7 +492,7 @@ func TestHead_ReadWAL(t *testing.T) {
 				},
 			}

-			head, w := newTestHead(t, 1000, compress)
+			head, w := newTestHead(t, 1000, compress, false)
 			defer func() {
 				require.NoError(t, head.Close())
 			}()
@ -531,7 +536,7 @@ func TestHead_ReadWAL(t *testing.T) {
 }

 func TestHead_WALMultiRef(t *testing.T) {
-	head, w := newTestHead(t, 1000, false)
+	head, w := newTestHead(t, 1000, false, false)

 	require.NoError(t, head.Init(0))

@ -572,7 +577,7 @@ func TestHead_WALMultiRef(t *testing.T) {
 	opts := DefaultHeadOptions()
 	opts.ChunkRange = 1000
 	opts.ChunkDirRoot = w.Dir()
-	head, err = NewHead(nil, nil, w, opts, nil)
+	head, err = NewHead(nil, nil, w, nil, opts, nil)
 	require.NoError(t, err)
 	require.NoError(t, head.Init(0))
 	defer func() {
@ -591,7 +596,7 @@ func TestHead_WALMultiRef(t *testing.T) {
 }

 func TestHead_ActiveAppenders(t *testing.T) {
-	head, _ := newTestHead(t, 1000, false)
+	head, _ := newTestHead(t, 1000, false, false)
 	defer head.Close()

 	require.NoError(t, head.Init(0))
@ -624,14 +629,14 @@ func TestHead_ActiveAppenders(t *testing.T) {
 }

 func TestHead_UnknownWALRecord(t *testing.T) {
-	head, w := newTestHead(t, 1000, false)
+	head, w := newTestHead(t, 1000, false, false)
 	w.Log([]byte{255, 42})
 	require.NoError(t, head.Init(0))
 	require.NoError(t, head.Close())
 }

 func TestHead_Truncate(t *testing.T) {
-	h, _ := newTestHead(t, 1000, false)
+	h, _ := newTestHead(t, 1000, false, false)
 	defer func() {
 		require.NoError(t, h.Close())
 	}()
@ -733,7 +738,7 @@ func TestMemSeries_truncateChunks(t *testing.T) {
 		},
 	}

-	s := newMemSeries(labels.FromStrings("a", "b"), 1, 2000, defaultIsolationDisabled)
+	s := newMemSeries(labels.FromStrings("a", "b"), 1, 2000, 1, defaultIsolationDisabled)

 	for i := 0; i < 4000; i += 5 {
 		ok, _ := s.append(int64(i), float64(i), 0, chunkDiskMapper)
@ -752,7 +757,7 @@ func TestMemSeries_truncateChunks(t *testing.T) {
 	require.NotNil(t, chk)
 	require.NoError(t, err)

-	s.truncateChunksBefore(2000)
+	s.truncateChunksBefore(2000, 0)

 	require.Equal(t, int64(2000), s.mmappedChunks[0].minTime)
 	_, _, err = s.chunk(0, chunkDiskMapper, &memChunkPool)
@ -789,7 +794,7 @@ func TestHeadDeleteSeriesWithoutSamples(t *testing.T) {
 					{Ref: 50, T: 90, V: 1},
 				},
 			}
-			head, w := newTestHead(t, 1000, compress)
+			head, w := newTestHead(t, 1000, compress, false)
 			defer func() {
 				require.NoError(t, head.Close())
 			}()
@ -857,7 +862,8 @@ func TestHeadDeleteSimple(t *testing.T) {
 	for _, compress := range []bool{false, true} {
 		t.Run(fmt.Sprintf("compress=%t", compress), func(t *testing.T) {
 			for _, c := range cases {
-				head, w := newTestHead(t, 1000, compress)
+				head, w := newTestHead(t, 1000, compress, false)
+				require.NoError(t, head.Init(0))

 				app := head.Appender(context.Background())
 				for _, smpl := range smplsAll {
@ -887,7 +893,7 @@ func TestHeadDeleteSimple(t *testing.T) {
 				opts := DefaultHeadOptions()
 				opts.ChunkRange = 1000
 				opts.ChunkDirRoot = reloadedW.Dir()
-				reloadedHead, err := NewHead(nil, nil, reloadedW, opts, nil)
+				reloadedHead, err := NewHead(nil, nil, reloadedW, nil, opts, nil)
 				require.NoError(t, err)
 				require.NoError(t, reloadedHead.Init(0))

@ -937,7 +943,7 @@ func TestHeadDeleteSimple(t *testing.T) {
 }

 func TestDeleteUntilCurMax(t *testing.T) {
-	hb, _ := newTestHead(t, 1000000, false)
+	hb, _ := newTestHead(t, 1000000, false, false)
 	defer func() {
 		require.NoError(t, hb.Close())
 	}()
@ -990,7 +996,7 @@ func TestDeletedSamplesAndSeriesStillInWALAfterCheckpoint(t *testing.T) {
 	numSamples := 10000

 	// Enough samples to cause a checkpoint.
-	hb, w := newTestHead(t, int64(numSamples)*10, false)
+	hb, w := newTestHead(t, int64(numSamples)*10, false, false)

 	for i := 0; i < numSamples; i++ {
 		app := hb.Appender(context.Background())
@ -1082,7 +1088,7 @@ func TestDelete_e2e(t *testing.T) {
 		seriesMap[labels.New(l...).String()] = []tsdbutil.Sample{}
 	}

-	hb, _ := newTestHead(t, 100000, false)
+	hb, _ := newTestHead(t, 100000, false, false)
 	defer func() {
 		require.NoError(t, hb.Close())
 	}()
@ -1271,7 +1277,7 @@ func TestMemSeries_append(t *testing.T) {
 		require.NoError(t, chunkDiskMapper.Close())
 	}()

-	s := newMemSeries(labels.Labels{}, 1, 500, defaultIsolationDisabled)
+	s := newMemSeries(labels.Labels{}, 1, 500, 1, defaultIsolationDisabled)

 	// Add first two samples at the very end of a chunk range and the next two
 	// on and after it.
@ -1325,7 +1331,7 @@ func TestMemSeries_append_atVariableRate(t *testing.T) {
 		require.NoError(t, chunkDiskMapper.Close())
 	})

-	s := newMemSeries(labels.Labels{}, 1, DefaultBlockDuration, defaultIsolationDisabled)
+	s := newMemSeries(labels.Labels{}, 1, DefaultBlockDuration, 0, defaultIsolationDisabled)

 	// At this slow rate, we will fill the chunk in two block durations.
 	slowRate := (DefaultBlockDuration * 2) / samplesPerChunk
@ -1361,7 +1367,7 @@ func TestMemSeries_append_atVariableRate(t *testing.T) {

 func TestGCChunkAccess(t *testing.T) {
 	// Put a chunk, select it. GC it and then access it.
-	h, _ := newTestHead(t, 1000, false)
+	h, _ := newTestHead(t, 1000, false, false)
 	defer func() {
 		require.NoError(t, h.Close())
 	}()
@ -1398,22 +1404,22 @@ func TestGCChunkAccess(t *testing.T) {

 	cr, err := h.chunksRange(0, 1500, nil)
 	require.NoError(t, err)
-	_, err = cr.Chunk(chunks[0].Ref)
+	_, err = cr.Chunk(chunks[0])
 	require.NoError(t, err)
-	_, err = cr.Chunk(chunks[1].Ref)
+	_, err = cr.Chunk(chunks[1])
 	require.NoError(t, err)

 	require.NoError(t, h.Truncate(1500)) // Remove a chunk.

-	_, err = cr.Chunk(chunks[0].Ref)
+	_, err = cr.Chunk(chunks[0])
 	require.Equal(t, storage.ErrNotFound, err)
-	_, err = cr.Chunk(chunks[1].Ref)
+	_, err = cr.Chunk(chunks[1])
 	require.NoError(t, err)
 }

 func TestGCSeriesAccess(t *testing.T) {
 	// Put a series, select it. GC it and then access it.
-	h, _ := newTestHead(t, 1000, false)
+	h, _ := newTestHead(t, 1000, false, false)
 	defer func() {
 		require.NoError(t, h.Close())
 	}()
@ -1450,23 +1456,23 @@ func TestGCSeriesAccess(t *testing.T) {

 	cr, err := h.chunksRange(0, 2000, nil)
 	require.NoError(t, err)
-	_, err = cr.Chunk(chunks[0].Ref)
+	_, err = cr.Chunk(chunks[0])
 	require.NoError(t, err)
-	_, err = cr.Chunk(chunks[1].Ref)
+	_, err = cr.Chunk(chunks[1])
 	require.NoError(t, err)

 	require.NoError(t, h.Truncate(2000)) // Remove the series.

 	require.Equal(t, (*memSeries)(nil), h.series.getByID(1))

-	_, err = cr.Chunk(chunks[0].Ref)
+	_, err = cr.Chunk(chunks[0])
 	require.Equal(t, storage.ErrNotFound, err)
-	_, err = cr.Chunk(chunks[1].Ref)
+	_, err = cr.Chunk(chunks[1])
 	require.Equal(t, storage.ErrNotFound, err)
 }

 func TestUncommittedSamplesNotLostOnTruncate(t *testing.T) {
-	h, _ := newTestHead(t, 1000, false)
+	h, _ := newTestHead(t, 1000, false, false)
 	defer func() {
 		require.NoError(t, h.Close())
 	}()
@ -1496,7 +1502,7 @@ func TestUncommittedSamplesNotLostOnTruncate(t *testing.T) {
 }

 func TestRemoveSeriesAfterRollbackAndTruncate(t *testing.T) {
-	h, _ := newTestHead(t, 1000, false)
+	h, _ := newTestHead(t, 1000, false, false)
 	defer func() {
 		require.NoError(t, h.Close())
 	}()
@ -1529,7 +1535,7 @@ func TestRemoveSeriesAfterRollbackAndTruncate(t *testing.T) {
 func TestHead_LogRollback(t *testing.T) {
 	for _, compress := range []bool{false, true} {
 		t.Run(fmt.Sprintf("compress=%t", compress), func(t *testing.T) {
-			h, w := newTestHead(t, 1000, compress)
+			h, w := newTestHead(t, 1000, compress, false)
 			defer func() {
 				require.NoError(t, h.Close())
 			}()
@ -1606,7 +1612,7 @@ func TestWalRepair_DecodingError(t *testing.T) {
 					opts := DefaultHeadOptions()
 					opts.ChunkRange = 1
 					opts.ChunkDirRoot = w.Dir()
-					h, err := NewHead(nil, nil, w, opts, nil)
+					h, err := NewHead(nil, nil, w, nil, opts, nil)
 					require.NoError(t, err)
 					require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal))
 					initErr := h.Init(math.MinInt64)
@ -1660,7 +1666,8 @@ func TestHeadReadWriterRepair(t *testing.T) {
 		opts := DefaultHeadOptions()
 		opts.ChunkRange = chunkRange
 		opts.ChunkDirRoot = dir
-		h, err := NewHead(nil, nil, w, opts, nil)
+		opts.ChunkWriteQueueSize = 1 // We need to set this option so that we use the async queue. Upstream prometheus uses the queue directly.
+		h, err := NewHead(nil, nil, w, nil, opts, nil)
 		require.NoError(t, err)
 		require.Equal(t, 0.0, prom_testutil.ToFloat64(h.metrics.mmapChunkCorruptionTotal))
 		require.NoError(t, h.Init(math.MinInt64))
@ -1715,7 +1722,7 @@ func TestHeadReadWriterRepair(t *testing.T) {
 }

 func TestNewWalSegmentOnTruncate(t *testing.T) {
-	h, wlog := newTestHead(t, 1000, false)
+	h, wlog := newTestHead(t, 1000, false, false)
 	defer func() {
 		require.NoError(t, h.Close())
 	}()
@ -1745,7 +1752,7 @@ func TestNewWalSegmentOnTruncate(t *testing.T) {
 }

 func TestAddDuplicateLabelName(t *testing.T) {
-	h, _ := newTestHead(t, 1000, false)
+	h, _ := newTestHead(t, 1000, false, false)
 	defer func() {
 		require.NoError(t, h.Close())
 	}()
@ -1828,7 +1835,7 @@ func TestMemSeriesIsolation(t *testing.T) {
 	}

 	// Test isolation without restart of Head.
-	hb, _ := newTestHead(t, 1000, false)
+	hb, _ := newTestHead(t, 1000, false, false)
 	i := addSamples(hb)
 	testIsolation(hb, i)

@ -1890,7 +1897,7 @@ func TestMemSeriesIsolation(t *testing.T) {
 	require.NoError(t, hb.Close())

 	// Test isolation with restart of Head. This is to verify the num samples of chunks after m-map chunk replay.
-	hb, w := newTestHead(t, 1000, false)
+	hb, w := newTestHead(t, 1000, false, false)
 	i = addSamples(hb)
 	require.NoError(t, hb.Close())

@ -1899,7 +1906,7 @@ func TestMemSeriesIsolation(t *testing.T) {
 	opts := DefaultHeadOptions()
 	opts.ChunkRange = 1000
 	opts.ChunkDirRoot = wlog.Dir()
-	hb, err = NewHead(nil, nil, wlog, opts, nil)
+	hb, err = NewHead(nil, nil, wlog, nil, opts, nil)
 	defer func() { require.NoError(t, hb.Close()) }()
 	require.NoError(t, err)
 	require.NoError(t, hb.Init(0))
@ -1943,7 +1950,7 @@ func TestIsolationRollback(t *testing.T) {
 	}

 	// Rollback after a failed append and test if the low watermark has progressed anyway.
-	hb, _ := newTestHead(t, 1000, false)
+	hb, _ := newTestHead(t, 1000, false, false)
 	defer func() {
 		require.NoError(t, hb.Close())
 	}()
@ -1974,7 +1981,7 @@ func TestIsolationLowWatermarkMonotonous(t *testing.T) {
 		t.Skip("skipping test since tsdb isolation is disabled")
 	}

-	hb, _ := newTestHead(t, 1000, false)
+	hb, _ := newTestHead(t, 1000, false, false)
 	defer func() {
 		require.NoError(t, hb.Close())
 	}()
@ -2011,7 +2018,7 @@ func TestIsolationAppendIDZeroIsNoop(t *testing.T) {
 		t.Skip("skipping test since tsdb isolation is disabled")
 	}

-	h, _ := newTestHead(t, 1000, false)
+	h, _ := newTestHead(t, 1000, false, false)
 	defer func() {
 		require.NoError(t, h.Close())
 	}()
@ -2036,7 +2043,7 @@ func TestIsolationWithoutAdd(t *testing.T) {
 		t.Skip("skipping test since tsdb isolation is disabled")
 	}

-	hb, _ := newTestHead(t, 1000, false)
+	hb, _ := newTestHead(t, 1000, false, false)
 	defer func() {
 		require.NoError(t, hb.Close())
 	}()
@ -2131,7 +2138,7 @@ func TestOutOfOrderSamplesMetric(t *testing.T) {
 }

 func testHeadSeriesChunkRace(t *testing.T) {
-	h, _ := newTestHead(t, 1000, false)
+	h, _ := newTestHead(t, 1000, false, false)
 	defer func() {
 		require.NoError(t, h.Close())
 	}()
@ -2166,7 +2173,7 @@ func testHeadSeriesChunkRace(t *testing.T) {
 }

 func TestHeadLabelNamesValuesWithMinMaxRange(t *testing.T) {
-	head, _ := newTestHead(t, 1000, false)
+	head, _ := newTestHead(t, 1000, false, false)
 	defer func() {
 		require.NoError(t, head.Close())
 	}()
@ -2226,7 +2233,7 @@ func TestHeadLabelNamesValuesWithMinMaxRange(t *testing.T) {
 }

 func TestHeadLabelValuesWithMatchers(t *testing.T) {
-	head, _ := newTestHead(t, 1000, false)
+	head, _ := newTestHead(t, 1000, false, false)
 	t.Cleanup(func() { require.NoError(t, head.Close()) })

 	app := head.Appender(context.Background())
@ -2285,7 +2292,7 @@ func TestHeadLabelValuesWithMatchers(t *testing.T) {
 }

 func TestHeadLabelNamesWithMatchers(t *testing.T) {
-	head, _ := newTestHead(t, 1000, false)
+	head, _ := newTestHead(t, 1000, false, false)
 	defer func() {
 		require.NoError(t, head.Close())
 	}()
@ -2353,7 +2360,7 @@ func TestHeadLabelNamesWithMatchers(t *testing.T) {
 }

 func TestErrReuseAppender(t *testing.T) {
-	head, _ := newTestHead(t, 1000, false)
+	head, _ := newTestHead(t, 1000, false, false)
 	defer func() {
 		require.NoError(t, head.Close())
 	}()
@ -2389,7 +2396,7 @@ func TestErrReuseAppender(t *testing.T) {

 func TestHeadMintAfterTruncation(t *testing.T) {
 	chunkRange := int64(2000)
-	head, _ := newTestHead(t, chunkRange, false)
+	head, _ := newTestHead(t, chunkRange, false, false)

 	app := head.Appender(context.Background())
 	_, err := app.Append(0, labels.FromStrings("a", "b"), 100, 100)
@ -2423,7 +2430,7 @@ func TestHeadMintAfterTruncation(t *testing.T) {

 func TestHeadExemplars(t *testing.T) {
 	chunkRange := int64(2000)
-	head, _ := newTestHead(t, chunkRange, false)
+	head, _ := newTestHead(t, chunkRange, false, false)
 	app := head.Appender(context.Background())

 	l := labels.FromStrings("traceId", "123")
@ -2445,7 +2452,7 @@ func TestHeadExemplars(t *testing.T) {

 func BenchmarkHeadLabelValuesWithMatchers(b *testing.B) {
 	chunkRange := int64(2000)
-	head, _ := newTestHead(b, chunkRange, false)
+	head, _ := newTestHead(b, chunkRange, false, false)
 	b.Cleanup(func() { require.NoError(b, head.Close()) })

 	app := head.Appender(context.Background())
@ -2483,7 +2490,7 @@ func TestMemSafeIteratorSeekIntoBuffer(t *testing.T) {
 		require.NoError(t, chunkDiskMapper.Close())
 	}()

-	s := newMemSeries(labels.Labels{}, 1, 500, defaultIsolationDisabled)
+	s := newMemSeries(labels.Labels{}, 1, 500, 1, defaultIsolationDisabled)

 	for i := 0; i < 7; i++ {
 		ok, _ := s.append(int64(i), float64(i), 0, chunkDiskMapper)
@ -2754,7 +2761,7 @@ func TestWaitForPendingReadersInTimeRange(t *testing.T) {
 }

 func TestChunkSnapshot(t *testing.T) {
-	head, _ := newTestHead(t, 120*4, false)
+	head, _ := newTestHead(t, 120*4, false, false)
 	defer func() {
 		head.opts.EnableMemorySnapshotOnShutdown = false
 		require.NoError(t, head.Close())
@ -2833,7 +2840,7 @@ func TestChunkSnapshot(t *testing.T) {
 	openHeadAndCheckReplay := func() {
 		w, err := wal.NewSize(nil, nil, head.wal.Dir(), 32768, false)
 		require.NoError(t, err)
-		head, err = NewHead(nil, nil, w, head.opts, nil)
+		head, err = NewHead(nil, nil, w, nil, head.opts, nil)
 		require.NoError(t, err)
 		require.NoError(t, head.Init(math.MinInt64))

@ -2996,7 +3003,7 @@ func TestChunkSnapshot(t *testing.T) {
 }

 func TestSnapshotError(t *testing.T) {
-	head, _ := newTestHead(t, 120*4, false)
+	head, _ := newTestHead(t, 120*4, false, false)
 	defer func() {
 		head.opts.EnableMemorySnapshotOnShutdown = false
 		require.NoError(t, head.Close())
@ -3043,7 +3050,7 @@ func TestSnapshotError(t *testing.T) {
 	w, err := wal.NewSize(nil, nil, head.wal.Dir(), 32768, false)
 	require.NoError(t, err)
 	// Testing https://github.com/prometheus/prometheus/issues/9437 with the registry.
-	head, err = NewHead(prometheus.NewRegistry(), nil, w, head.opts, nil)
+	head, err = NewHead(prometheus.NewRegistry(), nil, w, nil, head.opts, nil)
 	require.NoError(t, err)
 	require.NoError(t, head.Init(math.MinInt64))

@ -3102,7 +3109,7 @@ func TestChunkSnapshotReplayBug(t *testing.T) {
 	opts := DefaultHeadOptions()
 	opts.ChunkDirRoot = dir
 	opts.EnableMemorySnapshotOnShutdown = true
-	head, err := NewHead(nil, nil, wlog, opts, nil)
+	head, err := NewHead(nil, nil, wlog, nil, opts, nil)
 	require.NoError(t, err)
 	require.NoError(t, head.Init(math.MinInt64))
 	defer func() {
@ -3136,7 +3143,7 @@ func TestChunkSnapshotTakenAfterIncompleteSnapshot(t *testing.T) {
 	opts := DefaultHeadOptions()
 	opts.ChunkDirRoot = dir
 	opts.EnableMemorySnapshotOnShutdown = true
-	head, err := NewHead(nil, nil, wlog, opts, nil)
+	head, err := NewHead(nil, nil, wlog, nil, opts, nil)
 	require.NoError(t, err)
 	require.NoError(t, head.Init(math.MinInt64))

@ -3159,6 +3166,251 @@ func TestChunkSnapshotTakenAfterIncompleteSnapshot(t *testing.T) {
 	require.Greater(t, offset, 0)
 }

+// TestOOOWalReplay checks the replay at a low level.
+// TODO(codesome): Needs test for ooo WAL repair.
+func TestOOOWalReplay(t *testing.T) {
+	dir := t.TempDir()
+	wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
+	require.NoError(t, err)
+	oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
+	require.NoError(t, err)
+
+	opts := DefaultHeadOptions()
+	opts.ChunkRange = 1000
+	opts.ChunkDirRoot = dir
+	opts.OutOfOrderTimeWindow.Store(30 * time.Minute.Milliseconds())
+
+	h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
+	require.NoError(t, err)
+	require.NoError(t, h.Init(0))
+
+	var expOOOSamples []sample
+	l := labels.FromStrings("foo", "bar")
+	appendSample := func(mins int64, isOOO bool) {
+		app := h.Appender(context.Background())
+		ts, v := mins*time.Minute.Milliseconds(), float64(mins)
+		_, err := app.Append(0, l, ts, v)
+		require.NoError(t, err)
+		require.NoError(t, app.Commit())
+
+		if isOOO {
+			expOOOSamples = append(expOOOSamples, sample{t: ts, v: v})
+		}
+	}
+
+	// In-order sample.
+	appendSample(60, false)
+
+	// Out of order samples.
+	appendSample(40, true)
+	appendSample(35, true)
+	appendSample(50, true)
+	appendSample(55, true)
+	appendSample(59, true)
+	appendSample(31, true)
+
+	// Check that Head's time ranges are set properly.
+	require.Equal(t, 60*time.Minute.Milliseconds(), h.MinTime())
+	require.Equal(t, 60*time.Minute.Milliseconds(), h.MaxTime())
+	require.Equal(t, 31*time.Minute.Milliseconds(), h.MinOOOTime())
+	require.Equal(t, 59*time.Minute.Milliseconds(), h.MaxOOOTime())
+
+	// Restart head.
+	require.NoError(t, h.Close())
+	wlog, err = wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
+	require.NoError(t, err)
+	oooWlog, err = wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
+	require.NoError(t, err)
+	h, err = NewHead(nil, nil, wlog, oooWlog, opts, nil)
+	require.NoError(t, err)
+	require.NoError(t, h.Init(0)) // Replay happens here.
+
+	// Get the ooo samples from the Head.
+	ms, ok, err := h.getOrCreate(l.Hash(), l)
+	require.NoError(t, err)
+	require.False(t, ok)
+	require.NotNil(t, ms)
+
+	xor, err := ms.oooHeadChunk.chunk.ToXOR()
+	require.NoError(t, err)
+
+	it := xor.Iterator(nil)
+	actOOOSamples := make([]sample, 0, len(expOOOSamples))
+	for it.Next() {
+		ts, v := it.At()
+		actOOOSamples = append(actOOOSamples, sample{t: ts, v: v})
+	}
+
+	// OOO chunk will be sorted. Hence sort the expected samples.
+	sort.Slice(expOOOSamples, func(i, j int) bool {
+		return expOOOSamples[i].t < expOOOSamples[j].t
+	})
+
+	require.Equal(t, expOOOSamples, actOOOSamples)
+
+	require.NoError(t, h.Close())
+}
+
+// TestOOOMmapReplay checks the replay at a low level.
+func TestOOOMmapReplay(t *testing.T) {
+	dir := t.TempDir()
+	wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
+	require.NoError(t, err)
+	oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
+	require.NoError(t, err)
+
+	opts := DefaultHeadOptions()
+	opts.ChunkRange = 1000
+	opts.ChunkDirRoot = dir
+	opts.OutOfOrderCapMax.Store(30)
+	opts.OutOfOrderTimeWindow.Store(1000 * time.Minute.Milliseconds())
+
+	h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
+	require.NoError(t, err)
+	require.NoError(t, h.Init(0))
+
+	l := labels.FromStrings("foo", "bar")
+	appendSample := func(mins int64) {
+		app := h.Appender(context.Background())
+		ts, v := mins*time.Minute.Milliseconds(), float64(mins)
+		_, err := app.Append(0, l, ts, v)
+		require.NoError(t, err)
+		require.NoError(t, app.Commit())
+	}
+
+	// In-order sample.
+	appendSample(200)
+
+	// Out of order samples. 92 samples to create 3 m-map chunks.
+	for mins := int64(100); mins <= 191; mins++ {
+		appendSample(mins)
+	}
+
+	ms, ok, err := h.getOrCreate(l.Hash(), l)
+	require.NoError(t, err)
+	require.False(t, ok)
+	require.NotNil(t, ms)
+
+	require.Len(t, ms.oooMmappedChunks, 3)
+	// Verify that we can access the chunks without error.
+	for _, m := range ms.oooMmappedChunks {
+		chk, err := h.chunkDiskMapper.Chunk(m.ref)
+		require.NoError(t, err)
+		require.Equal(t, int(m.numSamples), chk.NumSamples())
+	}
+
+	expMmapChunks := make([]*mmappedChunk, 3)
+	copy(expMmapChunks, ms.oooMmappedChunks)
+
+	// Restart head.
+	require.NoError(t, h.Close())
+
+	wlog, err = wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
+	require.NoError(t, err)
+	oooWlog, err = wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
+	require.NoError(t, err)
+	h, err = NewHead(nil, nil, wlog, oooWlog, opts, nil)
+	require.NoError(t, err)
+	require.NoError(t, h.Init(0)) // Replay happens here.
+
+	// Get the mmap chunks from the Head.
+	ms, ok, err = h.getOrCreate(l.Hash(), l)
+	require.NoError(t, err)
+	require.False(t, ok)
+	require.NotNil(t, ms)
+
+	require.Len(t, ms.oooMmappedChunks, len(expMmapChunks))
+	// Verify that we can access the chunks without error.
+	for _, m := range ms.oooMmappedChunks {
+		chk, err := h.chunkDiskMapper.Chunk(m.ref)
+		require.NoError(t, err)
+		require.Equal(t, int(m.numSamples), chk.NumSamples())
+	}
+
+	actMmapChunks := make([]*mmappedChunk, len(expMmapChunks))
+	copy(actMmapChunks, ms.oooMmappedChunks)
+
+	require.Equal(t, expMmapChunks, actMmapChunks)
+
+	require.NoError(t, h.Close())
+}
+
+func TestHeadInit_DiscardChunksWithUnsupportedEncoding(t *testing.T) {
+	h, _ := newTestHead(t, 1000, false, false)
+	defer func() {
+		require.NoError(t, h.Close())
+	}()
+
+	require.NoError(t, h.Init(0))
+
+	ctx := context.Background()
+	app := h.Appender(ctx)
+	seriesLabels := labels.FromStrings("a", "1")
+	var seriesRef storage.SeriesRef
+	var err error
+	for i := 0; i < 400; i++ {
+		seriesRef, err = app.Append(0, seriesLabels, int64(i), float64(i))
+		require.NoError(t, err)
+	}
+
+	require.NoError(t, app.Commit())
+	require.Greater(t, prom_testutil.ToFloat64(h.metrics.chunksCreated), 1.0)
+
+	uc := newUnsupportedChunk()
+	// Make this chunk not overlap with the previous and the next
+	h.chunkDiskMapper.WriteChunk(chunks.HeadSeriesRef(seriesRef), 500, 600, uc, func(err error) { require.NoError(t, err) })
+
+	app = h.Appender(ctx)
+	for i := 700; i < 1200; i++ {
+		_, err := app.Append(0, seriesLabels, int64(i), float64(i))
+		require.NoError(t, err)
+	}
+
+	require.NoError(t, app.Commit())
+	require.Greater(t, prom_testutil.ToFloat64(h.metrics.chunksCreated), 4.0)
+
+	series, created, err := h.getOrCreate(seriesLabels.Hash(), seriesLabels)
+	require.NoError(t, err)
+	require.False(t, created, "should already exist")
+	require.NotNil(t, series, "should return the series we created above")
+
+	expChunks := make([]*mmappedChunk, len(series.mmappedChunks))
+	copy(expChunks, series.mmappedChunks)
+
+	require.NoError(t, h.Close())
+
+	wlog, err := wal.NewSize(nil, nil, filepath.Join(h.opts.ChunkDirRoot, "wal"), 32768, false)
+	require.NoError(t, err)
+	h, err = NewHead(nil, nil, wlog, nil, h.opts, nil)
+	require.NoError(t, err)
+	require.NoError(t, h.Init(0))
+
+	series, created, err = h.getOrCreate(seriesLabels.Hash(), seriesLabels)
+	require.NoError(t, err)
+	require.False(t, created, "should already exist")
+	require.NotNil(t, series, "should return the series we created above")
+
+	require.Equal(t, expChunks, series.mmappedChunks)
+}
+
+const (
+	UnsupportedMask   = 0b10000000
+	EncUnsupportedXOR = chunkenc.EncXOR | UnsupportedMask
+)
+
+// unsupportedChunk holds a XORChunk and overrides the Encoding() method.
+type unsupportedChunk struct {
+	*chunkenc.XORChunk
+}
+
+func newUnsupportedChunk() *unsupportedChunk {
+	return &unsupportedChunk{chunkenc.NewXORChunk()}
+}
+
+func (c *unsupportedChunk) Encoding() chunkenc.Encoding {
+	return EncUnsupportedXOR
+}
+
 // Tests https://github.com/prometheus/prometheus/issues/10277.
 func TestMmapPanicAfterMmapReplayCorruption(t *testing.T) {
 	dir := t.TempDir()
@ -3171,7 +3423,7 @@ func TestMmapPanicAfterMmapReplayCorruption(t *testing.T) {
 	opts.EnableExemplarStorage = true
 	opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars)

-	h, err := NewHead(nil, nil, wlog, opts, nil)
+	h, err := NewHead(nil, nil, wlog, nil, opts, nil)
 	require.NoError(t, err)
 	require.NoError(t, h.Init(0))

@ -3205,7 +3457,7 @@ func TestMmapPanicAfterMmapReplayCorruption(t *testing.T) {
 	require.NoError(t, err)
 	require.NoError(t, f.Close())

-	h, err = NewHead(nil, nil, wlog, opts, nil)
+	h, err = NewHead(nil, nil, wlog, nil, opts, nil)
 	require.NoError(t, err)
 	require.NoError(t, h.Init(0))

@ -3230,7 +3482,7 @@ func TestReplayAfterMmapReplayError(t *testing.T) {
 		opts.EnableMemorySnapshotOnShutdown = true
 		opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars)

-		h, err = NewHead(nil, nil, wlog, opts, nil)
+		h, err = NewHead(nil, nil, wlog, nil, opts, nil)
 		require.NoError(t, err)
 		require.NoError(t, h.Init(0))
 	}
@ -3292,3 +3544,131 @@ func TestReplayAfterMmapReplayError(t *testing.T) {

 	require.NoError(t, h.Close())
 }
+
+func TestOOOAppendWithNoSeries(t *testing.T) {
+	dir := t.TempDir()
+	wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
+	require.NoError(t, err)
+	oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
+	require.NoError(t, err)
+
+	opts := DefaultHeadOptions()
+	opts.ChunkDirRoot = dir
+	opts.OutOfOrderCapMax.Store(30)
+	opts.OutOfOrderTimeWindow.Store(120 * time.Minute.Milliseconds())
+
+	h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
+	require.NoError(t, err)
+	t.Cleanup(func() {
+		require.NoError(t, h.Close())
+	})
+	require.NoError(t, h.Init(0))
+
+	appendSample := func(lbls labels.Labels, ts int64) {
+		app := h.Appender(context.Background())
+		_, err := app.Append(0, lbls, ts*time.Minute.Milliseconds(), float64(ts))
+		require.NoError(t, err)
+		require.NoError(t, app.Commit())
+	}
+
+	verifyOOOSamples := func(lbls labels.Labels, expSamples int) {
+		ms, created, err := h.getOrCreate(lbls.Hash(), lbls)
+		require.NoError(t, err)
+		require.False(t, created)
+		require.NotNil(t, ms)
+
+		require.Nil(t, ms.headChunk)
+		require.NotNil(t, ms.oooHeadChunk)
+		require.Equal(t, expSamples, ms.oooHeadChunk.chunk.NumSamples())
+	}
+
+	verifyInOrderSamples := func(lbls labels.Labels, expSamples int) {
+		ms, created, err := h.getOrCreate(lbls.Hash(), lbls)
+		require.NoError(t, err)
+		require.False(t, created)
+		require.NotNil(t, ms)
+
+		require.Nil(t, ms.oooHeadChunk)
+		require.NotNil(t, ms.headChunk)
+		require.Equal(t, expSamples, ms.headChunk.chunk.NumSamples())
+	}
+
+	newLabels := func(idx int) labels.Labels { return labels.FromStrings("foo", fmt.Sprintf("%d", idx)) }
+
+	s1 := newLabels(1)
+	appendSample(s1, 300) // At 300m.
+	verifyInOrderSamples(s1, 1)
+
+	// At 239m, the sample cannot be appended to in-order chunk since it is
+	// beyond the minValidTime. So it should go in OOO chunk.
+	// Series does not exist for s2 yet.
+	s2 := newLabels(2)
+	appendSample(s2, 239) // OOO sample.
+	verifyOOOSamples(s2, 1)
+
+	// Similar for 180m.
+	s3 := newLabels(3)
+	appendSample(s3, 180) // OOO sample.
+	verifyOOOSamples(s3, 1)
+
+	// Now 179m is too old.
+	s4 := newLabels(4)
+	app := h.Appender(context.Background())
+	_, err = app.Append(0, s4, 179*time.Minute.Milliseconds(), float64(179))
+	require.Equal(t, storage.ErrTooOldSample, err)
+	require.NoError(t, app.Rollback())
+	verifyOOOSamples(s3, 1)
+
+	// Samples still go into in-order chunk for samples within
+	// appendable minValidTime.
+	s5 := newLabels(5)
+	appendSample(s5, 240)
+	verifyInOrderSamples(s5, 1)
+}
+
+func TestHeadMinOOOTimeUpdate(t *testing.T) {
+	dir := t.TempDir()
+	wlog, err := wal.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, true)
+	require.NoError(t, err)
+	oooWlog, err := wal.NewSize(nil, nil, filepath.Join(dir, wal.WblDirName), 32768, true)
+	require.NoError(t, err)
+
+	opts := DefaultHeadOptions()
+	opts.ChunkDirRoot = dir
+	opts.OutOfOrderTimeWindow.Store(10 * time.Minute.Milliseconds())
+
+	h, err := NewHead(nil, nil, wlog, oooWlog, opts, nil)
+	require.NoError(t, err)
+	t.Cleanup(func() {
+		require.NoError(t, h.Close())
+	})
+	require.NoError(t, h.Init(0))
+
+	appendSample := func(ts int64) {
+		lbls := labels.FromStrings("foo", "bar")
+		app := h.Appender(context.Background())
+		_, err := app.Append(0, lbls, ts*time.Minute.Milliseconds(), float64(ts))
+		require.NoError(t, err)
+		require.NoError(t, app.Commit())
+	}
+
+	appendSample(300) // In-order sample.
+
+	require.Equal(t, int64(math.MaxInt64), h.MinOOOTime())
+
+	appendSample(295) // OOO sample.
+	require.Equal(t, 295*time.Minute.Milliseconds(), h.MinOOOTime())
+
+	// Allowed window for OOO is >=290, which is before the earliest ooo sample 295, so it gets set to the lower value.
+	require.NoError(t, h.truncateOOO(0, 1))
+	require.Equal(t, 290*time.Minute.Milliseconds(), h.MinOOOTime())
+
+	appendSample(310) // In-order sample.
+	appendSample(305) // OOO sample.
+	require.Equal(t, 290*time.Minute.Milliseconds(), h.MinOOOTime())
+
+	// Now the OOO sample 295 was not gc'ed yet. And allowed window for OOO is now >=300.
+	// So the lowest among them, 295, is set as minOOOTime.
+	require.NoError(t, h.truncateOOO(0, 2))
+	require.Equal(t, 295*time.Minute.Milliseconds(), h.MinOOOTime())
+}
--- a/tsdb/head_wal.go
+++ b/tsdb/head_wal.go
@ -42,7 +42,7 @@ import (
 	"github.com/prometheus/prometheus/tsdb/wal"
 )

-func (h *Head) loadWAL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, mmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (err error) {
+func (h *Head) loadWAL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, mmappedChunks, oooMmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (err error) {
 	// Track number of samples that referenced a series we don't know about
 	// for error reporting.
 	var unknownRefs atomic.Uint64
@ -107,7 +107,7 @@ func (h *Head) loadWAL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.H
 		processors[i].setup()

 		go func(wp *walSubsetProcessor) {
-			unknown, overlapping := wp.processWALSamples(h, mmappedChunks)
+			unknown, overlapping := wp.processWALSamples(h, mmappedChunks, oooMmappedChunks)
 			unknownRefs.Add(unknown)
 			mmapOverlappingChunks.Add(overlapping)
 			wg.Done()
@ -343,7 +343,7 @@ Outer:
 }

 // resetSeriesWithMMappedChunks is only used during the WAL replay.
-func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc []*mmappedChunk, walSeriesRef chunks.HeadSeriesRef) (overlapped bool) {
+func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc, oooMmc []*mmappedChunk, walSeriesRef chunks.HeadSeriesRef) (overlapped bool) {
 	if mSeries.ref != walSeriesRef {
 		// Checking if the new m-mapped chunks overlap with the already existing ones.
 		if len(mSeries.mmappedChunks) > 0 && len(mmc) > 0 {
@ -368,10 +368,11 @@ func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc []*mmappedCh
 		}
 	}

-	h.metrics.chunksCreated.Add(float64(len(mmc)))
+	h.metrics.chunksCreated.Add(float64(len(mmc) + len(oooMmc)))
 	h.metrics.chunksRemoved.Add(float64(len(mSeries.mmappedChunks)))
-	h.metrics.chunks.Add(float64(len(mmc) - len(mSeries.mmappedChunks)))
+	h.metrics.chunks.Add(float64(len(mmc) + len(oooMmc) - len(mSeries.mmappedChunks)))
 	mSeries.mmappedChunks = mmc
+	mSeries.oooMmappedChunks = oooMmc
 	// Cache the last mmapped chunk time, so we can skip calling append() for samples it will reject.
 	if len(mmc) == 0 {
 		mSeries.mmMaxTime = math.MinInt64
@ -379,6 +380,19 @@ func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc []*mmappedCh
 		mSeries.mmMaxTime = mmc[len(mmc)-1].maxTime
 		h.updateMinMaxTime(mmc[0].minTime, mSeries.mmMaxTime)
 	}
+	if len(oooMmc) != 0 {
+		// Mint and maxt can be in any chunk, they are not sorted.
+		mint, maxt := int64(math.MaxInt64), int64(math.MinInt64)
+		for _, ch := range oooMmc {
+			if ch.minTime < mint {
+				mint = ch.minTime
+			}
+			if ch.maxTime > maxt {
+				maxt = ch.maxTime
+			}
+		}
+		h.updateMinOOOMaxOOOTime(mint, maxt)
+	}

 	// Any samples replayed till now would already be compacted. Resetting the head chunk.
 	mSeries.nextAt = 0
@ -421,7 +435,7 @@ func (wp *walSubsetProcessor) reuseBuf() []record.RefSample {

 // processWALSamples adds the samples it receives to the head and passes
 // the buffer received to an output channel for reuse.
-func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (unknownRefs, mmapOverlappingChunks uint64) {
+func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks, oooMmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk) (unknownRefs, mmapOverlappingChunks uint64) {
 	defer close(wp.output)

 	mint, maxt := int64(math.MaxInt64), int64(math.MinInt64)
@ -429,7 +443,8 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks map[chunk
 	for in := range wp.input {
 		if in.existingSeries != nil {
 			mmc := mmappedChunks[in.walSeriesRef]
-			if h.resetSeriesWithMMappedChunks(in.existingSeries, mmc, in.walSeriesRef) {
+			oooMmc := oooMmappedChunks[in.walSeriesRef]
+			if h.resetSeriesWithMMappedChunks(in.existingSeries, mmc, oooMmc, in.walSeriesRef) {
 				mmapOverlappingChunks++
 			}
 			continue
@ -465,6 +480,292 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks map[chunk
 	return unknownRefs, mmapOverlappingChunks
 }

+func (h *Head) loadWBL(r *wal.Reader, multiRef map[chunks.HeadSeriesRef]chunks.HeadSeriesRef, lastMmapRef chunks.ChunkDiskMapperRef) (err error) {
+	// Track number of samples, m-map markers, that referenced a series we don't know about
+	// for error reporting.
+	var unknownRefs, mmapMarkerUnknownRefs atomic.Uint64
+
+	lastSeq, lastOff := lastMmapRef.Unpack()
+	// Start workers that each process samples for a partition of the series ID space.
+	var (
+		wg         sync.WaitGroup
+		n          = runtime.GOMAXPROCS(0)
+		processors = make([]wblSubsetProcessor, n)
+
+		dec    record.Decoder
+		shards = make([][]record.RefSample, n)
+
+		decodedCh   = make(chan interface{}, 10)
+		decodeErr   error
+		samplesPool = sync.Pool{
+			New: func() interface{} {
+				return []record.RefSample{}
+			},
+		}
+		markersPool = sync.Pool{
+			New: func() interface{} {
+				return []record.RefMmapMarker{}
+			},
+		}
+	)
+
+	defer func() {
+		// For CorruptionErr ensure to terminate all workers before exiting.
+		// We also wrap it to identify OOO WBL corruption.
+		_, ok := err.(*wal.CorruptionErr)
+		if ok {
+			err = &errLoadWbl{err: err}
+			for i := 0; i < n; i++ {
+				processors[i].closeAndDrain()
+			}
+			wg.Wait()
+		}
+	}()
+
+	wg.Add(n)
+	for i := 0; i < n; i++ {
+		processors[i].setup()
+
+		go func(wp *wblSubsetProcessor) {
+			unknown := wp.processWBLSamples(h)
+			unknownRefs.Add(unknown)
+			wg.Done()
+		}(&processors[i])
+	}
+
+	go func() {
+		defer close(decodedCh)
+		for r.Next() {
+			rec := r.Record()
+			switch dec.Type(rec) {
+			case record.Samples:
+				samples := samplesPool.Get().([]record.RefSample)[:0]
+				samples, err = dec.Samples(rec, samples)
+				if err != nil {
+					decodeErr = &wal.CorruptionErr{
+						Err:     errors.Wrap(err, "decode samples"),
+						Segment: r.Segment(),
+						Offset:  r.Offset(),
+					}
+					return
+				}
+				decodedCh <- samples
+			case record.MmapMarkers:
+				markers := markersPool.Get().([]record.RefMmapMarker)[:0]
+				markers, err = dec.MmapMarkers(rec, markers)
+				if err != nil {
+					decodeErr = &wal.CorruptionErr{
+						Err:     errors.Wrap(err, "decode mmap markers"),
+						Segment: r.Segment(),
+						Offset:  r.Offset(),
+					}
+					return
+				}
+				decodedCh <- markers
+			default:
+				// Noop.
+			}
+		}
+	}()
+
+	// The records are always replayed from the oldest to the newest.
+	for d := range decodedCh {
+		switch v := d.(type) {
+		case []record.RefSample:
+			samples := v
+			// We split up the samples into parts of 5000 samples or less.
+			// With O(300 * #cores) in-flight sample batches, large scrapes could otherwise
+			// cause thousands of very large in flight buffers occupying large amounts
+			// of unused memory.
+			for len(samples) > 0 {
+				m := 5000
+				if len(samples) < m {
+					m = len(samples)
+				}
+				for i := 0; i < n; i++ {
+					shards[i] = processors[i].reuseBuf()
+				}
+				for _, sam := range samples[:m] {
+					if r, ok := multiRef[sam.Ref]; ok {
+						sam.Ref = r
+					}
+					mod := uint64(sam.Ref) % uint64(n)
+					shards[mod] = append(shards[mod], sam)
+				}
+				for i := 0; i < n; i++ {
+					processors[i].input <- shards[i]
+				}
+				samples = samples[m:]
+			}
+			//nolint:staticcheck // Ignore SA6002 relax staticcheck verification.
+			samplesPool.Put(d)
+		case []record.RefMmapMarker:
+			markers := v
+			for _, rm := range markers {
+				seq, off := rm.MmapRef.Unpack()
+				if seq > lastSeq || (seq == lastSeq && off > lastOff) {
+					// This m-map chunk from markers was not present during
+					// the load of mmapped chunks that happened in the head
+					// initialization.
+					continue
+				}
+
+				if r, ok := multiRef[rm.Ref]; ok {
+					rm.Ref = r
+				}
+
+				ms := h.series.getByID(rm.Ref)
+				if ms == nil {
+					mmapMarkerUnknownRefs.Inc()
+					continue
+				}
+				idx := uint64(ms.ref) % uint64(n)
+				// It is possible that some old sample is being processed in processWALSamples that
+				// could cause race below. So we wait for the goroutine to empty input the buffer and finish
+				// processing all old samples after emptying the buffer.
+				processors[idx].waitUntilIdle()
+				// Lock the subset so we can modify the series object
+				processors[idx].mx.Lock()
+
+				// All samples till now have been m-mapped. Hence clear out the headChunk.
+				// In case some samples slipped through and went into m-map chunks because of changed
+				// chunk size parameters, we are not taking care of that here.
+				// TODO(codesome): see if there is a way to avoid duplicate m-map chunks if
+				// the size of ooo chunk was reduced between restart.
+				ms.oooHeadChunk = nil
+
+				processors[idx].mx.Unlock()
+			}
+		default:
+			panic(fmt.Errorf("unexpected decodedCh type: %T", d))
+		}
+	}
+
+	if decodeErr != nil {
+		return decodeErr
+	}
+
+	// Signal termination to each worker and wait for it to close its output channel.
+	for i := 0; i < n; i++ {
+		processors[i].closeAndDrain()
+	}
+	wg.Wait()
+
+	if r.Err() != nil {
+		return errors.Wrap(r.Err(), "read records")
+	}
+
+	if unknownRefs.Load() > 0 || mmapMarkerUnknownRefs.Load() > 0 {
+		level.Warn(h.logger).Log("msg", "Unknown series references for ooo WAL replay", "samples", unknownRefs.Load(), "mmap_markers", mmapMarkerUnknownRefs.Load())
+	}
+	return nil
+}
+
+type errLoadWbl struct {
+	err error
+}
+
+func (e errLoadWbl) Error() string {
+	return e.err.Error()
+}
+
+// To support errors.Cause().
+func (e errLoadWbl) Cause() error {
+	return e.err
+}
+
+// To support errors.Unwrap().
+func (e errLoadWbl) Unwrap() error {
+	return e.err
+}
+
+// isErrLoadOOOWal returns a boolean if the error is errLoadWbl.
+func isErrLoadOOOWal(err error) bool {
+	_, ok := err.(*errLoadWbl)
+	return ok
+}
+
+type wblSubsetProcessor struct {
+	mx     sync.Mutex // Take this lock while modifying series in the subset.
+	input  chan []record.RefSample
+	output chan []record.RefSample
+}
+
+func (wp *wblSubsetProcessor) setup() {
+	wp.output = make(chan []record.RefSample, 300)
+	wp.input = make(chan []record.RefSample, 300)
+}
+
+func (wp *wblSubsetProcessor) closeAndDrain() {
+	close(wp.input)
+	for range wp.output {
+	}
+}
+
+// If there is a buffer in the output chan, return it for reuse, otherwise return nil.
+func (wp *wblSubsetProcessor) reuseBuf() []record.RefSample {
+	select {
+	case buf := <-wp.output:
+		return buf[:0]
+	default:
+	}
+	return nil
+}
+
+// processWBLSamples adds the samples it receives to the head and passes
+// the buffer received to an output channel for reuse.
+// Samples before the minValidTime timestamp are discarded.
+func (wp *wblSubsetProcessor) processWBLSamples(h *Head) (unknownRefs uint64) {
+	defer close(wp.output)
+
+	// We don't check for minValidTime for ooo samples.
+	mint, maxt := int64(math.MaxInt64), int64(math.MinInt64)
+	for samples := range wp.input {
+		wp.mx.Lock()
+		for _, s := range samples {
+			ms := h.series.getByID(s.Ref)
+			if ms == nil {
+				unknownRefs++
+				continue
+			}
+			ok, chunkCreated, _ := ms.insert(s.T, s.V, h.chunkDiskMapper)
+			if chunkCreated {
+				h.metrics.chunksCreated.Inc()
+				h.metrics.chunks.Inc()
+			}
+			if ok {
+				if s.T < mint {
+					mint = s.T
+				}
+				if s.T > maxt {
+					maxt = s.T
+				}
+			}
+		}
+		wp.mx.Unlock()
+		wp.output <- samples
+	}
+
+	h.updateMinOOOMaxOOOTime(mint, maxt)
+
+	return unknownRefs
+}
+
+func (wp *wblSubsetProcessor) waitUntilIdle() {
+	select {
+	case <-wp.output: // Allow output side to drain to avoid deadlock.
+	default:
+	}
+	wp.input <- []record.RefSample{}
+	for len(wp.input) != 0 {
+		time.Sleep(10 * time.Microsecond)
+		select {
+		case <-wp.output: // Allow output side to drain to avoid deadlock.
+		default:
+		}
+	}
+}
+
 const (
 	chunkSnapshotRecordTypeSeries     uint8 = 1
 	chunkSnapshotRecordTypeTombstones uint8 = 2
--- a/tsdb/ooo_head.go
+++ b/tsdb/ooo_head.go
@ -0,0 +1,159 @@
+// Copyright 2022 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"fmt"
+	"sort"
+
+	"github.com/prometheus/prometheus/tsdb/chunkenc"
+	"github.com/prometheus/prometheus/tsdb/tombstones"
+)
+
+// OOOChunk maintains samples in time-ascending order.
+// Inserts for timestamps already seen, are dropped.
+// Samples are stored uncompressed to allow easy sorting.
+// Perhaps we can be more efficient later.
+type OOOChunk struct {
+	samples []sample
+}
+
+func NewOOOChunk() *OOOChunk {
+	return &OOOChunk{samples: make([]sample, 0, 4)}
+}
+
+// Insert inserts the sample such that order is maintained.
+// Returns false if insert was not possible due to the same timestamp already existing.
+func (o *OOOChunk) Insert(t int64, v float64) bool {
+	// Find index of sample we should replace.
+	i := sort.Search(len(o.samples), func(i int) bool { return o.samples[i].t >= t })
+
+	if i >= len(o.samples) {
+		// none found. append it at the end
+		o.samples = append(o.samples, sample{t, v})
+		return true
+	}
+
+	if o.samples[i].t == t {
+		return false
+	}
+
+	// Expand length by 1 to make room. use a zero sample, we will overwrite it anyway.
+	o.samples = append(o.samples, sample{})
+	copy(o.samples[i+1:], o.samples[i:])
+	o.samples[i] = sample{t, v}
+
+	return true
+}
+
+func (o *OOOChunk) NumSamples() int {
+	return len(o.samples)
+}
+
+func (o *OOOChunk) ToXOR() (*chunkenc.XORChunk, error) {
+	x := chunkenc.NewXORChunk()
+	app, err := x.Appender()
+	if err != nil {
+		return nil, err
+	}
+	for _, s := range o.samples {
+		app.Append(s.t, s.v)
+	}
+	return x, nil
+}
+
+func (o *OOOChunk) ToXORBetweenTimestamps(mint, maxt int64) (*chunkenc.XORChunk, error) {
+	x := chunkenc.NewXORChunk()
+	app, err := x.Appender()
+	if err != nil {
+		return nil, err
+	}
+	for _, s := range o.samples {
+		if s.t < mint {
+			continue
+		}
+		if s.t > maxt {
+			break
+		}
+		app.Append(s.t, s.v)
+	}
+	return x, nil
+}
+
+var _ BlockReader = &OOORangeHead{}
+
+// OOORangeHead allows querying Head out of order samples via BlockReader
+// interface implementation.
+type OOORangeHead struct {
+	head *Head
+	// mint and maxt are tracked because when a query is handled we only want
+	// the timerange of the query and having preexisting pointers to the first
+	// and last timestamp help with that.
+	mint, maxt int64
+}
+
+func NewOOORangeHead(head *Head, mint, maxt int64) *OOORangeHead {
+	return &OOORangeHead{
+		head: head,
+		mint: mint,
+		maxt: maxt,
+	}
+}
+
+func (oh *OOORangeHead) Index() (IndexReader, error) {
+	return NewOOOHeadIndexReader(oh.head, oh.mint, oh.maxt), nil
+}
+
+func (oh *OOORangeHead) Chunks() (ChunkReader, error) {
+	return NewOOOHeadChunkReader(oh.head, oh.mint, oh.maxt), nil
+}
+
+func (oh *OOORangeHead) Tombstones() (tombstones.Reader, error) {
+	// As stated in the design doc https://docs.google.com/document/d/1Kppm7qL9C-BJB1j6yb6-9ObG3AbdZnFUBYPNNWwDBYM/edit?usp=sharing
+	// Tombstones are not supported for out of order metrics.
+	return tombstones.NewMemTombstones(), nil
+}
+
+func (oh *OOORangeHead) Meta() BlockMeta {
+	var id [16]byte
+	copy(id[:], "____ooo_head____")
+	return BlockMeta{
+		MinTime: oh.mint,
+		MaxTime: oh.maxt,
+		ULID:    id,
+		Stats: BlockStats{
+			NumSeries: oh.head.NumSeries(),
+		},
+	}
+}
+
+// Size returns the size taken by the Head block.
+func (oh *OOORangeHead) Size() int64 {
+	return oh.head.Size()
+}
+
+// String returns an human readable representation of the out of order range
+// head. It's important to keep this function in order to avoid the struct dump
+// when the head is stringified in errors or logs.
+func (oh *OOORangeHead) String() string {
+	return fmt.Sprintf("ooo range head (mint: %d, maxt: %d)", oh.MinTime(), oh.MaxTime())
+}
+
+func (oh *OOORangeHead) MinTime() int64 {
+	return oh.mint
+}
+
+func (oh *OOORangeHead) MaxTime() int64 {
+	return oh.maxt
+}
--- a/tsdb/ooo_head_read.go
+++ b/tsdb/ooo_head_read.go
@ -0,0 +1,433 @@
+// Copyright 2022 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"errors"
+	"math"
+	"sort"
+
+	"github.com/prometheus/prometheus/model/labels"
+	"github.com/prometheus/prometheus/storage"
+	"github.com/prometheus/prometheus/tsdb/chunkenc"
+	"github.com/prometheus/prometheus/tsdb/chunks"
+	"github.com/prometheus/prometheus/tsdb/index"
+	"github.com/prometheus/prometheus/tsdb/tombstones"
+)
+
+var _ IndexReader = &OOOHeadIndexReader{}
+
+// OOOHeadIndexReader implements IndexReader so ooo samples in the head can be
+// accessed.
+// It also has a reference to headIndexReader so we can leverage on its
+// IndexReader implementation for all the methods that remain the same. We
+// decided to do this to avoid code duplication.
+// The only methods that change are the ones about getting Series and Postings.
+type OOOHeadIndexReader struct {
+	*headIndexReader // A reference to the headIndexReader so we can reuse as many interface implementation as possible.
+}
+
+func NewOOOHeadIndexReader(head *Head, mint, maxt int64) *OOOHeadIndexReader {
+	hr := &headIndexReader{
+		head: head,
+		mint: mint,
+		maxt: maxt,
+	}
+	return &OOOHeadIndexReader{hr}
+}
+
+func (oh *OOOHeadIndexReader) Series(ref storage.SeriesRef, lbls *labels.Labels, chks *[]chunks.Meta) error {
+	return oh.series(ref, lbls, chks, 0)
+}
+
+// The passed lastMmapRef tells upto what max m-map chunk that we can consider.
+// If it is 0, it means all chunks need to be considered.
+// If it is non-0, then the oooHeadChunk must not be considered.
+func (oh *OOOHeadIndexReader) series(ref storage.SeriesRef, lbls *labels.Labels, chks *[]chunks.Meta, lastMmapRef chunks.ChunkDiskMapperRef) error {
+	s := oh.head.series.getByID(chunks.HeadSeriesRef(ref))
+
+	if s == nil {
+		oh.head.metrics.seriesNotFound.Inc()
+		return storage.ErrNotFound
+	}
+	*lbls = append((*lbls)[:0], s.lset...)
+
+	if chks == nil {
+		return nil
+	}
+
+	s.Lock()
+	defer s.Unlock()
+	*chks = (*chks)[:0]
+
+	tmpChks := make([]chunks.Meta, 0, len(s.oooMmappedChunks))
+
+	// We define these markers to track the last chunk reference while we
+	// fill the chunk meta.
+	// These markers are useful to give consistent responses to repeated queries
+	// even if new chunks that might be overlapping or not are added afterwards.
+	// Also, lastMinT and lastMaxT are initialized to the max int as a sentinel
+	// value to know they are unset.
+	var lastChunkRef chunks.ChunkRef
+	lastMinT, lastMaxT := int64(math.MaxInt64), int64(math.MaxInt64)
+
+	addChunk := func(minT, maxT int64, ref chunks.ChunkRef) {
+		// the first time we get called is for the last included chunk.
+		// set the markers accordingly
+		if lastMinT == int64(math.MaxInt64) {
+			lastChunkRef = ref
+			lastMinT = minT
+			lastMaxT = maxT
+		}
+
+		tmpChks = append(tmpChks, chunks.Meta{
+			MinTime:        minT,
+			MaxTime:        maxT,
+			Ref:            ref,
+			OOOLastRef:     lastChunkRef,
+			OOOLastMinTime: lastMinT,
+			OOOLastMaxTime: lastMaxT,
+		})
+	}
+
+	// Collect all chunks that overlap the query range, in order from most recent to most old,
+	// so we can set the correct markers.
+	if s.oooHeadChunk != nil {
+		c := s.oooHeadChunk
+		if c.OverlapsClosedInterval(oh.mint, oh.maxt) && lastMmapRef == 0 {
+			ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.oooMmappedChunks))))
+			addChunk(c.minTime, c.maxTime, ref)
+		}
+	}
+	for i := len(s.oooMmappedChunks) - 1; i >= 0; i-- {
+		c := s.oooMmappedChunks[i]
+		if c.OverlapsClosedInterval(oh.mint, oh.maxt) && (lastMmapRef == 0 || lastMmapRef.GreaterThanOrEqualTo(c.ref)) {
+			ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(i)))
+			addChunk(c.minTime, c.maxTime, ref)
+		}
+	}
+
+	// There is nothing to do if we did not collect any chunk
+	if len(tmpChks) == 0 {
+		return nil
+	}
+
+	// Next we want to sort all the collected chunks by min time so we can find
+	// those that overlap.
+	sort.Sort(metaByMinTimeAndMinRef(tmpChks))
+
+	// Next we want to iterate the sorted collected chunks and only return the
+	// chunks Meta the first chunk that overlaps with others.
+	// Example chunks of a series: 5:(100, 200) 6:(500, 600) 7:(150, 250) 8:(550, 650)
+	// In the example 5 overlaps with 7 and 6 overlaps with 8 so we only want to
+	// to return chunk Metas for chunk 5 and chunk 6
+	*chks = append(*chks, tmpChks[0])
+	maxTime := tmpChks[0].MaxTime // tracks the maxTime of the previous "to be merged chunk"
+	for _, c := range tmpChks[1:] {
+		if c.MinTime > maxTime {
+			*chks = append(*chks, c)
+			maxTime = c.MaxTime
+		} else if c.MaxTime > maxTime {
+			maxTime = c.MaxTime
+			(*chks)[len(*chks)-1].MaxTime = c.MaxTime
+		}
+	}
+
+	return nil
+}
+
+// LabelValues needs to be overridden from the headIndexReader implementation due
+// to the check that happens at the beginning where we make sure that the query
+// interval overlaps with the head minooot and maxooot.
+func (oh *OOOHeadIndexReader) LabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
+	if oh.maxt < oh.head.MinOOOTime() || oh.mint > oh.head.MaxOOOTime() {
+		return []string{}, nil
+	}
+
+	if len(matchers) == 0 {
+		return oh.head.postings.LabelValues(name), nil
+	}
+
+	return labelValuesWithMatchers(oh, name, matchers...)
+}
+
+type chunkMetaAndChunkDiskMapperRef struct {
+	meta     chunks.Meta
+	ref      chunks.ChunkDiskMapperRef
+	origMinT int64
+	origMaxT int64
+}
+
+type byMinTimeAndMinRef []chunkMetaAndChunkDiskMapperRef
+
+func (b byMinTimeAndMinRef) Len() int { return len(b) }
+func (b byMinTimeAndMinRef) Less(i, j int) bool {
+	if b[i].meta.MinTime == b[j].meta.MinTime {
+		return b[i].meta.Ref < b[j].meta.Ref
+	}
+	return b[i].meta.MinTime < b[j].meta.MinTime
+}
+
+func (b byMinTimeAndMinRef) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
+
+type metaByMinTimeAndMinRef []chunks.Meta
+
+func (b metaByMinTimeAndMinRef) Len() int { return len(b) }
+func (b metaByMinTimeAndMinRef) Less(i, j int) bool {
+	if b[i].MinTime == b[j].MinTime {
+		return b[i].Ref < b[j].Ref
+	}
+	return b[i].MinTime < b[j].MinTime
+}
+
+func (b metaByMinTimeAndMinRef) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
+
+func (oh *OOOHeadIndexReader) Postings(name string, values ...string) (index.Postings, error) {
+	switch len(values) {
+	case 0:
+		return index.EmptyPostings(), nil
+	case 1:
+		return oh.head.postings.Get(name, values[0]), nil // TODO(ganesh) Also call GetOOOPostings
+	default:
+		// TODO(ganesh) We want to only return postings for out of order series.
+		res := make([]index.Postings, 0, len(values))
+		for _, value := range values {
+			res = append(res, oh.head.postings.Get(name, value)) // TODO(ganesh) Also call GetOOOPostings
+		}
+		return index.Merge(res...), nil
+	}
+}
+
+type OOOHeadChunkReader struct {
+	head       *Head
+	mint, maxt int64
+}
+
+func NewOOOHeadChunkReader(head *Head, mint, maxt int64) *OOOHeadChunkReader {
+	return &OOOHeadChunkReader{
+		head: head,
+		mint: mint,
+		maxt: maxt,
+	}
+}
+
+func (cr OOOHeadChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
+	sid, _ := chunks.HeadChunkRef(meta.Ref).Unpack()
+
+	s := cr.head.series.getByID(sid)
+	// This means that the series has been garbage collected.
+	if s == nil {
+		return nil, storage.ErrNotFound
+	}
+
+	s.Lock()
+	c, err := s.oooMergedChunk(meta, cr.head.chunkDiskMapper, cr.mint, cr.maxt)
+	s.Unlock()
+	if err != nil {
+		return nil, err
+	}
+
+	// This means that the query range did not overlap with the requested chunk.
+	if len(c.chunks) == 0 {
+		return nil, storage.ErrNotFound
+	}
+
+	return c, nil
+}
+
+func (cr OOOHeadChunkReader) Close() error {
+	return nil
+}
+
+type OOOCompactionHead struct {
+	oooIR       *OOOHeadIndexReader
+	lastMmapRef chunks.ChunkDiskMapperRef
+	lastWBLFile int
+	postings    []storage.SeriesRef
+	chunkRange  int64
+	mint, maxt  int64 // Among all the compactable chunks.
+}
+
+// NewOOOCompactionHead does the following:
+// 1. M-maps all the in-memory ooo chunks.
+// 2. Compute the expected block ranges while iterating through all ooo series and store it.
+// 3. Store the list of postings having ooo series.
+// 4. Cuts a new WBL file for the OOO WBL.
+// All the above together have a bit of CPU and memory overhead, and can have a bit of impact
+// on the sample append latency. So call NewOOOCompactionHead only right before compaction.
+func NewOOOCompactionHead(head *Head) (*OOOCompactionHead, error) {
+	newWBLFile, err := head.wbl.NextSegmentSync()
+	if err != nil {
+		return nil, err
+	}
+
+	ch := &OOOCompactionHead{
+		chunkRange:  head.chunkRange.Load(),
+		mint:        math.MaxInt64,
+		maxt:        math.MinInt64,
+		lastWBLFile: newWBLFile,
+	}
+
+	ch.oooIR = NewOOOHeadIndexReader(head, math.MinInt64, math.MaxInt64)
+	n, v := index.AllPostingsKey()
+
+	// TODO: verify this gets only ooo samples.
+	p, err := ch.oooIR.Postings(n, v)
+	if err != nil {
+		return nil, err
+	}
+	p = ch.oooIR.SortedPostings(p)
+
+	var lastSeq, lastOff int
+	for p.Next() {
+		seriesRef := p.At()
+		ms := head.series.getByID(chunks.HeadSeriesRef(seriesRef))
+		if ms == nil {
+			continue
+		}
+
+		// M-map the in-memory chunk and keep track of the last one.
+		// Also build the block ranges -> series map.
+		// TODO: consider having a lock specifically for ooo data.
+		ms.Lock()
+
+		mmapRef := ms.mmapCurrentOOOHeadChunk(head.chunkDiskMapper)
+		if mmapRef == 0 && len(ms.oooMmappedChunks) > 0 {
+			// Nothing was m-mapped. So take the mmapRef from the existing slice if it exists.
+			mmapRef = ms.oooMmappedChunks[len(ms.oooMmappedChunks)-1].ref
+		}
+		seq, off := mmapRef.Unpack()
+		if seq > lastSeq || (seq == lastSeq && off > lastOff) {
+			ch.lastMmapRef, lastSeq, lastOff = mmapRef, seq, off
+		}
+		if len(ms.oooMmappedChunks) > 0 {
+			ch.postings = append(ch.postings, seriesRef)
+			for _, c := range ms.oooMmappedChunks {
+				if c.minTime < ch.mint {
+					ch.mint = c.minTime
+				}
+				if c.maxTime > ch.maxt {
+					ch.maxt = c.maxTime
+				}
+			}
+		}
+		ms.Unlock()
+	}
+
+	return ch, nil
+}
+
+func (ch *OOOCompactionHead) Index() (IndexReader, error) {
+	return NewOOOCompactionHeadIndexReader(ch), nil
+}
+
+func (ch *OOOCompactionHead) Chunks() (ChunkReader, error) {
+	return NewOOOHeadChunkReader(ch.oooIR.head, ch.oooIR.mint, ch.oooIR.maxt), nil
+}
+
+func (ch *OOOCompactionHead) Tombstones() (tombstones.Reader, error) {
+	return tombstones.NewMemTombstones(), nil
+}
+
+func (ch *OOOCompactionHead) Meta() BlockMeta {
+	var id [16]byte
+	copy(id[:], "copy(id[:], \"ooo_compact_head\")")
+	return BlockMeta{
+		MinTime: ch.mint,
+		MaxTime: ch.maxt,
+		ULID:    id,
+		Stats: BlockStats{
+			NumSeries: uint64(len(ch.postings)),
+		},
+	}
+}
+
+// CloneForTimeRange clones the OOOCompactionHead such that the IndexReader and ChunkReader
+// obtained from this only looks at the m-map chunks within the given time ranges while not looking
+// beyond the ch.lastMmapRef.
+// Only the method of BlockReader interface are valid for the cloned OOOCompactionHead.
+func (ch *OOOCompactionHead) CloneForTimeRange(mint, maxt int64) *OOOCompactionHead {
+	return &OOOCompactionHead{
+		oooIR:       NewOOOHeadIndexReader(ch.oooIR.head, mint, maxt),
+		lastMmapRef: ch.lastMmapRef,
+		postings:    ch.postings,
+		chunkRange:  ch.chunkRange,
+		mint:        ch.mint,
+		maxt:        ch.maxt,
+	}
+}
+
+func (ch *OOOCompactionHead) Size() int64                            { return 0 }
+func (ch *OOOCompactionHead) MinTime() int64                         { return ch.mint }
+func (ch *OOOCompactionHead) MaxTime() int64                         { return ch.maxt }
+func (ch *OOOCompactionHead) ChunkRange() int64                      { return ch.chunkRange }
+func (ch *OOOCompactionHead) LastMmapRef() chunks.ChunkDiskMapperRef { return ch.lastMmapRef }
+func (ch *OOOCompactionHead) LastWBLFile() int                       { return ch.lastWBLFile }
+
+type OOOCompactionHeadIndexReader struct {
+	ch *OOOCompactionHead
+}
+
+func NewOOOCompactionHeadIndexReader(ch *OOOCompactionHead) IndexReader {
+	return &OOOCompactionHeadIndexReader{ch: ch}
+}
+
+func (ir *OOOCompactionHeadIndexReader) Symbols() index.StringIter {
+	return ir.ch.oooIR.Symbols()
+}
+
+func (ir *OOOCompactionHeadIndexReader) Postings(name string, values ...string) (index.Postings, error) {
+	n, v := index.AllPostingsKey()
+	if name != n || len(values) != 1 || values[0] != v {
+		return nil, errors.New("only AllPostingsKey is supported")
+	}
+	return index.NewListPostings(ir.ch.postings), nil
+}
+
+func (ir *OOOCompactionHeadIndexReader) SortedPostings(p index.Postings) index.Postings {
+	// This will already be sorted from the Postings() call above.
+	return p
+}
+
+func (ir *OOOCompactionHeadIndexReader) Series(ref storage.SeriesRef, lset *labels.Labels, chks *[]chunks.Meta) error {
+	return ir.ch.oooIR.series(ref, lset, chks, ir.ch.lastMmapRef)
+}
+
+func (ir *OOOCompactionHeadIndexReader) SortedLabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
+	return nil, errors.New("not implemented")
+}
+
+func (ir *OOOCompactionHeadIndexReader) LabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
+	return nil, errors.New("not implemented")
+}
+
+func (ir *OOOCompactionHeadIndexReader) PostingsForMatchers(concurrent bool, ms ...*labels.Matcher) (index.Postings, error) {
+	return nil, errors.New("not implemented")
+}
+
+func (ir *OOOCompactionHeadIndexReader) LabelNames(matchers ...*labels.Matcher) ([]string, error) {
+	return nil, errors.New("not implemented")
+}
+
+func (ir *OOOCompactionHeadIndexReader) LabelValueFor(id storage.SeriesRef, label string) (string, error) {
+	return "", errors.New("not implemented")
+}
+
+func (ir *OOOCompactionHeadIndexReader) LabelNamesFor(ids ...storage.SeriesRef) ([]string, error) {
+	return nil, errors.New("not implemented")
+}
+
+func (ir *OOOCompactionHeadIndexReader) Close() error {
+	return ir.ch.oooIR.Close()
+}
--- a/tsdb/ooo_head_read_test.go
+++ b/tsdb/ooo_head_read_test.go
--- a/tsdb/ooo_head_test.go
+++ b/tsdb/ooo_head_test.go
@ -0,0 +1,93 @@
+// Copyright 2022 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+const testMaxSize int = 32
+
+// Formulas chosen to make testing easy:
+func valEven(pos int) int { return pos*2 + 2 } // s[0]=2, s[1]=4, s[2]=6, ..., s[31]=64 - Predictable pre-existing values
+func valOdd(pos int) int  { return pos*2 + 1 } // s[0]=1, s[1]=3, s[2]=5, ..., s[31]=63 - New values will interject at chosen position because they sort before the pre-existing vals.
+
+func samplify(v int) sample { return sample{int64(v), float64(v)} }
+
+func makeEvenSampleSlice(n int) []sample {
+	s := make([]sample, n)
+	for i := 0; i < n; i++ {
+		s[i] = samplify(valEven(i))
+	}
+	return s
+}
+
+// TestOOOInsert tests the following cases:
+// - Number of pre-existing samples anywhere from 0 to testMaxSize-1.
+// - Insert new sample before first pre-existing samples, after the last, and anywhere in between.
+// - With a chunk initial capacity of testMaxSize/8 and testMaxSize, which lets us test non-full and full chunks, and chunks that need to expand themselves.
+// Note: In all samples used, t always equals v in numeric value. when we talk about 'value' we just refer to a value that will be used for both sample.t and sample.v.
+func TestOOOInsert(t *testing.T) {
+	for numPreExisting := 0; numPreExisting <= testMaxSize; numPreExisting++ {
+		// For example, if we have numPreExisting 2, then:
+		// chunk.samples indexes filled        0   1
+		// chunk.samples with these values     2   4     // valEven
+		// we want to test inserting at index  0   1   2 // insertPos=0..numPreExisting
+		// we can do this by using values      1,  3   5 // valOdd(insertPos)
+
+		for insertPos := 0; insertPos <= numPreExisting; insertPos++ {
+			chunk := NewOOOChunk()
+			chunk.samples = makeEvenSampleSlice(numPreExisting)
+			newSample := samplify(valOdd(insertPos))
+			chunk.Insert(newSample.t, newSample.v)
+
+			var expSamples []sample
+			// Our expected new samples slice, will be first the original samples.
+			for i := 0; i < insertPos; i++ {
+				expSamples = append(expSamples, samplify(valEven(i)))
+			}
+			// Then the new sample.
+			expSamples = append(expSamples, newSample)
+			// Followed by any original samples that were pushed back by the new one.
+			for i := insertPos; i < numPreExisting; i++ {
+				expSamples = append(expSamples, samplify(valEven(i)))
+			}
+
+			require.Equal(t, expSamples, chunk.samples, "numPreExisting %d, insertPos %d", numPreExisting, insertPos)
+		}
+	}
+}
+
+// TestOOOInsertDuplicate tests the correct behavior when inserting a sample that is a duplicate of any
+// pre-existing samples, with between 1 and testMaxSize pre-existing samples and
+// with a chunk initial capacity of testMaxSize/8 and testMaxSize, which lets us test non-full and full chunks, and chunks that need to expand themselves.
+func TestOOOInsertDuplicate(t *testing.T) {
+	for num := 1; num <= testMaxSize; num++ {
+		for dupPos := 0; dupPos < num; dupPos++ {
+			chunk := NewOOOChunk()
+			chunk.samples = makeEvenSampleSlice(num)
+
+			dupSample := chunk.samples[dupPos]
+			dupSample.v = 0.123
+
+			ok := chunk.Insert(dupSample.t, dupSample.v)
+
+			expSamples := makeEvenSampleSlice(num) // We expect no change.
+			require.False(t, ok)
+			require.Equal(t, expSamples, chunk.samples, "num %d, dupPos %d", num, dupPos)
+		}
+	}
+}
--- a/tsdb/querier.go
+++ b/tsdb/querier.go
@ -569,7 +569,7 @@ func (p *populateWithDelGenericSeriesIterator) next() bool {
 	p.i++
 	p.currChkMeta = p.chks[p.i]

-	p.currChkMeta.Chunk, p.err = p.chunks.Chunk(p.currChkMeta.Ref)
+	p.currChkMeta.Chunk, p.err = p.chunks.Chunk(p.currChkMeta)
 	if p.err != nil {
 		p.err = errors.Wrapf(p.err, "cannot populate chunk %d", p.currChkMeta.Ref)
 		return false
@ -898,7 +898,7 @@ func newNopChunkReader() ChunkReader {
 	}
 }

-func (cr nopChunkReader) Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error) {
+func (cr nopChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
 	return cr.emptyChunk, nil
 }

--- a/tsdb/querier_bench_test.go
+++ b/tsdb/querier_bench_test.go
@ -34,7 +34,7 @@ func BenchmarkQuerier(b *testing.B) {
 	opts := DefaultHeadOptions()
 	opts.ChunkRange = 1000
 	opts.ChunkDirRoot = chunkDir
-	h, err := NewHead(nil, nil, nil, opts, nil)
+	h, err := NewHead(nil, nil, nil, nil, opts, nil)
 	require.NoError(b, err)
 	defer func() {
 		require.NoError(b, h.Close())
@ -180,7 +180,7 @@ func BenchmarkQuerierSelect(b *testing.B) {
 	opts := DefaultHeadOptions()
 	opts.ChunkRange = 1000
 	opts.ChunkDirRoot = chunkDir
-	h, err := NewHead(nil, nil, nil, opts, nil)
+	h, err := NewHead(nil, nil, nil, nil, opts, nil)
 	require.NoError(b, err)
 	defer h.Close()
 	app := h.Appender(context.Background())
--- a/tsdb/querier_test.go
+++ b/tsdb/querier_test.go
@ -458,7 +458,7 @@ func TestBlockQuerier_AgainstHeadWithOpenChunks(t *testing.T) {
 		t.Run("", func(t *testing.T) {
 			opts := DefaultHeadOptions()
 			opts.ChunkRange = 2 * time.Hour.Milliseconds()
-			h, err := NewHead(nil, nil, nil, opts, nil)
+			h, err := NewHead(nil, nil, nil, nil, opts, nil)
 			require.NoError(t, err)
 			defer h.Close()

@ -627,10 +627,10 @@ func createFakeReaderAndNotPopulatedChunks(s ...[]tsdbutil.Sample) (*fakeChunksR
 	return f, chks
 }

-func (r *fakeChunksReader) Chunk(ref chunks.ChunkRef) (chunkenc.Chunk, error) {
-	chk, ok := r.chks[ref]
+func (r *fakeChunksReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
+	chk, ok := r.chks[meta.Ref]
 	if !ok {
-		return nil, errors.Errorf("chunk not found at ref %v", ref)
+		return nil, errors.Errorf("chunk not found at ref %v", meta.Ref)
 	}
 	return chk, nil
 }
@ -1016,8 +1016,8 @@ func BenchmarkMergedSeriesSet(b *testing.B) {

 type mockChunkReader map[chunks.ChunkRef]chunkenc.Chunk

-func (cr mockChunkReader) Chunk(id chunks.ChunkRef) (chunkenc.Chunk, error) {
-	chk, ok := cr[id]
+func (cr mockChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
+	chk, ok := cr[meta.Ref]
 	if ok {
 		return chk, nil
 	}
@ -1643,7 +1643,7 @@ func TestPostingsForMatchers(t *testing.T) {
 	opts := DefaultHeadOptions()
 	opts.ChunkRange = 1000
 	opts.ChunkDirRoot = chunkDir
-	h, err := NewHead(nil, nil, nil, opts, nil)
+	h, err := NewHead(nil, nil, nil, nil, opts, nil)
 	require.NoError(t, err)
 	defer func() {
 		require.NoError(t, h.Close())
@ -1944,13 +1944,17 @@ func BenchmarkQueries(b *testing.B) {
 		},
 	}

-	queryTypes := make(map[string]storage.Querier)
+	type qt struct {
+		typ     string
+		querier storage.Querier
+	}
+	var queryTypes []qt // We use a slice instead of map to keep the order of test cases consistent.
 	defer func() {
 		for _, q := range queryTypes {
 			// Can't run a check for error here as some of these will fail as
 			// queryTypes is using the same slice for the different block queriers
 			// and would have been closed in the previous iteration.
-			q.Close()
+			q.querier.Close()
 		}
 	}()

@ -1991,21 +1995,38 @@ func BenchmarkQueries(b *testing.B) {
 					qs = append(qs, q)
 				}

-				queryTypes["_1-Block"] = storage.NewMergeQuerier(qs[:1], nil, storage.ChainedSeriesMerge)
-				queryTypes["_3-Blocks"] = storage.NewMergeQuerier(qs[0:3], nil, storage.ChainedSeriesMerge)
-				queryTypes["_10-Blocks"] = storage.NewMergeQuerier(qs, nil, storage.ChainedSeriesMerge)
+				queryTypes = append(queryTypes, qt{"_1-Block", storage.NewMergeQuerier(qs[:1], nil, storage.ChainedSeriesMerge)})
+				queryTypes = append(queryTypes, qt{"_3-Blocks", storage.NewMergeQuerier(qs[0:3], nil, storage.ChainedSeriesMerge)})
+				queryTypes = append(queryTypes, qt{"_10-Blocks", storage.NewMergeQuerier(qs, nil, storage.ChainedSeriesMerge)})

 				chunkDir := b.TempDir()
 				head := createHead(b, nil, series, chunkDir)
-				qHead, err := NewBlockQuerier(head, 1, nSamples)
+				qHead, err := NewBlockQuerier(NewRangeHead(head, 1, nSamples), 1, nSamples)
 				require.NoError(b, err)
-				queryTypes["_Head"] = qHead
+				queryTypes = append(queryTypes, qt{"_Head", qHead})

-				for qtype, querier := range queryTypes {
-					b.Run(title+qtype+"_nSeries:"+strconv.Itoa(nSeries)+"_nSamples:"+strconv.Itoa(int(nSamples)), func(b *testing.B) {
+				for _, oooPercentage := range []int{1, 3, 5, 10} {
+					chunkDir := b.TempDir()
+					totalOOOSamples := oooPercentage * int(nSamples) / 100
+					oooSampleFrequency := int(nSamples) / totalOOOSamples
+					head := createHeadWithOOOSamples(b, nil, series, chunkDir, oooSampleFrequency)
+
+					qHead, err := NewBlockQuerier(NewRangeHead(head, 1, nSamples), 1, nSamples)
+					require.NoError(b, err)
+					qOOOHead, err := NewBlockQuerier(NewOOORangeHead(head, 1, nSamples), 1, nSamples)
+					require.NoError(b, err)
+
+					queryTypes = append(queryTypes, qt{
+						fmt.Sprintf("_Head_oooPercent:%d", oooPercentage),
+						storage.NewMergeQuerier([]storage.Querier{qHead, qOOOHead}, nil, storage.ChainedSeriesMerge),
+					})
+				}
+
+				for _, q := range queryTypes {
+					b.Run(title+q.typ+"_nSeries:"+strconv.Itoa(nSeries)+"_nSamples:"+strconv.Itoa(int(nSamples)), func(b *testing.B) {
 						expExpansions, err := strconv.Atoi(string(title[len(title)-1]))
 						require.NoError(b, err)
-						benchQuery(b, expExpansions, querier, selectors)
+						benchQuery(b, expExpansions, q.querier, selectors)
 					})
 				}
 				require.NoError(b, head.Close())
@ -2025,6 +2046,7 @@ func benchQuery(b *testing.B, expExpansions int, q storage.Querier, selectors la
 			s.Labels()
 			it := s.Iterator()
 			for it.Next() {
+				_, _ = it.At()
 			}
 			actualExpansions++
 		}
--- a/tsdb/record/record.go
+++ b/tsdb/record/record.go
@ -43,6 +43,8 @@ const (
 	Tombstones Type = 3
 	// Exemplars is used to match WAL records of type Exemplars.
 	Exemplars Type = 4
+	// MmapMarkers is used to match OOO WBL records of type MmapMarkers.
+	MmapMarkers Type = 5
 	// Metadata is used to match WAL records of type Metadata.
 	Metadata Type = 6
 )
@ -57,6 +59,8 @@ func (rt Type) String() string {
 		return "exemplars"
 	case Tombstones:
 		return "tombstones"
+	case MmapMarkers:
+		return "mmapmarkers"
 	case Metadata:
 		return "metadata"
 	default:
@ -157,6 +161,12 @@ type RefExemplar struct {
 	Labels labels.Labels
 }

+// RefMmapMarker marks that the all the samples of the given series until now have been m-mapped to disk.
+type RefMmapMarker struct {
+	Ref     chunks.HeadSeriesRef
+	MmapRef chunks.ChunkDiskMapperRef
+}
+
 // Decoder decodes series, sample, metadata and tombstone records.
 // The zero value is ready to use.
 type Decoder struct{}
@ -168,7 +178,7 @@ func (d *Decoder) Type(rec []byte) Type {
 		return Unknown
 	}
 	switch t := Type(rec[0]); t {
-	case Series, Samples, Tombstones, Exemplars, Metadata:
+	case Series, Samples, Tombstones, Exemplars, MmapMarkers, Metadata:
 		return t
 	}
 	return Unknown
@ -354,6 +364,34 @@ func (d *Decoder) ExemplarsFromBuffer(dec *encoding.Decbuf, exemplars []RefExemp
 	return exemplars, nil
 }

+func (d *Decoder) MmapMarkers(rec []byte, markers []RefMmapMarker) ([]RefMmapMarker, error) {
+	dec := encoding.Decbuf{B: rec}
+	t := Type(dec.Byte())
+	if t != MmapMarkers {
+		return nil, errors.New("invalid record type")
+	}
+
+	if dec.Len() == 0 {
+		return markers, nil
+	}
+	for len(dec.B) > 0 && dec.Err() == nil {
+		ref := chunks.HeadSeriesRef(dec.Be64())
+		mmapRef := chunks.ChunkDiskMapperRef(dec.Be64())
+		markers = append(markers, RefMmapMarker{
+			Ref:     ref,
+			MmapRef: mmapRef,
+		})
+	}
+
+	if dec.Err() != nil {
+		return nil, errors.Wrapf(dec.Err(), "decode error after %d mmap markers", len(markers))
+	}
+	if len(dec.B) > 0 {
+		return nil, errors.Errorf("unexpected %d bytes left in entry", len(dec.B))
+	}
+	return markers, nil
+}
+
 // Encoder encodes series, sample, and tombstones records.
 // The zero value is ready to use.
 type Encoder struct{}
@ -467,3 +505,15 @@ func (e *Encoder) EncodeExemplarsIntoBuffer(exemplars []RefExemplar, buf *encodi
 		EncodeLabels(buf, ex.Labels)
 	}
 }
+
+func (e *Encoder) MmapMarkers(markers []RefMmapMarker, b []byte) []byte {
+	buf := encoding.Encbuf{B: b}
+	buf.PutByte(byte(MmapMarkers))
+
+	for _, s := range markers {
+		buf.PutBE64(uint64(s.Ref))
+		buf.PutBE64(uint64(s.MmapRef))
+	}
+
+	return buf.Get()
+}
--- a/tsdb/wal/wal.go
+++ b/tsdb/wal/wal.go
@ -40,6 +40,7 @@ const (
 	DefaultSegmentSize = 128 * 1024 * 1024 // 128 MB
 	pageSize           = 32 * 1024         // 32KB
 	recordHeaderSize   = 7
+	WblDirName         = "wbl"
 )

 // The table gets initialized with sync.Once but may still cause a race
@ -204,32 +205,32 @@ func newWALMetrics(r prometheus.Registerer) *walMetrics {
 	m := &walMetrics{}

 	m.fsyncDuration = prometheus.NewSummary(prometheus.SummaryOpts{
-		Name:       "prometheus_tsdb_wal_fsync_duration_seconds",
+		Name:       "fsync_duration_seconds",
 		Help:       "Duration of WAL fsync.",
 		Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
 	})
 	m.pageFlushes = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_wal_page_flushes_total",
+		Name: "page_flushes_total",
 		Help: "Total number of page flushes.",
 	})
 	m.pageCompletions = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_wal_completed_pages_total",
+		Name: "completed_pages_total",
 		Help: "Total number of completed pages.",
 	})
 	m.truncateFail = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_wal_truncations_failed_total",
+		Name: "truncations_failed_total",
 		Help: "Total number of WAL truncations that failed.",
 	})
 	m.truncateTotal = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_wal_truncations_total",
+		Name: "truncations_total",
 		Help: "Total number of WAL truncations attempted.",
 	})
 	m.currentSegment = prometheus.NewGauge(prometheus.GaugeOpts{
-		Name: "prometheus_tsdb_wal_segment_current",
+		Name: "segment_current",
 		Help: "WAL segment index that TSDB is currently writing to.",
 	})
 	m.writesFailed = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_wal_writes_failed_total",
+		Name: "writes_failed_total",
 		Help: "Total number of WAL writes that failed.",
 	})

@ -274,7 +275,11 @@ func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSi
 		stopc:       make(chan chan struct{}),
 		compress:    compress,
 	}
-	w.metrics = newWALMetrics(reg)
+	prefix := "prometheus_tsdb_wal_"
+	if filepath.Base(dir) == WblDirName {
+		prefix = "prometheus_tsdb_out_of_order_wal_"
+	}
+	w.metrics = newWALMetrics(prometheus.WrapRegistererWithPrefix(prefix, reg))

 	_, last, err := Segments(w.Dir())
 	if err != nil {
@ -459,36 +464,46 @@ func SegmentName(dir string, i int) string {
 	return filepath.Join(dir, fmt.Sprintf("%08d", i))
 }

-// NextSegment creates the next segment and closes the previous one.
-func (w *WAL) NextSegment() error {
+// NextSegment creates the next segment and closes the previous one asynchronously.
+// It returns the file number of the new file.
+func (w *WAL) NextSegment() (int, error) {
 	w.mtx.Lock()
 	defer w.mtx.Unlock()
-	return w.nextSegment()
+	return w.nextSegment(true)
+}
+
+// NextSegmentSync creates the next segment and closes the previous one in sync.
+// It returns the file number of the new file.
+func (w *WAL) NextSegmentSync() (int, error) {
+	w.mtx.Lock()
+	defer w.mtx.Unlock()
+	return w.nextSegment(false)
 }

 // nextSegment creates the next segment and closes the previous one.
-func (w *WAL) nextSegment() error {
+// It returns the file number of the new file.
+func (w *WAL) nextSegment(async bool) (int, error) {
 	if w.closed {
-		return errors.New("wal is closed")
+		return 0, errors.New("wal is closed")
 	}

 	// Only flush the current page if it actually holds data.
 	if w.page.alloc > 0 {
 		if err := w.flushPage(true); err != nil {
-			return err
+			return 0, err
 		}
 	}
 	next, err := CreateSegment(w.Dir(), w.segment.Index()+1)
 	if err != nil {
-		return errors.Wrap(err, "create new segment file")
+		return 0, errors.Wrap(err, "create new segment file")
 	}
 	prev := w.segment
 	if err := w.setSegment(next); err != nil {
-		return err
+		return 0, err
 	}

 	// Don't block further writes by fsyncing the last segment.
-	w.actorc <- func() {
+	f := func() {
 		if err := w.fsync(prev); err != nil {
 			level.Error(w.logger).Log("msg", "sync previous segment", "err", err)
 		}
@ -496,7 +511,12 @@ func (w *WAL) nextSegment() error {
 			level.Error(w.logger).Log("msg", "close previous segment", "err", err)
 		}
 	}
-	return nil
+	if async {
+		w.actorc <- f
+	} else {
+		f()
+	}
+	return next.Index(), nil
 }

 func (w *WAL) setSegment(segment *Segment) error {
@ -638,7 +658,7 @@ func (w *WAL) log(rec []byte, final bool) error {
 	left += (pageSize - recordHeaderSize) * (w.pagesPerSegment() - w.donePages - 1) // Free pages in the active segment.

 	if len(rec) > left {
-		if err := w.nextSegment(); err != nil {
+		if _, err := w.nextSegment(true); err != nil {
 			return err
 		}
 	}
@ -745,6 +765,13 @@ func (w *WAL) fsync(f *Segment) error {
 	return err
 }

+// Sync forces a file sync on the current wal segment. This function is meant
+// to be used only on tests due to different behaviour on Operating Systems
+// like windows and linux
+func (w *WAL) Sync() error {
+	return w.fsync(w.segment)
+}
+
 // Close flushes all writes and closes active segment.
 func (w *WAL) Close() (err error) {
 	w.mtx.Lock()
--- a/tsdb/wal/watcher_test.go
+++ b/tsdb/wal/watcher_test.go
@ -364,14 +364,16 @@ func TestReadCheckpoint(t *testing.T) {
 			err := os.Mkdir(wdir, 0o777)
 			require.NoError(t, err)

-			os.Create(SegmentName(wdir, 30))
+			f, err := os.Create(SegmentName(wdir, 30))
+			require.NoError(t, err)
+			require.NoError(t, f.Close())

 			enc := record.Encoder{}
 			w, err := NewSize(nil, nil, wdir, 128*pageSize, compress)
 			require.NoError(t, err)
-			defer func() {
+			t.Cleanup(func() {
 				require.NoError(t, w.Close())
-			}()
+			})

 			// Write to the initial segment then checkpoint.
 			for i := 0; i < seriesCount; i++ {
@ -396,8 +398,11 @@ func TestReadCheckpoint(t *testing.T) {
 					require.NoError(t, w.Log(sample))
 				}
 			}
-			Checkpoint(log.NewNopLogger(), w, 30, 31, func(x chunks.HeadSeriesRef) bool { return true }, 0)
-			w.Truncate(32)
+			_, err = w.NextSegmentSync()
+			require.NoError(t, err)
+			_, err = Checkpoint(log.NewNopLogger(), w, 30, 31, func(x chunks.HeadSeriesRef) bool { return true }, 0)
+			require.NoError(t, err)
+			require.NoError(t, w.Truncate(32))

 			// Start read after checkpoint, no more data written.
 			_, _, err = Segments(w.Dir())
--- a/web/api/v1/api_test.go
+++ b/web/api/v1/api_test.go
@ -2314,7 +2314,7 @@ func (f *fakeDB) Stats(statsByLabelName string) (_ *tsdb.Stats, retErr error) {
 	}()
 	opts := tsdb.DefaultHeadOptions()
 	opts.ChunkRange = 1000
-	h, _ := tsdb.NewHead(nil, nil, nil, opts, nil)
+	h, _ := tsdb.NewHead(nil, nil, nil, nil, opts, nil)
 	return h.Stats(statsByLabelName), nil
 }