Merge branch 'main' into prometheus-2023-07-31-76dd9b547

This commit is contained in:
Oleg Zaytsev 2023-08-04 09:46:03 +02:00
commit a54cb4744d
90 changed files with 8752 additions and 1475 deletions

8
.github/CODEOWNERS vendored
View file

@ -1,8 +0,0 @@
/web/ui @juliusv
/web/ui/module @juliusv @nexucis
/storage/remote @csmarchbanks @cstyan @bwplotka @tomwilkie
/storage/remote/otlptranslator @gouthamve @jesusvazquez
/discovery/kubernetes @brancz
/tsdb @jesusvazquez
/promql @roidelapluie
/cmd/promtool @dgl

View file

@ -4,19 +4,25 @@ updates:
directory: "/"
schedule:
interval: "monthly"
# Disable version updates; we will get them when we update from upstream Prometheus.
open-pull-requests-limit: 0
- package-ecosystem: "gomod"
directory: "/documentation/examples/remote_storage"
schedule:
interval: "monthly"
open-pull-requests-limit: 0
- package-ecosystem: "npm"
directory: "/web/ui"
schedule:
interval: "monthly"
open-pull-requests-limit: 0
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "monthly"
open-pull-requests-limit: 0
- package-ecosystem: "docker"
directory: "/"
schedule:
interval: "monthly"
open-pull-requests-limit: 0

View file

@ -1,25 +0,0 @@
name: buf.build
on:
pull_request:
paths:
- ".github/workflows/buf-lint.yml"
- "**.proto"
permissions:
contents: read
jobs:
buf:
name: lint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: bufbuild/buf-setup-action@v1.23.1
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
- uses: bufbuild/buf-lint-action@v1
with:
input: 'prompb'
- uses: bufbuild/buf-breaking-action@v1
with:
input: 'prompb'
against: 'https://github.com/prometheus/prometheus.git#branch=main,ref=HEAD,subdir=prompb'

View file

@ -1,29 +0,0 @@
name: buf.build
on:
push:
branches:
- main
permissions:
contents: read
jobs:
buf:
name: lint and publish
runs-on: ubuntu-latest
if: github.repository_owner == 'prometheus'
steps:
- uses: actions/checkout@v3
- uses: bufbuild/buf-setup-action@v1.23.1
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
- uses: bufbuild/buf-lint-action@v1
with:
input: 'prompb'
- uses: bufbuild/buf-breaking-action@v1
with:
input: 'prompb'
against: 'https://github.com/prometheus/prometheus.git#branch=main,ref=HEAD~1,subdir=prompb'
- uses: bufbuild/buf-push-action@v1
with:
input: 'prompb'
buf_token: ${{ secrets.BUF_TOKEN }}

View file

@ -1,224 +0,0 @@
---
name: CI
on:
pull_request:
push:
jobs:
test_go:
name: Go tests
runs-on: ubuntu-latest
# Whenever the Go version is updated here, .promu.yml
# should also be updated.
container:
image: quay.io/prometheus/golang-builder:1.20-base
steps:
- uses: actions/checkout@v3
- uses: prometheus/promci@v0.1.0
- uses: ./.github/promci/actions/setup_environment
- run: make GO_ONLY=1 SKIP_GOLANGCI_LINT=1
- run: go test ./tsdb/ -test.tsdb-isolation=false
- run: go test --tags=stringlabels ./...
- run: GOARCH=386 go test ./cmd/prometheus
- run: make -C documentation/examples/remote_storage
- run: make -C documentation/examples
- uses: ./.github/promci/actions/check_proto
with:
version: "3.15.8"
test_ui:
name: UI tests
runs-on: ubuntu-latest
# Whenever the Go version is updated here, .promu.yml
# should also be updated.
container:
image: quay.io/prometheus/golang-builder:1.20-base
steps:
- uses: actions/checkout@v3
- uses: prometheus/promci@v0.1.0
- uses: ./.github/promci/actions/setup_environment
with:
enable_go: false
enable_npm: true
- run: make assets-tarball
- run: make ui-lint
- run: make ui-test
- uses: ./.github/promci/actions/save_artifacts
with:
directory: .tarballs
test_windows:
name: Go tests on Windows
runs-on: windows-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-go@v4
with:
go-version: '>=1.20 <1.21'
- run: |
$TestTargets = go list ./... | Where-Object { $_ -NotMatch "(github.com/prometheus/prometheus/discovery.*|github.com/prometheus/prometheus/config|github.com/prometheus/prometheus/web)"}
go test $TestTargets -vet=off -v
shell: powershell
test_golang_oldest:
name: Go tests with previous Go version
runs-on: ubuntu-latest
    # The go version in this image should be N-1 wrt test_go.
container:
image: quay.io/prometheus/golang-builder:1.19-base
steps:
- uses: actions/checkout@v3
- run: make build
- run: go test ./tsdb/...
- run: go test ./tsdb/ -test.tsdb-isolation=false
test_mixins:
name: Mixins tests
runs-on: ubuntu-latest
# Whenever the Go version is updated here, .promu.yml
# should also be updated.
container:
image: quay.io/prometheus/golang-builder:1.19-base
steps:
- uses: actions/checkout@v3
- run: go install ./cmd/promtool/.
- run: go install github.com/google/go-jsonnet/cmd/jsonnet@latest
- run: go install github.com/google/go-jsonnet/cmd/jsonnetfmt@latest
- run: go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest
- run: make -C documentation/prometheus-mixin clean
- run: make -C documentation/prometheus-mixin jb_install
- run: make -C documentation/prometheus-mixin
- run: git diff --exit-code
build:
name: Build Prometheus for common architectures
runs-on: ubuntu-latest
if: |
!(github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v2.'))
&&
!(github.event_name == 'pull_request' && startsWith(github.event.pull_request.base.ref, 'release-'))
&&
!(github.event_name == 'push' && github.event.ref == 'refs/heads/main')
strategy:
matrix:
thread: [ 0, 1, 2 ]
steps:
- uses: actions/checkout@v3
- uses: prometheus/promci@v0.1.0
- uses: ./.github/promci/actions/build
with:
promu_opts: "-p linux/amd64 -p windows/amd64 -p linux/arm64 -p darwin/amd64 -p darwin/arm64 -p linux/386"
parallelism: 3
thread: ${{ matrix.thread }}
build_all:
name: Build Prometheus for all architectures
runs-on: ubuntu-latest
if: |
(github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v2.'))
||
(github.event_name == 'pull_request' && startsWith(github.event.pull_request.base.ref, 'release-'))
||
(github.event_name == 'push' && github.event.ref == 'refs/heads/main')
strategy:
matrix:
thread: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 ]
# Whenever the Go version is updated here, .promu.yml
# should also be updated.
steps:
- uses: actions/checkout@v3
- uses: prometheus/promci@v0.1.0
- uses: ./.github/promci/actions/build
with:
parallelism: 12
thread: ${{ matrix.thread }}
golangci:
name: golangci-lint
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Install Go
uses: actions/setup-go@v4
with:
go-version: 1.20.x
- name: Install snmp_exporter/generator dependencies
run: sudo apt-get update && sudo apt-get -y install libsnmp-dev
if: github.repository == 'prometheus/snmp_exporter'
- name: Lint
uses: golangci/golangci-lint-action@v3.6.0
with:
args: --verbose
version: v1.53.3
fuzzing:
uses: ./.github/workflows/fuzzing.yml
if: github.event_name == 'pull_request'
codeql:
uses: ./.github/workflows/codeql-analysis.yml
publish_main:
name: Publish main branch artifacts
runs-on: ubuntu-latest
needs: [test_ui, test_go, test_windows, golangci, codeql, build_all]
if: github.event_name == 'push' && github.event.ref == 'refs/heads/main'
steps:
- uses: actions/checkout@v3
- uses: prometheus/promci@v0.1.0
- uses: ./.github/promci/actions/publish_main
with:
docker_hub_login: ${{ secrets.docker_hub_login }}
docker_hub_password: ${{ secrets.docker_hub_password }}
quay_io_login: ${{ secrets.quay_io_login }}
quay_io_password: ${{ secrets.quay_io_password }}
publish_release:
name: Publish release artefacts
runs-on: ubuntu-latest
needs: [test_ui, test_go, test_windows, golangci, codeql, build_all]
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v2.')
steps:
- uses: actions/checkout@v3
- uses: prometheus/promci@v0.1.0
- uses: ./.github/promci/actions/publish_release
with:
docker_hub_login: ${{ secrets.docker_hub_login }}
docker_hub_password: ${{ secrets.docker_hub_password }}
quay_io_login: ${{ secrets.quay_io_login }}
quay_io_password: ${{ secrets.quay_io_password }}
github_token: ${{ secrets.PROMBOT_GITHUB_TOKEN }}
publish_ui_release:
name: Publish UI on npm Registry
runs-on: ubuntu-latest
needs: [test_ui, codeql]
steps:
- name: Checkout
uses: actions/checkout@v3
- uses: prometheus/promci@v0.1.0
- name: Install nodejs
uses: actions/setup-node@v3
with:
node-version-file: "web/ui/.nvmrc"
registry-url: "https://registry.npmjs.org"
- uses: actions/cache@v3.3.1
with:
path: ~/.npm
key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-
- name: Check libraries version
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v2.')
run: ./scripts/ui_release.sh --check-package "$(echo ${{ github.ref_name }}|sed s/v2/v0/)"
- name: build
run: make assets
- name: Copy files before publishing libs
run: ./scripts/ui_release.sh --copy
- name: Publish dry-run libraries
if: "!(github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v2.'))"
run: ./scripts/ui_release.sh --publish dry-run
- name: Publish libraries
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v2.')
run: ./scripts/ui_release.sh --publish
env:
# The setup-node action writes an .npmrc file with this env variable
# as the placeholder for the auth token
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}

View file

@ -1,41 +0,0 @@
---
name: "CodeQL"
on:
workflow_call:
schedule:
- cron: "26 14 * * 1"
permissions:
contents: read
security-events: write
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
permissions:
security-events: write
strategy:
fail-fast: false
matrix:
language: ["go", "javascript"]
steps:
- name: Checkout repository
uses: actions/checkout@v3
- uses: actions/setup-go@v4
with:
go-version: '>=1.20 <1.21'
- name: Initialize CodeQL
uses: github/codeql-action/init@v2
with:
languages: ${{ matrix.language }}
- name: Autobuild
uses: github/codeql-action/autobuild@v2
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v2

View file

@ -1,61 +0,0 @@
on:
repository_dispatch:
types: [funcbench_start]
name: Funcbench Workflow
permissions:
contents: read
jobs:
run_funcbench:
name: Running funcbench
if: github.event.action == 'funcbench_start'
runs-on: ubuntu-latest
env:
AUTH_FILE: ${{ secrets.TEST_INFRA_PROVIDER_AUTH }}
BRANCH: ${{ github.event.client_payload.BRANCH }}
BENCH_FUNC_REGEX: ${{ github.event.client_payload.BENCH_FUNC_REGEX }}
PACKAGE_PATH: ${{ github.event.client_payload.PACKAGE_PATH }}
GITHUB_TOKEN: ${{ secrets.PROMBOT_GITHUB_TOKEN }}
GITHUB_ORG: prometheus
GITHUB_REPO: prometheus
GITHUB_STATUS_TARGET_URL: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}
LAST_COMMIT_SHA: ${{ github.event.client_payload.LAST_COMMIT_SHA }}
GKE_PROJECT_ID: macro-mile-203600
PR_NUMBER: ${{ github.event.client_payload.PR_NUMBER }}
PROVIDER: gke
ZONE: europe-west3-a
steps:
- name: Update status to pending
run: >-
curl -i -X POST
-H "Authorization: Bearer $GITHUB_TOKEN"
-H "Content-Type: application/json"
--data '{"state":"pending","context":"funcbench-status","target_url":"'$GITHUB_STATUS_TARGET_URL'"}'
"https://api.github.com/repos/$GITHUB_REPOSITORY/statuses/$LAST_COMMIT_SHA"
- name: Prepare nodepool
uses: docker://prominfra/funcbench:master
with:
entrypoint: "docker_entrypoint"
args: make deploy
- name: Delete all resources
if: always()
uses: docker://prominfra/funcbench:master
with:
entrypoint: "docker_entrypoint"
args: make clean
- name: Update status to failure
if: failure()
run: >-
curl -i -X POST
-H "Authorization: Bearer $GITHUB_TOKEN"
-H "Content-Type: application/json"
--data '{"state":"failure","context":"funcbench-status","target_url":"'$GITHUB_STATUS_TARGET_URL'"}'
"https://api.github.com/repos/$GITHUB_REPOSITORY/statuses/$LAST_COMMIT_SHA"
- name: Update status to success
if: success()
run: >-
curl -i -X POST
-H "Authorization: Bearer $GITHUB_TOKEN"
-H "Content-Type: application/json"
--data '{"state":"success","context":"funcbench-status","target_url":"'$GITHUB_STATUS_TARGET_URL'"}'
"https://api.github.com/repos/$GITHUB_REPOSITORY/statuses/$LAST_COMMIT_SHA"

View file

@ -1,28 +0,0 @@
name: CIFuzz
on:
workflow_call:
permissions:
contents: read
jobs:
Fuzzing:
runs-on: ubuntu-latest
steps:
- name: Build Fuzzers
id: build
uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
with:
oss-fuzz-project-name: "prometheus"
dry-run: false
- name: Run Fuzzers
uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
with:
oss-fuzz-project-name: "prometheus"
fuzz-seconds: 600
dry-run: false
- name: Upload Crash
uses: actions/upload-artifact@v3
if: failure() && steps.build.outcome == 'success'
with:
name: artifacts
path: ./out/artifacts

30
.github/workflows/golangci-lint.yml vendored Normal file
View file

@ -0,0 +1,30 @@
name: golangci-lint
on:
push:
paths:
- "go.sum"
- "go.mod"
- "**.go"
- "scripts/errcheck_excludes.txt"
- ".github/workflows/golangci-lint.yml"
- ".golangci.yml"
pull_request:
jobs:
golangci:
name: lint
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: install Go
uses: actions/setup-go@v2
with:
go-version: '>=1.20 <1.21'
- name: Install snmp_exporter/generator dependencies
run: sudo apt-get update && sudo apt-get -y install libsnmp-dev
if: github.repository == 'prometheus/snmp_exporter'
- name: Lint
uses: golangci/golangci-lint-action@v3.3.1
with:
version: v1.52.2

View file

@ -1,126 +0,0 @@
on:
repository_dispatch:
types: [prombench_start, prombench_restart, prombench_stop]
name: Prombench Workflow
env:
AUTH_FILE: ${{ secrets.TEST_INFRA_PROVIDER_AUTH }}
CLUSTER_NAME: test-infra
DOMAIN_NAME: prombench.prometheus.io
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_ORG: prometheus
GITHUB_REPO: prometheus
GITHUB_STATUS_TARGET_URL: https://github.com/${{github.repository}}/actions/runs/${{github.run_id}}
LAST_COMMIT_SHA: ${{ github.event.client_payload.LAST_COMMIT_SHA }}
GKE_PROJECT_ID: macro-mile-203600
PR_NUMBER: ${{ github.event.client_payload.PR_NUMBER }}
PROVIDER: gke
RELEASE: ${{ github.event.client_payload.RELEASE }}
ZONE: europe-west3-a
jobs:
benchmark_start:
name: Benchmark Start
if: github.event.action == 'prombench_start'
runs-on: ubuntu-latest
steps:
- name: Update status to pending
run: >-
curl -i -X POST
-H "Authorization: Bearer $GITHUB_TOKEN"
-H "Content-Type: application/json"
--data '{"state":"pending", "context": "prombench-status-update-start", "target_url": "'$GITHUB_STATUS_TARGET_URL'"}'
"https://api.github.com/repos/$GITHUB_REPOSITORY/statuses/$LAST_COMMIT_SHA"
- name: Run make deploy to start test
id: make_deploy
uses: docker://prominfra/prombench:master
with:
args: >-
until make all_nodes_deleted; do echo "waiting for nodepools to be deleted"; sleep 10; done;
make deploy;
- name: Update status to failure
if: failure()
run: >-
curl -i -X POST
-H "Authorization: Bearer $GITHUB_TOKEN"
-H "Content-Type: application/json"
--data '{"state":"failure", "context": "prombench-status-update-start", "target_url": "'$GITHUB_STATUS_TARGET_URL'"}'
"https://api.github.com/repos/$GITHUB_REPOSITORY/statuses/$LAST_COMMIT_SHA"
- name: Update status to success
if: success()
run: >-
curl -i -X POST
-H "Authorization: Bearer $GITHUB_TOKEN"
-H "Content-Type: application/json"
--data '{"state":"success", "context": "prombench-status-update-start", "target_url": "'$GITHUB_STATUS_TARGET_URL'"}'
"https://api.github.com/repos/$GITHUB_REPOSITORY/statuses/$LAST_COMMIT_SHA"
benchmark_cancel:
name: Benchmark Cancel
if: github.event.action == 'prombench_stop'
runs-on: ubuntu-latest
steps:
- name: Update status to pending
run: >-
curl -i -X POST
-H "Authorization: Bearer $GITHUB_TOKEN"
-H "Content-Type: application/json"
--data '{"state":"pending", "context": "prombench-status-update-cancel", "target_url": "'$GITHUB_STATUS_TARGET_URL'"}'
"https://api.github.com/repos/$GITHUB_REPOSITORY/statuses/$LAST_COMMIT_SHA"
- name: Run make clean to stop test
id: make_clean
uses: docker://prominfra/prombench:master
with:
args: >-
until make all_nodes_running; do echo "waiting for nodepools to be created"; sleep 10; done;
make clean;
- name: Update status to failure
if: failure()
run: >-
curl -i -X POST
-H "Authorization: Bearer $GITHUB_TOKEN"
-H "Content-Type: application/json"
--data '{"state":"failure", "context": "prombench-status-update-cancel", "target_url": "'$GITHUB_STATUS_TARGET_URL'"}'
"https://api.github.com/repos/$GITHUB_REPOSITORY/statuses/$LAST_COMMIT_SHA"
- name: Update status to success
if: success()
run: >-
curl -i -X POST
-H "Authorization: Bearer $GITHUB_TOKEN"
-H "Content-Type: application/json"
--data '{"state":"success", "context": "prombench-status-update-cancel", "target_url": "'$GITHUB_STATUS_TARGET_URL'"}'
"https://api.github.com/repos/$GITHUB_REPOSITORY/statuses/$LAST_COMMIT_SHA"
benchmark_restart:
name: Benchmark Restart
if: github.event.action == 'prombench_restart'
runs-on: ubuntu-latest
steps:
- name: Update status to pending
run: >-
curl -i -X POST
-H "Authorization: Bearer $GITHUB_TOKEN"
-H "Content-Type: application/json"
--data '{"state":"pending", "context": "prombench-status-update-restart", "target_url": "'$GITHUB_STATUS_TARGET_URL'"}'
"https://api.github.com/repos/$GITHUB_REPOSITORY/statuses/$LAST_COMMIT_SHA"
- name: Run make clean then make deploy to restart test
id: make_restart
uses: docker://prominfra/prombench:master
with:
args: >-
until make all_nodes_running; do echo "waiting for nodepools to be created"; sleep 10; done;
make clean;
until make all_nodes_deleted; do echo "waiting for nodepools to be deleted"; sleep 10; done;
make deploy;
- name: Update status to failure
if: failure()
run: >-
curl -i -X POST
-H "Authorization: Bearer $GITHUB_TOKEN"
-H "Content-Type: application/json"
--data '{"state":"failure", "context": "prombench-status-update-restart", "target_url": "'$GITHUB_STATUS_TARGET_URL'"}'
"https://api.github.com/repos/$GITHUB_REPOSITORY/statuses/$LAST_COMMIT_SHA"
- name: Update status to success
if: success()
run: >-
curl -i -X POST
-H "Authorization: Bearer $GITHUB_TOKEN"
-H "Content-Type: application/json"
--data '{"state":"success", "context": "prombench-status-update-restart", "target_url": "'$GITHUB_STATUS_TARGET_URL'"}'
"https://api.github.com/repos/$GITHUB_REPOSITORY/statuses/$LAST_COMMIT_SHA"

View file

@ -1,19 +0,0 @@
---
name: Sync repo files
on:
schedule:
- cron: '44 17 * * *'
permissions:
contents: read
jobs:
repo_sync:
runs-on: ubuntu-latest
if: github.repository_owner == 'prometheus'
container:
image: quay.io/prometheus/golang-builder
steps:
- uses: actions/checkout@v3
- run: ./scripts/sync_repo_files.sh
env:
GITHUB_TOKEN: ${{ secrets.PROMBOT_GITHUB_TOKEN }}

17
.github/workflows/sync-fork.yml vendored Normal file
View file

@ -0,0 +1,17 @@
name: sync fork with upstream
on:
schedule:
- cron: '11 8 * * 1' # 8:11 UTC on Monday
workflow_dispatch: # for manual testing
jobs:
sync-fork-pr:
runs-on: ubuntu-latest
steps:
- uses: tgymnich/fork-sync@v1.7
with:
owner: grafana
base: main
head: main

54
.github/workflows/test.yml vendored Normal file
View file

@ -0,0 +1,54 @@
name: ci
on:
push:
branches: [main]
pull_request:
jobs:
test:
runs-on: ubuntu-20.04
steps:
- name: Upgrade golang
run: |
cd /tmp
wget https://dl.google.com/go/go1.20.3.linux-amd64.tar.gz
tar -zxvf go1.20.3.linux-amd64.tar.gz
sudo rm -fr /usr/local/go
sudo mv /tmp/go /usr/local/go
cd -
ls -l /usr/bin/go
- name: Checkout Repo
uses: actions/checkout@v2
# This file would normally be created by `make assets`, here we just
# mock it because the file is required for the tests to pass.
- name: Mock building of necessary react file
run: mkdir web/ui/static/react && touch web/ui/static/react/index.html
- name: Run Tests
run: GO=/usr/local/go/bin/go make common-test
test-stringlabels:
runs-on: ubuntu-20.04
steps:
- name: Upgrade golang
run: |
cd /tmp
wget https://dl.google.com/go/go1.20.3.linux-amd64.tar.gz
tar -zxvf go1.20.3.linux-amd64.tar.gz
sudo rm -fr /usr/local/go
sudo mv /tmp/go /usr/local/go
cd -
ls -l /usr/bin/go
- name: Checkout Repo
uses: actions/checkout@v2
# This file would normally be created by `make assets`, here we just
# mock it because the file is required for the tests to pass.
- name: Mock building of necessary react file
run: mkdir web/ui/static/react && touch web/ui/static/react/index.html
- name: Run Tests -tags=stringlabels
run: GO=/usr/local/go/bin/go GOOPTS=-tags=stringlabels make common-test

View file

@ -0,0 +1,44 @@
name: ui_build_and_release
on:
pull_request:
push:
branches:
- main
tags:
- "v0.[0-9]+.[0-9]+*"
jobs:
release:
name: release
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Install nodejs
uses: actions/setup-node@v3
with:
node-version-file: "web/ui/.nvmrc"
- uses: actions/cache@v3.0.4
with:
path: ~/.npm
key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-
- name: Check libraries version
## This step is verifying that the version of each package is matching the tag
if: ${{ github.event_name == 'push' && startsWith(github.ref_name, 'v') }}
run: ./scripts/ui_release.sh --check-package "${{ github.ref_name }}"
- name: build
run: make assets
- name: Copy files before publishing libs
run: ./scripts/ui_release.sh --copy
- name: Publish dry-run libraries
if: ${{ github.event_name == 'pull_request' || github.ref_name == 'main' }}
run: ./scripts/ui_release.sh --publish dry-run
- name: Publish libraries
if: ${{ github.event_name == 'push' && startsWith(github.ref_name, 'v') }}
run: ./scripts/ui_release.sh --publish
env:
# The setup-node action writes an .npmrc file with this env variable
# as the placeholder for the auth token
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}

View file

@ -100,6 +100,8 @@ ifeq ($(GOHOSTARCH),amd64)
endif
endif
test-flags += -timeout 20m
# This rule is used to forward a target like "build" to "common-build". This
# allows a new "build" target to be defined in a Makefile which includes this
# one and override "common-build" without override warnings.

89
cmd/compact/main.go Normal file
View file

@ -0,0 +1,89 @@
package main
import (
"context"
"flag"
"log"
"os"
"os/signal"
"runtime/pprof"
"syscall"
golog "github.com/go-kit/log"
"github.com/prometheus/prometheus/tsdb"
)
func main() {
var (
outputDir string
shardCount int
cpuProf string
segmentSizeMB int64
maxClosingBlocks int
symbolFlushers int
openConcurrency int
)
flag.StringVar(&outputDir, "output-dir", ".", "Output directory for new block(s)")
	flag.StringVar(&cpuProf, "cpuprofile", "", "Where to store CPU profile (if not empty)")
flag.IntVar(&shardCount, "shard-count", 1, "Number of shards for splitting")
flag.Int64Var(&segmentSizeMB, "segment-file-size", 512, "Size of segment file")
flag.IntVar(&maxClosingBlocks, "max-closing-blocks", 2, "Number of blocks that can close at once during split compaction")
flag.IntVar(&symbolFlushers, "symbol-flushers", 4, "Number of symbol flushers used during split compaction")
flag.IntVar(&openConcurrency, "open-concurrency", 4, "Number of goroutines used when opening blocks")
flag.Parse()
logger := golog.NewLogfmtLogger(os.Stderr)
var blockDirs []string
for _, d := range flag.Args() {
s, err := os.Stat(d)
if err != nil {
panic(err)
}
if !s.IsDir() {
log.Fatalln("not a directory: ", d)
}
blockDirs = append(blockDirs, d)
}
if len(blockDirs) == 0 {
log.Fatalln("no blocks to compact")
}
if cpuProf != "" {
f, err := os.Create(cpuProf)
if err != nil {
log.Fatalln(err)
}
log.Println("writing to", cpuProf)
err = pprof.StartCPUProfile(f)
if err != nil {
log.Fatalln(err)
}
defer pprof.StopCPUProfile()
}
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer cancel()
c, err := tsdb.NewLeveledCompactorWithChunkSize(ctx, nil, logger, []int64{0}, nil, segmentSizeMB*1024*1024, nil, true)
if err != nil {
log.Panicln("creating compactor", err)
}
opts := tsdb.DefaultLeveledCompactorConcurrencyOptions()
opts.MaxClosingBlocks = maxClosingBlocks
opts.SymbolsFlushersCount = symbolFlushers
opts.MaxOpeningBlocks = openConcurrency
c.SetConcurrencyOptions(opts)
_, err = c.CompactWithSplitting(outputDir, blockDirs, nil, uint64(shardCount))
if err != nil {
log.Panicln("compacting", err)
}
}

View file

@ -132,13 +132,13 @@ type MonitoringAssignment_Target struct {
// E.g., `backend-01`
Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"`
// Scheme on which to scrape the target.
//E.g., `http`
// E.g., `http`
Scheme string `protobuf:"bytes,2,opt,name=scheme,proto3" json:"scheme,omitempty"`
// Address (preferably IP) for the service
// E.g., `backend.svc` or `10.1.4.32:9090`
Address string `protobuf:"bytes,3,opt,name=address,proto3" json:"address,omitempty"`
// Optional path to append to the address for scraping
//E.g., `/metrics`
// E.g., `/metrics`
MetricsPath string `protobuf:"bytes,4,opt,name=metrics_path,json=metricsPath,proto3" json:"metrics_path,omitempty"`
// Arbitrary labels associated with that particular target.
//

3
go.mod
View file

@ -8,11 +8,13 @@ require (
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.3.0
github.com/Azure/go-autorest/autorest v0.11.29
github.com/Azure/go-autorest/autorest/adal v0.9.23
github.com/DmitriyVTitov/size v1.5.0
github.com/alecthomas/kingpin/v2 v2.3.2
github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137
github.com/aws/aws-sdk-go v1.44.302
github.com/cespare/xxhash/v2 v2.2.0
github.com/dennwc/varint v1.0.0
github.com/dgraph-io/ristretto v0.1.1
github.com/digitalocean/godo v1.99.0
github.com/docker/docker v24.0.4+incompatible
github.com/edsrzf/mmap-go v1.1.0
@ -91,6 +93,7 @@ require (
github.com/Azure/azure-sdk-for-go/sdk/internal v1.3.0 // indirect
github.com/AzureAD/microsoft-authentication-library-for-go v1.0.0 // indirect
github.com/coreos/go-systemd/v22 v22.5.0 // indirect
github.com/dustin/go-humanize v1.0.0 // indirect
github.com/google/s2a-go v0.1.4 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect

9
go.sum
View file

@ -69,6 +69,8 @@ github.com/AzureAD/microsoft-authentication-library-for-go v1.0.0/go.mod h1:kgDm
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ=
github.com/DmitriyVTitov/size v1.5.0 h1:/PzqxYrOyOUX1BXj6J9OuVRVGe+66VL4D9FlUaW515g=
github.com/DmitriyVTitov/size v1.5.0/go.mod h1:le6rNI4CoLQV1b9gzp1+3d7hMAD/uu2QcJ+aYbNgiU0=
github.com/Knetic/govaluate v3.0.1-0.20171022003610-9aa49832a739+incompatible/go.mod h1:r7JcOSlj0wfOMncg0iLm8Leh48TZaKVeNIfJntJ2wa0=
github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow=
github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM=
@ -155,7 +157,11 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dennwc/varint v1.0.0 h1:kGNFFSSw8ToIy3obO/kKr8U9GZYUAxQEVuix4zfDWzE=
github.com/dennwc/varint v1.0.0/go.mod h1:hnItb35rvZvJrbTALZtY/iQfDs48JKRG1RPpgziApxA=
github.com/dgraph-io/ristretto v0.1.1 h1:6CWw5tJNgpegArSHpNHJKldNeq03FQCwYvfMVWajOK8=
github.com/dgraph-io/ristretto v0.1.1/go.mod h1:S1GPSBCYCIhmVNfcth17y2zZtQT6wzkzgwUve0VDWWA=
github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2 h1:tdlZCpZ/P9DhczCTSixgIKmwPv6+wP5DGjqLYw5SUiA=
github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw=
github.com/digitalocean/godo v1.99.0 h1:gUHO7n9bDaZFWvbzOum4bXE0/09ZuYA9yA8idQHX57E=
github.com/digitalocean/godo v1.99.0/go.mod h1:SsS2oXo2rznfM/nORlZ/6JaUJZFhmKTib1YhopUc8NA=
github.com/dnaeon/go-vcr v1.2.0 h1:zHCHvJYTMh1N7xnV7zf1m1GPBF9Ad0Jk/whtQ1663qI=
@ -169,6 +175,8 @@ github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE=
github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo=
github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
github.com/eapache/go-resiliency v1.1.0/go.mod h1:kFI+JgMyC7bLPUVY133qvEBtVayf5mFgVsvEsIPBvNs=
github.com/eapache/go-xerial-snappy v0.0.0-20180814174437-776d5712da21/go.mod h1:+020luEh2TKB4/GOp8oxxtq0Daoen/Cii55CzbTV6DU=
github.com/eapache/queue v1.1.0/go.mod h1:6eCeP0CKFpHLu8blIFXhExK/dRa7WDZfr6jVFPTqq+I=
@ -1029,6 +1037,7 @@ golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20221010170243-090e33056c14/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=

View file

@ -118,3 +118,30 @@ func (m *Matcher) GetRegexString() string {
}
return m.re.GetRegexString()
}
// SetMatches returns a set of equality matchers for the current regex matchers if possible.
// For examples the regexp `a(b|f)` will returns "ab" and "af".
// Returns nil if we can't replace the regexp by only equality matchers.
func (m *Matcher) SetMatches() []string {
if m.re == nil {
return nil
}
return m.re.SetMatches()
}
// Prefix returns the required prefix of the value to match, if possible.
// It will be empty if it's an equality matcher or if the prefix can't be determined.
func (m *Matcher) Prefix() string {
if m.re == nil {
return ""
}
return m.re.prefix
}
// IsRegexOptimized returns whether regex is optimized.
func (m *Matcher) IsRegexOptimized() bool {
if m.re == nil {
return false
}
return m.re.IsOptimized()
}

View file

@ -14,13 +14,14 @@
package labels
import (
"fmt"
"testing"
"github.com/stretchr/testify/require"
)
func mustNewMatcher(t *testing.T, mType MatchType, value string) *Matcher {
m, err := NewMatcher(mType, "", value)
m, err := NewMatcher(mType, "test_label_name", value)
require.NoError(t, err)
return m
}
@ -81,6 +82,21 @@ func TestMatcher(t *testing.T) {
value: "foo-bar",
match: false,
},
{
matcher: mustNewMatcher(t, MatchRegexp, "$*bar"),
value: "foo-bar",
match: false,
},
{
matcher: mustNewMatcher(t, MatchRegexp, "bar^+"),
value: "foo-bar",
match: false,
},
{
matcher: mustNewMatcher(t, MatchRegexp, "$+bar"),
value: "foo-bar",
match: false,
},
}
for _, test := range tests {
@ -118,6 +134,82 @@ func TestInverse(t *testing.T) {
}
}
func TestPrefix(t *testing.T) {
for i, tc := range []struct {
matcher *Matcher
prefix string
}{
{
matcher: mustNewMatcher(t, MatchEqual, "abc"),
prefix: "",
},
{
matcher: mustNewMatcher(t, MatchNotEqual, "abc"),
prefix: "",
},
{
matcher: mustNewMatcher(t, MatchRegexp, "abc.+"),
prefix: "abc",
},
{
matcher: mustNewMatcher(t, MatchRegexp, "abcd|abc.+"),
prefix: "abc",
},
{
matcher: mustNewMatcher(t, MatchNotRegexp, "abcd|abc.+"),
prefix: "abc",
},
{
matcher: mustNewMatcher(t, MatchRegexp, "abc(def|ghj)|ab|a."),
prefix: "a",
},
{
matcher: mustNewMatcher(t, MatchRegexp, "foo.+bar|foo.*baz"),
prefix: "foo",
},
{
matcher: mustNewMatcher(t, MatchRegexp, "abc|.*"),
prefix: "",
},
{
matcher: mustNewMatcher(t, MatchRegexp, "abc|def"),
prefix: "",
},
{
matcher: mustNewMatcher(t, MatchRegexp, ".+def"),
prefix: "",
},
} {
t.Run(fmt.Sprintf("%d: %s", i, tc.matcher), func(t *testing.T) {
require.Equal(t, tc.prefix, tc.matcher.Prefix())
})
}
}
// TestIsRegexOptimized verifies Matcher.IsRegexOptimized() across matcher
// types: non-regex matchers and unoptimizable regexes report false, while a
// regex with a literal prefix reports true.
func TestIsRegexOptimized(t *testing.T) {
	for i, tc := range []struct {
		matcher          *Matcher
		isRegexOptimized bool
	}{
		{
			// Equality matchers have no regex at all.
			matcher:          mustNewMatcher(t, MatchEqual, "abc"),
			isRegexOptimized: false,
		},
		{
			// "." alone has no prefix/suffix/set-matches optimization.
			matcher:          mustNewMatcher(t, MatchRegexp, "."),
			isRegexOptimized: false,
		},
		{
			// "abc.+" can be optimized with a literal-prefix check.
			matcher:          mustNewMatcher(t, MatchRegexp, "abc.+"),
			isRegexOptimized: true,
		},
	} {
		t.Run(fmt.Sprintf("%d: %s", i, tc.matcher), func(t *testing.T) {
			require.Equal(t, tc.isRegexOptimized, tc.matcher.IsRegexOptimized())
		})
	}
}
func BenchmarkMatchType_String(b *testing.B) {
for i := 0; i <= b.N; i++ {
_ = MatchType(i % int(MatchNotRegexp+1)).String()

View file

@ -15,72 +15,380 @@ package labels
import (
"strings"
"time"
"github.com/DmitriyVTitov/size"
"github.com/dgraph-io/ristretto"
"github.com/grafana/regexp"
"github.com/grafana/regexp/syntax"
)
type FastRegexMatcher struct {
re *regexp.Regexp
prefix string
suffix string
contains string
const (
maxSetMatches = 256
// shortcut for literals
literal bool
value string
// The minimum number of alternate values a regex should have to trigger
// the optimization done by optimizeEqualStringMatchers() and so use a map
// to match values instead of iterating over a list. This value has
// been computed running BenchmarkOptimizeEqualStringMatchers.
minEqualMultiStringMatcherMapThreshold = 16
fastRegexMatcherCacheMaxSizeBytes = 1024 * 1024 * 1024 // 1GB
fastRegexMatcherCacheTTL = 5 * time.Minute
)
var fastRegexMatcherCache *ristretto.Cache
func init() {
// Ignore error because it can only return error if config is invalid,
// but we're using an hardcoded static config here.
fastRegexMatcherCache, _ = ristretto.NewCache(&ristretto.Config{
NumCounters: 100_000, // 10x the max number of expected items (takes 3 bytes per counter),
MaxCost: fastRegexMatcherCacheMaxSizeBytes,
BufferItems: 64, // Recommended default per the Config docs,
Metrics: false,
})
}
type FastRegexMatcher struct {
// Under some conditions, re is nil because the expression is never parsed.
// We store the original string to be able to return it in GetRegexString().
reString string
re *regexp.Regexp
setMatches []string
stringMatcher StringMatcher
prefix string
suffix string
contains string
// matchString is the "compiled" function to run by MatchString().
matchString func(string) bool
}
func NewFastRegexMatcher(v string) (*FastRegexMatcher, error) {
if isLiteral(v) {
return &FastRegexMatcher{literal: true, value: v}, nil
// Check the cache.
if matcher, ok := fastRegexMatcherCache.Get(v); ok {
return matcher.(*FastRegexMatcher), nil
}
re, err := regexp.Compile("^(?:" + v + ")$")
// Create a new matcher.
matcher, err := newFastRegexMatcherWithoutCache(v)
if err != nil {
return nil, err
}
parsed, err := syntax.Parse(v, syntax.Perl)
if err != nil {
return nil, err
}
// Cache it.
fastRegexMatcherCache.SetWithTTL(v, matcher, int64(size.Of(matcher)), fastRegexMatcherCacheTTL)
return matcher, nil
}
func newFastRegexMatcherWithoutCache(v string) (*FastRegexMatcher, error) {
m := &FastRegexMatcher{
re: re,
reString: v,
}
if parsed.Op == syntax.OpConcat {
m.prefix, m.suffix, m.contains = optimizeConcatRegex(parsed)
m.stringMatcher, m.setMatches = optimizeAlternatingLiterals(v)
if m.stringMatcher != nil {
// If we already have a string matcher, we don't need to parse the regex
// or compile the matchString function. This also avoids the behavior in
// compileMatchStringFunction where it prefers to use setMatches when
// available, even if the string matcher is faster.
m.matchString = m.stringMatcher.Matches
} else {
parsed, err := syntax.Parse(v, syntax.Perl)
if err != nil {
return nil, err
}
// Simplify the syntax tree to run faster.
parsed = parsed.Simplify()
m.re, err = regexp.Compile("^(?:" + parsed.String() + ")$")
if err != nil {
return nil, err
}
if parsed.Op == syntax.OpConcat {
m.prefix, m.suffix, m.contains = optimizeConcatRegex(parsed)
}
if matches, caseSensitive := findSetMatches(parsed); caseSensitive {
m.setMatches = matches
}
m.stringMatcher = stringMatcherFromRegexp(parsed)
m.matchString = m.compileMatchStringFunction()
}
return m, nil
}
// compileMatchStringFunction returns the function to run by MatchString().
// The returned closure applies the cheapest applicable optimization first:
// exact set matches, then prefix/suffix/contains filters, then the string
// matcher, and finally the full regex engine as a fallback.
func (m *FastRegexMatcher) compileMatchStringFunction() func(string) bool {
	// If the only optimization available is the string matcher, then we can just run it.
	if len(m.setMatches) == 0 && m.prefix == "" && m.suffix == "" && m.contains == "" && m.stringMatcher != nil {
		return m.stringMatcher.Matches
	}

	return func(s string) bool {
		// Set matches are authoritative: if present, the input must equal one of them.
		if len(m.setMatches) != 0 {
			for _, match := range m.setMatches {
				if match == s {
					return true
				}
			}
			return false
		}
		// Cheap literal filters that can reject the input before running the
		// string matcher or the regex engine.
		if m.prefix != "" && !strings.HasPrefix(s, m.prefix) {
			return false
		}
		if m.suffix != "" && !strings.HasSuffix(s, m.suffix) {
			return false
		}
		if m.contains != "" && !strings.Contains(s, m.contains) {
			return false
		}
		if m.stringMatcher != nil {
			return m.stringMatcher.Matches(s)
		}
		// Fallback: run the compiled regular expression.
		return m.re.MatchString(s)
	}
}
// IsOptimized returns true if any fast-path optimization is applied to the
// regex matcher (set matches, a string matcher, or a literal
// prefix/suffix/contains filter).
func (m *FastRegexMatcher) IsOptimized() bool {
	switch {
	case m.stringMatcher != nil, len(m.setMatches) > 0:
		return true
	default:
		return m.prefix != "" || m.suffix != "" || m.contains != ""
	}
}
// findSetMatches extract equality matches from a regexp.
// Returns nil if we can't replace the regexp by only equality matchers or the regexp contains
// a mix of case sensitive and case insensitive matchers.
func findSetMatches(re *syntax.Regexp) (matches []string, caseSensitive bool) {
	// Strip the ^/$ anchors first; Prometheus regexes are implicitly anchored.
	clearBeginEndText(re)

	return findSetMatchesInternal(re, "")
}
// findSetMatchesInternal recursively extracts the finite set of strings
// matched by re, each prefixed with base. It returns nil when the regexp
// cannot be reduced to a finite set, together with a flag indicating whether
// all of the extracted matches are case sensitive.
func findSetMatchesInternal(re *syntax.Regexp, base string) (matches []string, caseSensitive bool) {
	switch re.Op {
	case syntax.OpBeginText:
		// Correctly handling the begin text operator inside a regex is tricky,
		// so in this case we fallback to the regex engine.
		return nil, false
	case syntax.OpEndText:
		// Correctly handling the end text operator inside a regex is tricky,
		// so in this case we fallback to the regex engine.
		return nil, false
	case syntax.OpLiteral:
		return []string{base + string(re.Rune)}, isCaseSensitive(re)
	case syntax.OpEmptyMatch:
		if base != "" {
			return []string{base}, isCaseSensitive(re)
		}
	case syntax.OpAlternate:
		return findSetMatchesFromAlternate(re, base)
	case syntax.OpCapture:
		// Capture groups don't affect which strings match; drop them and recurse.
		clearCapture(re)
		return findSetMatchesInternal(re, base)
	case syntax.OpConcat:
		return findSetMatchesFromConcat(re, base)
	case syntax.OpCharClass:
		// re.Rune holds [lo, hi] rune-range pairs, so its length must be even.
		if len(re.Rune)%2 != 0 {
			return nil, false
		}
		var matches []string
		var totalSet int
		for i := 0; i+1 < len(re.Rune); i += 2 {
			totalSet += int(re.Rune[i+1]-re.Rune[i]) + 1
		}
		// limits the total characters that can be used to create matches.
		// In some case like negation [^0-9] a lot of possibilities exists and that
		// can create thousands of possible matches at which points we're better off using regexp.
		if totalSet > maxSetMatches {
			return nil, false
		}
		// Expand every rune in every range into its own match string.
		for i := 0; i+1 < len(re.Rune); i += 2 {
			lo, hi := re.Rune[i], re.Rune[i+1]
			for c := lo; c <= hi; c++ {
				matches = append(matches, base+string(c))
			}
		}
		return matches, isCaseSensitive(re)
	default:
		return nil, false
	}
	// Reached only from the OpEmptyMatch case with an empty base.
	return nil, false
}
// findSetMatchesFromConcat extracts the set of matches from a concatenation
// by computing the cross product of the match sets of each sub-expression.
// It returns nil if any sub-expression is not reducible to a finite set, the
// product grows beyond maxSetMatches, or case sensitivity is mixed.
func findSetMatchesFromConcat(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) {
	if len(re.Sub) == 0 {
		return nil, false
	}
	clearCapture(re.Sub...)

	// Seed with the incoming base; each iteration extends every partial match
	// with the matches of the next sub-expression.
	matches = []string{base}

	for i := 0; i < len(re.Sub); i++ {
		var newMatches []string
		for j, b := range matches {
			m, caseSensitive := findSetMatchesInternal(re.Sub[i], b)
			if m == nil {
				return nil, false
			}
			if tooManyMatches(newMatches, m...) {
				return nil, false
			}

			// All matches must have the same case sensitivity. If it's the first set of matches
			// returned, we store its sensitivity as the expected case, and then we'll check all
			// other ones.
			if i == 0 && j == 0 {
				matchesCaseSensitive = caseSensitive
			}
			if matchesCaseSensitive != caseSensitive {
				return nil, false
			}

			newMatches = append(newMatches, m...)
		}
		matches = newMatches
	}

	return matches, matchesCaseSensitive
}
// findSetMatchesFromAlternate extracts the union of match sets from an
// alternation (a|b|...). It returns nil if any branch is not reducible to a
// finite set, the union exceeds maxSetMatches, or case sensitivity is mixed.
func findSetMatchesFromAlternate(re *syntax.Regexp, base string) (matches []string, matchesCaseSensitive bool) {
	for i, sub := range re.Sub {
		found, caseSensitive := findSetMatchesInternal(sub, base)
		if found == nil {
			return nil, false
		}
		if tooManyMatches(matches, found...) {
			return nil, false
		}

		// All matches must have the same case sensitivity. If it's the first set of matches
		// returned, we store its sensitivity as the expected case, and then we'll check all
		// other ones.
		if i == 0 {
			matchesCaseSensitive = caseSensitive
		}
		if matchesCaseSensitive != caseSensitive {
			return nil, false
		}

		matches = append(matches, found...)
	}

	return matches, matchesCaseSensitive
}
// clearCapture removes capture operations, which are irrelevant for matching,
// by replacing each capture node in place with its single sub-expression.
func clearCapture(regs ...*syntax.Regexp) {
	for _, r := range regs {
		if r.Op != syntax.OpCapture {
			continue
		}
		// An OpCapture node always wraps exactly one child; promote it.
		*r = *r.Sub[0]
	}
}
// clearBeginEndText removes the begin and end text from the regexp. Prometheus regexp are anchored to the beginning and end of the string.
// The regexp is modified in place.
func clearBeginEndText(re *syntax.Regexp) {
	// Do not clear begin/end text from an alternate operator because it could
	// change the actual regexp properties.
	if re.Op == syntax.OpAlternate {
		return
	}

	if len(re.Sub) == 0 {
		return
	}
	if len(re.Sub) == 1 {
		if re.Sub[0].Op == syntax.OpBeginText || re.Sub[0].Op == syntax.OpEndText {
			// We need to remove this element. Since it's the only one, we convert into a matcher of an empty string.
			// OpEmptyMatch is regexp's nop operator.
			re.Op = syntax.OpEmptyMatch
			re.Sub = nil
			return
		}
	}
	// Drop a leading ^ and/or a trailing $ from the sub-expressions.
	if re.Sub[0].Op == syntax.OpBeginText {
		re.Sub = re.Sub[1:]
	}
	if re.Sub[len(re.Sub)-1].Op == syntax.OpEndText {
		re.Sub = re.Sub[:len(re.Sub)-1]
	}
}
// isCaseInsensitive tells if a regexp is case insensitive, i.e. whether the
// FoldCase flag is set on this node. The flag should be checked at each
// level of the syntax tree.
func isCaseInsensitive(reg *syntax.Regexp) bool {
	return reg.Flags&syntax.FoldCase == syntax.FoldCase
}
// isCaseSensitive tells if a regexp is case sensitive, i.e. whether the
// FoldCase flag is NOT set on this node. The flag should be checked at each
// level of the syntax tree.
func isCaseSensitive(reg *syntax.Regexp) bool {
	return reg.Flags&syntax.FoldCase == 0
}
// tooManyMatches guards against creating too many set matches: it reports
// whether appending added to matches would exceed maxSetMatches.
func tooManyMatches(matches []string, added ...string) bool {
	total := len(matches) + len(added)
	return total > maxSetMatches
}
func (m *FastRegexMatcher) MatchString(s string) bool {
if m.literal {
return s == m.value
}
if m.prefix != "" && !strings.HasPrefix(s, m.prefix) {
return false
}
if m.suffix != "" && !strings.HasSuffix(s, m.suffix) {
return false
}
if m.contains != "" && !strings.Contains(s, m.contains) {
return false
}
return m.re.MatchString(s)
return m.matchString(s)
}
// SetMatches returns the finite set of strings this matcher accepts, or nil
// when the regex could not be reduced to such a set.
func (m *FastRegexMatcher) SetMatches() []string {
	return m.setMatches
}
func (m *FastRegexMatcher) GetRegexString() string {
if m.literal {
return m.value
}
return m.re.String()
return m.reString
}
func isLiteral(re string) bool {
return regexp.QuoteMeta(re) == re
// optimizeAlternatingLiterals optimizes a regex of the form
//
//	`literal1|literal2|literal3|...`
//
// this function returns an optimized StringMatcher or nil if the regex
// cannot be optimized in this way, and a list of setMatches up to maxSetMatches
func optimizeAlternatingLiterals(s string) (StringMatcher, []string) {
	if len(s) == 0 {
		return emptyStringMatcher{}, nil
	}

	estimatedAlternates := strings.Count(s, "|") + 1

	// If there are no alternates, check if the string is a literal
	if estimatedAlternates == 1 {
		if regexp.QuoteMeta(s) == s {
			return &equalStringMatcher{s: s, caseSensitive: true}, nil
		}
		return nil, nil
	}

	multiMatcher := newEqualMultiStringMatcher(true, estimatedAlternates)

	// Consume the string one '|'-separated segment at a time.
	for end := strings.IndexByte(s, '|'); end > -1; end = strings.IndexByte(s, '|') {
		// Split the string into the next literal and the remainder
		subMatch := s[:end]
		s = s[end+1:]

		// break if any of the submatches are not literals
		if regexp.QuoteMeta(subMatch) != subMatch {
			return nil, nil
		}

		multiMatcher.add(subMatch)
	}

	// break if the remainder is not a literal
	if regexp.QuoteMeta(s) != s {
		return nil, nil
	}
	multiMatcher.add(s)

	return multiMatcher, multiMatcher.setMatches()
}
}
// optimizeConcatRegex returns literal prefix/suffix text that can be safely
@ -123,3 +431,409 @@ func optimizeConcatRegex(r *syntax.Regexp) (prefix, suffix, contains string) {
return
}
// StringMatcher is a matcher that matches a string in place of a regular expression.
type StringMatcher interface {
	Matches(s string) bool
}

// stringMatcherFromRegexp attempts to replace a common regexp with a string matcher.
// It returns nil if the regexp is not supported.
// The input syntax tree is mutated: anchors are stripped and capture groups removed.
func stringMatcherFromRegexp(re *syntax.Regexp) StringMatcher {
	clearBeginEndText(re)

	m := stringMatcherFromRegexpInternal(re)
	// Collapse large alternations of equality matchers into a map-backed matcher.
	m = optimizeEqualStringMatchers(m, minEqualMultiStringMatcherMapThreshold)

	return m
}
// stringMatcherFromRegexpInternal converts a parsed regexp node into an
// equivalent StringMatcher, or returns nil when the node shape is not
// supported. It mutates the tree (capture removal, trimming of leading and
// trailing any-matchers on concatenations).
func stringMatcherFromRegexpInternal(re *syntax.Regexp) StringMatcher {
	clearCapture(re)

	switch re.Op {
	case syntax.OpBeginText:
		// Correctly handling the begin text operator inside a regex is tricky,
		// so in this case we fallback to the regex engine.
		return nil
	case syntax.OpEndText:
		// Correctly handling the end text operator inside a regex is tricky,
		// so in this case we fallback to the regex engine.
		return nil
	case syntax.OpPlus:
		// Only ".+" (with or without newline support) is handled.
		if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL {
			return nil
		}
		return &anyNonEmptyStringMatcher{
			matchNL: re.Sub[0].Op == syntax.OpAnyChar,
		}
	case syntax.OpStar:
		// Only ".*" (with or without newline support) is handled.
		if re.Sub[0].Op != syntax.OpAnyChar && re.Sub[0].Op != syntax.OpAnyCharNotNL {
			return nil
		}

		// If the newline is valid, than this matcher literally match any string (even empty).
		if re.Sub[0].Op == syntax.OpAnyChar {
			return trueMatcher{}
		}

		// Any string is fine (including an empty one), as far as it doesn't contain any newline.
		return anyStringWithoutNewlineMatcher{}
	case syntax.OpEmptyMatch:
		return emptyStringMatcher{}

	case syntax.OpLiteral:
		return &equalStringMatcher{
			s:             string(re.Rune),
			caseSensitive: !isCaseInsensitive(re),
		}
	case syntax.OpAlternate:
		// Every branch must be convertible, otherwise give up.
		or := make([]StringMatcher, 0, len(re.Sub))
		for _, sub := range re.Sub {
			m := stringMatcherFromRegexpInternal(sub)
			if m == nil {
				return nil
			}
			or = append(or, m)
		}
		return orStringMatcher(or)
	case syntax.OpConcat:
		clearCapture(re.Sub...)

		if len(re.Sub) == 0 {
			return emptyStringMatcher{}
		}
		if len(re.Sub) == 1 {
			return stringMatcherFromRegexpInternal(re.Sub[0])
		}

		var left, right StringMatcher

		// Let's try to find if there's a first and last any matchers.
		if re.Sub[0].Op == syntax.OpPlus || re.Sub[0].Op == syntax.OpStar {
			left = stringMatcherFromRegexpInternal(re.Sub[0])
			if left == nil {
				return nil
			}
			re.Sub = re.Sub[1:]
		}
		if re.Sub[len(re.Sub)-1].Op == syntax.OpPlus || re.Sub[len(re.Sub)-1].Op == syntax.OpStar {
			right = stringMatcherFromRegexpInternal(re.Sub[len(re.Sub)-1])
			if right == nil {
				return nil
			}
			re.Sub = re.Sub[:len(re.Sub)-1]
		}

		// The remaining middle part must reduce to a finite set of literals.
		matches, matchesCaseSensitive := findSetMatchesInternal(re, "")
		if len(matches) == 0 {
			return nil
		}

		if left == nil && right == nil {
			// if there's no any matchers on both side it's a concat of literals
			or := make([]StringMatcher, 0, len(matches))
			for _, match := range matches {
				or = append(or, &equalStringMatcher{
					s:             match,
					caseSensitive: matchesCaseSensitive,
				})
			}
			return orStringMatcher(or)
		}

		// We found literals in the middle. We can triggered the fast path only if
		// the matches are case sensitive because containsStringMatcher doesn't
		// support case insensitive.
		if matchesCaseSensitive {
			return &containsStringMatcher{
				substrings: matches,
				left:       left,
				right:      right,
			}
		}
	}
	return nil
}
// containsStringMatcher matches a string if it contains any of the substrings.
// If left and right are not nil, it's a contains operation where left and right must match.
// If left is nil, it's a hasPrefix operation and right must match.
// Finally if right is nil it's a hasSuffix operation and left must match.
type containsStringMatcher struct {
	// substrings is the list of candidate middle literals; matching any one suffices.
	substrings []string
	// left matches the text before the substring occurrence (nil for hasPrefix).
	left StringMatcher
	// right matches the text after the substring occurrence (nil for hasSuffix).
	right StringMatcher
}

// Matches reports whether s contains one of the substrings positioned such
// that the left and right matchers (when set) accept the surrounding text.
func (m *containsStringMatcher) Matches(s string) bool {
	for _, substr := range m.substrings {
		switch {
		case m.right != nil && m.left != nil:
			// Try every occurrence of substr, not just the first, because an
			// earlier occurrence may have non-matching surroundings.
			searchStartPos := 0

			for {
				pos := strings.Index(s[searchStartPos:], substr)
				if pos < 0 {
					break
				}

				// Since we started searching from searchStartPos, we have to add that offset
				// to get the actual position of the substring inside the text.
				pos += searchStartPos

				// If both the left and right matchers match, then we can stop searching because
				// we've found a match.
				if m.left.Matches(s[:pos]) && m.right.Matches(s[pos+len(substr):]) {
					return true
				}

				// Continue searching for another occurrence of the substring inside the text.
				searchStartPos = pos + 1
			}
		case m.left != nil:
			// If we have to check for characters on the left then we need to match a suffix.
			if strings.HasSuffix(s, substr) && m.left.Matches(s[:len(s)-len(substr)]) {
				return true
			}
		case m.right != nil:
			if strings.HasPrefix(s, substr) && m.right.Matches(s[len(substr):]) {
				return true
			}
		}
	}

	return false
}
// emptyStringMatcher matches an empty string.
type emptyStringMatcher struct{}
func (m emptyStringMatcher) Matches(s string) bool {
return len(s) == 0
}
// orStringMatcher matches a string when at least one of its sub-matchers does.
type orStringMatcher []StringMatcher

// Matches reports whether any sub-matcher accepts s.
func (m orStringMatcher) Matches(s string) bool {
	for _, sub := range m {
		if sub.Matches(s) {
			return true
		}
	}
	return false
}
// equalStringMatcher matches a string exactly, with optional case-insensitive
// comparison.
type equalStringMatcher struct {
	// s is the value to compare against.
	s string
	// caseSensitive selects between exact equality and Unicode case folding.
	caseSensitive bool
}

// Matches reports whether the input equals the stored value, honoring the
// configured case sensitivity.
func (m *equalStringMatcher) Matches(s string) bool {
	if !m.caseSensitive {
		return strings.EqualFold(m.s, s)
	}
	return m.s == s
}
// multiStringMatcherBuilder is a StringMatcher that can be built incrementally
// and can report the values it was built from.
type multiStringMatcherBuilder interface {
	StringMatcher
	// add registers another valid value.
	add(s string)
	// setMatches returns the registered values, or nil if there are too many.
	setMatches() []string
}

// newEqualMultiStringMatcher returns a matcher for a set of exact values,
// choosing a slice- or map-backed implementation based on the expected size.
func newEqualMultiStringMatcher(caseSensitive bool, estimatedSize int) multiStringMatcherBuilder {
	// If the estimated size is low enough, it's faster to use a slice instead of a map.
	if estimatedSize < minEqualMultiStringMatcherMapThreshold {
		return &equalMultiStringSliceMatcher{caseSensitive: caseSensitive, values: make([]string, 0, estimatedSize)}
	}

	return &equalMultiStringMapMatcher{
		values:        make(map[string]struct{}, estimatedSize),
		caseSensitive: caseSensitive,
	}
}
// equalMultiStringSliceMatcher matches a string exactly against a slice of
// valid values. Best suited for small sets, where a linear scan beats a map.
type equalMultiStringSliceMatcher struct {
	// values holds the accepted strings, in insertion order.
	values []string
	// caseSensitive selects between exact equality and Unicode case folding.
	caseSensitive bool
}

// add appends another accepted value.
func (m *equalMultiStringSliceMatcher) add(s string) {
	m.values = append(m.values, s)
}

// setMatches returns the accepted values as provided.
func (m *equalMultiStringSliceMatcher) setMatches() []string {
	return m.values
}

// Matches reports whether s equals any stored value, honoring case sensitivity.
func (m *equalMultiStringSliceMatcher) Matches(s string) bool {
	for _, v := range m.values {
		if v == s || (!m.caseSensitive && strings.EqualFold(s, v)) {
			return true
		}
	}
	return false
}
// equalMultiStringMapMatcher matches a string exactly against a map of valid
// values, giving O(1) lookups for large sets.
type equalMultiStringMapMatcher struct {
	// values contains values to match a string against. If the matching is case insensitive,
	// the values here must be lowercase.
	values map[string]struct{}

	caseSensitive bool
}

// add registers another accepted value, lowercasing it when matching is
// case insensitive so lookups can normalize the same way.
func (m *equalMultiStringMapMatcher) add(s string) {
	key := s
	if !m.caseSensitive {
		key = strings.ToLower(s)
	}
	m.values[key] = struct{}{}
}

// setMatches returns the accepted values, or nil once the set has reached
// maxSetMatches (too large to be useful as an explicit list).
func (m *equalMultiStringMapMatcher) setMatches() []string {
	if len(m.values) >= maxSetMatches {
		return nil
	}

	matches := make([]string, 0, len(m.values))
	for s := range m.values {
		matches = append(matches, s)
	}
	return matches
}

// Matches reports whether s (normalized if case insensitive) is in the set.
func (m *equalMultiStringMapMatcher) Matches(s string) bool {
	key := s
	if !m.caseSensitive {
		key = strings.ToLower(s)
	}

	_, ok := m.values[key]
	return ok
}
// anyStringWithoutNewlineMatcher is a stringMatcher which matches any string
// (including an empty one) as long as it doesn't contain a newline character.
type anyStringWithoutNewlineMatcher struct{}

// Matches reports whether s is free of '\n'. Newline is ASCII, so a byte
// search is sufficient even for multi-byte UTF-8 input.
func (m anyStringWithoutNewlineMatcher) Matches(s string) bool {
	return !strings.Contains(s, "\n")
}
// anyNonEmptyStringMatcher is a stringMatcher which matches any non-empty
// string, optionally rejecting strings that contain a newline.
type anyNonEmptyStringMatcher struct {
	// matchNL controls whether strings containing '\n' are accepted.
	matchNL bool
}

// Matches reports whether s is non-empty and, when matchNL is false, contains
// no newline character.
func (m *anyNonEmptyStringMatcher) Matches(s string) bool {
	if len(s) == 0 {
		return false
	}
	return m.matchNL || !strings.Contains(s, "\n")
}
// trueMatcher is a stringMatcher which matches any string (always returns true).
type trueMatcher struct{}
func (m trueMatcher) Matches(_ string) bool {
return true
}
// optimizeEqualStringMatchers optimize a specific case where all matchers are made by an
// alternation (orStringMatcher) of strings checked for equality (equalStringMatcher). In
// this specific case, when we have many strings to match against we can use a map instead
// of iterating over the list of strings.
// The input matcher is returned unchanged when the shape doesn't match, case
// sensitivity is mixed, or the number of values is below threshold.
func optimizeEqualStringMatchers(input StringMatcher, threshold int) StringMatcher {
	var (
		caseSensitive    bool
		caseSensitiveSet bool
		numValues        int
	)

	// Analyse the input StringMatcher to count the number of occurrences
	// and ensure all of them have the same case sensitivity.
	analyseCallback := func(matcher *equalStringMatcher) bool {
		// Ensure we don't have mixed case sensitivity.
		if caseSensitiveSet && caseSensitive != matcher.caseSensitive {
			return false
		} else if !caseSensitiveSet {
			caseSensitive = matcher.caseSensitive
			caseSensitiveSet = true
		}

		numValues++
		return true
	}

	if !findEqualStringMatchers(input, analyseCallback) {
		return input
	}

	// If the number of values found is less than the threshold, then we should skip the optimization.
	if numValues < threshold {
		return input
	}

	// Parse again the input StringMatcher to extract all values and storing them.
	// We can skip the case sensitivity check because we've already checked it and
	// if the code reach this point then it means all matchers have the same case sensitivity.
	multiMatcher := newEqualMultiStringMatcher(caseSensitive, numValues)

	// Ignore the return value because we already iterated over the input StringMatcher
	// and it was all good.
	findEqualStringMatchers(input, func(matcher *equalStringMatcher) bool {
		multiMatcher.add(matcher.s)
		return true
	})

	return multiMatcher
}
// findEqualStringMatchers analyze the input StringMatcher and calls the callback for each
// equalStringMatcher found. Returns true if and only if the input StringMatcher is *only*
// composed by an alternation of equalStringMatcher.
// The callback may return false to abort the walk; that also makes this
// function return false.
func findEqualStringMatchers(input StringMatcher, callback func(matcher *equalStringMatcher) bool) bool {
	orInput, ok := input.(orStringMatcher)
	if !ok {
		return false
	}

	for _, m := range orInput {
		switch casted := m.(type) {
		case orStringMatcher:
			// Nested alternations are walked recursively.
			if !findEqualStringMatchers(m, callback) {
				return false
			}

		case *equalStringMatcher:
			if !callback(casted) {
				return false
			}

		default:
			// It's not an equal string matcher, so we have to stop searching
			// cause this optimization can't be applied.
			return false
		}
	}

	return true
}

File diff suppressed because one or more lines are too long

34
model/labels/sharding.go Normal file
View file

@ -0,0 +1,34 @@
//go:build !stringlabels
package labels
import (
"github.com/cespare/xxhash/v2"
)
// StableHash is a labels hashing implementation which is guaranteed to not change over time.
// This function should be used whenever labels hashing backward compatibility must be guaranteed.
// Each name and value is written followed by a separator byte (seps, defined
// elsewhere in this package).
func StableHash(ls Labels) uint64 {
	// Use xxhash.Sum64(b) for fast path as it's faster.
	b := make([]byte, 0, 1024)
	for i, v := range ls {
		if len(b)+len(v.Name)+len(v.Value)+2 >= cap(b) {
			// If labels entry is 1KB+ do not allocate whole entry.
			// Switch to the streaming Write API for this and all remaining labels.
			h := xxhash.New()
			_, _ = h.Write(b)
			for _, v := range ls[i:] {
				_, _ = h.WriteString(v.Name)
				_, _ = h.Write(seps)
				_, _ = h.WriteString(v.Value)
				_, _ = h.Write(seps)
			}
			return h.Sum64()
		}

		b = append(b, v.Name...)
		b = append(b, seps[0])
		b = append(b, v.Value...)
		b = append(b, seps[0])
	}
	return xxhash.Sum64(b)
}

View file

@ -0,0 +1,41 @@
//go:build stringlabels
package labels
import (
"github.com/cespare/xxhash/v2"
)
// StableHash is a labels hashing implementation which is guaranteed to not change over time.
// This function should be used whenever labels hashing backward compatibility must be guaranteed.
// This is the stringlabels build variant: labels are decoded from the packed
// ls.data representation as they are hashed.
func StableHash(ls Labels) uint64 {
	// Use xxhash.Sum64(b) for fast path as it's faster.
	b := make([]byte, 0, 1024)
	var h *xxhash.Digest
	for i := 0; i < len(ls.data); {
		var v Label
		v.Name, i = decodeString(ls.data, i)
		v.Value, i = decodeString(ls.data, i)
		if h == nil && len(b)+len(v.Name)+len(v.Value)+2 >= cap(b) {
			// If labels entry is 1KB+, switch to Write API. Copy in the values up to this point.
			h = xxhash.New()
			_, _ = h.Write(b)
		}
		if h != nil {
			// Streaming mode: write this label directly into the digest.
			_, _ = h.WriteString(v.Name)
			_, _ = h.Write(seps)
			_, _ = h.WriteString(v.Value)
			_, _ = h.Write(seps)
			continue
		}

		b = append(b, v.Name...)
		b = append(b, seps[0])
		b = append(b, v.Value...)
		b = append(b, seps[0])
	}
	if h != nil {
		return h.Sum64()
	}
	return xxhash.Sum64(b)
}

View file

@ -0,0 +1,19 @@
package labels
import (
"testing"
"github.com/stretchr/testify/require"
)
// TestStableHash tests that StableHash is stable.
// The hashes this test asserts should not be changed: they pin the on-disk /
// cross-version compatibility contract of StableHash.
func TestStableHash(t *testing.T) {
	for expectedHash, lbls := range map[uint64]Labels{
		0xef46db3751d8e999: EmptyLabels(),
		0x347c8ee7a9e29708: FromStrings("hello", "world"),
		0xcbab40540f26097d: FromStrings(MetricName, "metric", "label", "value"),
	} {
		require.Equal(t, expectedHash, StableHash(lbls))
	}
}

View file

@ -135,10 +135,13 @@ func (g *RuleGroups) Validate(node ruleGroups) (errs []error) {
// RuleGroup is a list of sequentially evaluated recording and alerting rules.
type RuleGroup struct {
Name string `yaml:"name"`
Interval model.Duration `yaml:"interval,omitempty"`
Limit int `yaml:"limit,omitempty"`
Rules []RuleNode `yaml:"rules"`
Name string `yaml:"name"`
Interval model.Duration `yaml:"interval,omitempty"`
EvaluationDelay *model.Duration `yaml:"evaluation_delay,omitempty"`
Limit int `yaml:"limit,omitempty"`
Rules []RuleNode `yaml:"rules"`
SourceTenants []string `yaml:"source_tenants,omitempty"`
AlignEvaluationTimeOnInterval bool `yaml:"align_evaluation_time_on_interval,omitempty"`
}
// Rule describes an alerting or recording rule.

View file

@ -36,6 +36,7 @@ groups:
- name: my-another-name
interval: 30s # defaults to global interval
source_tenants: [tenant-1]
rules:
- alert: HighErrors
expr: |

View file

@ -186,6 +186,10 @@ func rangeQueryCases() []benchCase {
expr: "count({__name__!=\"\",l=\"\"})",
steps: 1,
},
// timestamp() function
{
expr: "timestamp(a_X)",
},
}
// X in an expr will be replaced by different metric sizes.

View file

@ -476,6 +476,7 @@ func (ng *Engine) NewRangeQuery(ctx context.Context, q storage.Queryable, opts Q
}
func (ng *Engine) newQuery(q storage.Queryable, qs string, opts QueryOpts, start, end time.Time, interval time.Duration) (*parser.Expr, *query) {
// Default to empty QueryOpts if not provided.
if opts == nil {
opts = NewPrometheusQueryOpts(false, 0)
}
@ -1387,15 +1388,7 @@ func (ev *evaluator) eval(expr parser.Expr) (parser.Value, storage.Warnings) {
unwrapParenExpr(&arg)
vs, ok := arg.(*parser.VectorSelector)
if ok {
return ev.rangeEval(nil, func(v []parser.Value, _ [][]EvalSeriesHelper, enh *EvalNodeHelper) (Vector, storage.Warnings) {
if vs.Timestamp != nil {
// This is a special case only for "timestamp" since the offset
// needs to be adjusted for every point.
vs.Offset = time.Duration(enh.Ts-*vs.Timestamp) * time.Millisecond
}
val, ws := ev.vectorSelector(vs, enh.Ts)
return call([]parser.Value{val}, e.Args, enh), ws
})
return ev.evalTimestampFunctionOverVectorSelector(vs, call, e)
}
}
@ -1833,38 +1826,47 @@ func (ev *evaluator) eval(expr parser.Expr) (parser.Value, storage.Warnings) {
panic(fmt.Errorf("unhandled expression of type: %T", expr))
}
// vectorSelector evaluates a *parser.VectorSelector expression.
func (ev *evaluator) vectorSelector(node *parser.VectorSelector, ts int64) (Vector, storage.Warnings) {
ws, err := checkAndExpandSeriesSet(ev.ctx, node)
func (ev *evaluator) evalTimestampFunctionOverVectorSelector(vs *parser.VectorSelector, call FunctionCall, e *parser.Call) (parser.Value, storage.Warnings) {
ws, err := checkAndExpandSeriesSet(ev.ctx, vs)
if err != nil {
ev.error(errWithWarnings{fmt.Errorf("expanding series: %w", err), ws})
}
vec := make(Vector, 0, len(node.Series))
it := storage.NewMemoizedEmptyIterator(durationMilliseconds(ev.lookbackDelta))
var chkIter chunkenc.Iterator
for i, s := range node.Series {
chkIter = s.Iterator(chkIter)
it.Reset(chkIter)
t, f, h, ok := ev.vectorSelectorSingle(it, node, ts)
if ok {
vec = append(vec, Sample{
Metric: node.Series[i].Labels(),
T: t,
F: f,
H: h,
})
seriesIterators := make([]*storage.MemoizedSeriesIterator, len(vs.Series))
for i, s := range vs.Series {
it := s.Iterator(nil)
seriesIterators[i] = storage.NewMemoizedIterator(it, durationMilliseconds(ev.lookbackDelta))
}
ev.currentSamples++
ev.samplesStats.IncrementSamplesAtTimestamp(ts, 1)
if ev.currentSamples > ev.maxSamples {
ev.error(ErrTooManySamples(env))
}
return ev.rangeEval(nil, func(v []parser.Value, _ [][]EvalSeriesHelper, enh *EvalNodeHelper) (Vector, storage.Warnings) {
if vs.Timestamp != nil {
// This is a special case only for "timestamp" since the offset
// needs to be adjusted for every point.
vs.Offset = time.Duration(enh.Ts-*vs.Timestamp) * time.Millisecond
}
}
ev.samplesStats.UpdatePeak(ev.currentSamples)
return vec, ws
vec := make(Vector, 0, len(vs.Series))
for i, s := range vs.Series {
it := seriesIterators[i]
t, f, h, ok := ev.vectorSelectorSingle(it, vs, enh.Ts)
if ok {
vec = append(vec, Sample{
Metric: s.Labels(),
T: t,
F: f,
H: h,
})
ev.currentSamples++
ev.samplesStats.IncrementSamplesAtTimestamp(enh.Ts, 1)
if ev.currentSamples > ev.maxSamples {
ev.error(ErrTooManySamples(env))
}
}
}
ev.samplesStats.UpdatePeak(ev.currentSamples)
return call([]parser.Value{vec}, e.Args, enh), ws
})
}
// vectorSelectorSingle evaluates an instant vector for the iterator of one time series.

View file

@ -26,9 +26,9 @@ import (
"github.com/go-kit/log"
"github.com/prometheus/prometheus/tsdb/tsdbutil"
"github.com/prometheus/prometheus/util/testutil"
"github.com/stretchr/testify/require"
"go.uber.org/goleak"
"github.com/prometheus/prometheus/model/histogram"
"github.com/prometheus/prometheus/model/labels"
@ -39,7 +39,7 @@ import (
)
func TestMain(m *testing.M) {
goleak.VerifyTestMain(m)
testutil.TolerantVerifyLeak(m)
}
func TestQueryConcurrency(t *testing.T) {
@ -1977,6 +1977,100 @@ func TestSubquerySelector(t *testing.T) {
}
}
func TestTimestampFunction_StepsMoreOftenThanSamples(t *testing.T) {
test, err := NewTest(t, `
load 1m
metric 0+1x1000
`)
require.NoError(t, err)
defer test.Close()
err = test.Run()
require.NoError(t, err)
query := "timestamp(metric)"
start := time.Unix(0, 0)
end := time.Unix(61, 0)
interval := time.Second
expectedResult := Matrix{
Series{
Floats: []FPoint{
{F: 0, T: 0},
{F: 0, T: 1_000},
{F: 0, T: 2_000},
{F: 0, T: 3_000},
{F: 0, T: 4_000},
{F: 0, T: 5_000},
{F: 0, T: 6_000},
{F: 0, T: 7_000},
{F: 0, T: 8_000},
{F: 0, T: 9_000},
{F: 0, T: 10_000},
{F: 0, T: 11_000},
{F: 0, T: 12_000},
{F: 0, T: 13_000},
{F: 0, T: 14_000},
{F: 0, T: 15_000},
{F: 0, T: 16_000},
{F: 0, T: 17_000},
{F: 0, T: 18_000},
{F: 0, T: 19_000},
{F: 0, T: 20_000},
{F: 0, T: 21_000},
{F: 0, T: 22_000},
{F: 0, T: 23_000},
{F: 0, T: 24_000},
{F: 0, T: 25_000},
{F: 0, T: 26_000},
{F: 0, T: 27_000},
{F: 0, T: 28_000},
{F: 0, T: 29_000},
{F: 0, T: 30_000},
{F: 0, T: 31_000},
{F: 0, T: 32_000},
{F: 0, T: 33_000},
{F: 0, T: 34_000},
{F: 0, T: 35_000},
{F: 0, T: 36_000},
{F: 0, T: 37_000},
{F: 0, T: 38_000},
{F: 0, T: 39_000},
{F: 0, T: 40_000},
{F: 0, T: 41_000},
{F: 0, T: 42_000},
{F: 0, T: 43_000},
{F: 0, T: 44_000},
{F: 0, T: 45_000},
{F: 0, T: 46_000},
{F: 0, T: 47_000},
{F: 0, T: 48_000},
{F: 0, T: 49_000},
{F: 0, T: 50_000},
{F: 0, T: 51_000},
{F: 0, T: 52_000},
{F: 0, T: 53_000},
{F: 0, T: 54_000},
{F: 0, T: 55_000},
{F: 0, T: 56_000},
{F: 0, T: 57_000},
{F: 0, T: 58_000},
{F: 0, T: 59_000},
{F: 60, T: 60_000},
{F: 60, T: 61_000},
},
Metric: labels.EmptyLabels(),
},
}
qry, err := test.QueryEngine().NewRangeQuery(test.context, test.Queryable(), nil, query, start, end, interval)
require.NoError(t, err)
res := qry.Exec(test.Context())
require.NoError(t, res.Err)
require.Equal(t, expectedResult, res.Value)
}
type FakeQueryLogger struct {
closed bool
logs []interface{}
@ -3109,6 +3203,26 @@ func TestRangeQuery(t *testing.T) {
End: time.Unix(120, 0),
Interval: 1 * time.Minute,
},
{
Name: "short-circuit",
Load: `load 30s
foo{job="1"} 1+1x4
bar{job="2"} 1+1x4`,
Query: `foo > 2 or bar`,
Result: Matrix{
Series{
Floats: []FPoint{{F: 1, T: 0}, {F: 3, T: 60000}, {F: 5, T: 120000}},
Metric: labels.FromStrings("__name__", "bar", "job", "2"),
},
Series{
Floats: []FPoint{{F: 3, T: 60000}, {F: 5, T: 120000}},
Metric: labels.FromStrings("__name__", "foo", "job", "1"),
},
},
Start: time.Unix(0, 0),
End: time.Unix(120, 0),
Interval: 1 * time.Minute,
},
}
for _, c := range cases {
t.Run(c.Name, func(t *testing.T) {

View file

@ -3565,7 +3565,32 @@ func TestParseExpressions(t *testing.T) {
if !test.fail {
require.NoError(t, err)
require.Equal(t, test.expected, expr, "error on input '%s'", test.input)
expected := test.expected
// The FastRegexMatcher introduced in mimir-prometheus is not comparable with
// a deep equal, so only compare its String() version.
if actualVector, ok := expr.(*VectorSelector); ok {
require.IsType(t, &VectorSelector{}, test.expected, "error on input '%s'", test.input)
expectedVector := test.expected.(*VectorSelector)
require.Len(t, actualVector.LabelMatchers, len(expectedVector.LabelMatchers), "error on input '%s'", test.input)
for i := 0; i < len(actualVector.LabelMatchers); i++ {
expectedMatcher := expectedVector.LabelMatchers[i].String()
actualMatcher := actualVector.LabelMatchers[i].String()
require.Equal(t, expectedMatcher, actualMatcher, "unexpected label matcher '%s' on input '%s'", actualMatcher, test.input)
}
// Make a shallow copy of the expected expr (because the test cases are defined in a global variable)
// and then reset the LabelMatcher to not compared them with the following deep equal.
expectedCopy := *expectedVector
expectedCopy.LabelMatchers = nil
expected = &expectedCopy
actualVector.LabelMatchers = nil
}
require.Equal(t, expected, expr, "error on input '%s'", test.input)
} else {
require.Error(t, err)
require.Contains(t, err.Error(), test.errMsg, "unexpected error on input '%s', expected '%s', got '%s'", test.input, test.errMsg, err.Error())

View file

@ -323,10 +323,10 @@ const resolvedRetention = 15 * time.Minute
// Eval evaluates the rule expression and then creates pending alerts and fires
// or removes previously pending alerts accordingly.
func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, externalURL *url.URL, limit int) (promql.Vector, error) {
func (r *AlertingRule) Eval(ctx context.Context, evalDelay time.Duration, ts time.Time, query QueryFunc, externalURL *url.URL, limit int) (promql.Vector, error) {
ctx = NewOriginContext(ctx, NewRuleDetail(r))
res, err := query(ctx, r.vector.String(), ts)
res, err := query(ctx, r.vector.String(), ts.Add(-evalDelay))
if err != nil {
return nil, err
}
@ -458,8 +458,8 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc,
}
if r.restored.Load() {
vec = append(vec, r.sample(a, ts))
vec = append(vec, r.forStateSample(a, ts, float64(a.ActiveAt.Unix())))
vec = append(vec, r.sample(a, ts.Add(-evalDelay)))
vec = append(vec, r.forStateSample(a, ts.Add(-evalDelay), float64(a.ActiveAt.Unix())))
}
}
@ -535,7 +535,7 @@ func (r *AlertingRule) sendAlerts(ctx context.Context, ts time.Time, resendDelay
if interval > resendDelay {
delta = interval
}
alert.ValidUntil = ts.Add(4 * delta)
alert.ValidUntil = ts.Add(5 * delta)
anew := *alert
// The notifier re-uses the labels slice, hence make a copy.
anew.Labels = alert.Labels.Copy()

View file

@ -158,7 +158,8 @@ func TestAlertingRuleLabelsUpdate(t *testing.T) {
t.Logf("case %d", i)
evalTime := baseTime.Add(time.Duration(i) * time.Minute)
result[0].T = timestamp.FromTime(evalTime)
res, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
res, err := rule.Eval(suite.Context(), 0, evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
require.NoError(t, err)
var filteredRes promql.Vector // After removing 'ALERTS_FOR_STATE' samples.
@ -175,7 +176,7 @@ func TestAlertingRuleLabelsUpdate(t *testing.T) {
require.Equal(t, result, filteredRes)
}
evalTime := baseTime.Add(time.Duration(len(results)) * time.Minute)
res, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
res, err := rule.Eval(suite.Context(), 0, evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
require.NoError(t, err)
require.Equal(t, 0, len(res))
}
@ -246,7 +247,7 @@ func TestAlertingRuleExternalLabelsInTemplate(t *testing.T) {
var filteredRes promql.Vector // After removing 'ALERTS_FOR_STATE' samples.
res, err := ruleWithoutExternalLabels.Eval(
suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0,
suite.Context(), 0, evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0,
)
require.NoError(t, err)
for _, smpl := range res {
@ -260,7 +261,7 @@ func TestAlertingRuleExternalLabelsInTemplate(t *testing.T) {
}
res, err = ruleWithExternalLabels.Eval(
suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0,
suite.Context(), 0, evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0,
)
require.NoError(t, err)
for _, smpl := range res {
@ -342,7 +343,7 @@ func TestAlertingRuleExternalURLInTemplate(t *testing.T) {
var filteredRes promql.Vector // After removing 'ALERTS_FOR_STATE' samples.
res, err := ruleWithoutExternalURL.Eval(
suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0,
suite.Context(), 0, evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0,
)
require.NoError(t, err)
for _, smpl := range res {
@ -356,7 +357,7 @@ func TestAlertingRuleExternalURLInTemplate(t *testing.T) {
}
res, err = ruleWithExternalURL.Eval(
suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0,
suite.Context(), 0, evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0,
)
require.NoError(t, err)
for _, smpl := range res {
@ -414,7 +415,7 @@ func TestAlertingRuleEmptyLabelFromTemplate(t *testing.T) {
var filteredRes promql.Vector // After removing 'ALERTS_FOR_STATE' samples.
res, err := rule.Eval(
suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0,
suite.Context(), 0, evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0,
)
require.NoError(t, err)
for _, smpl := range res {
@ -484,7 +485,7 @@ instance: {{ $v.Labels.instance }}, value: {{ printf "%.0f" $v.Value }};
close(getDoneCh)
}()
_, err = ruleWithQueryInTemplate.Eval(
suite.Context(), evalTime, slowQueryFunc, nil, 0,
suite.Context(), 0, evalTime, slowQueryFunc, nil, 0,
)
require.NoError(t, err)
}
@ -536,7 +537,7 @@ func TestAlertingRuleDuplicate(t *testing.T) {
"",
true, log.NewNopLogger(),
)
_, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, 0)
_, err := rule.Eval(ctx, 0, now, EngineQueryFunc(engine, storage), nil, 0)
require.Error(t, err)
require.EqualError(t, err, "vector contains metrics with the same labelset after applying alert labels")
}
@ -587,7 +588,7 @@ func TestAlertingRuleLimit(t *testing.T) {
evalTime := time.Unix(0, 0)
for _, test := range tests {
switch _, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, test.limit); {
switch _, err := rule.Eval(suite.Context(), 0, evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, test.limit); {
case err != nil:
require.EqualError(t, err, test.err)
case test.err != "":
@ -819,7 +820,7 @@ func TestKeepFiringFor(t *testing.T) {
t.Logf("case %d", i)
evalTime := baseTime.Add(time.Duration(i) * time.Minute)
result[0].T = timestamp.FromTime(evalTime)
res, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
res, err := rule.Eval(suite.Context(), 0, evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
require.NoError(t, err)
var filteredRes promql.Vector // After removing 'ALERTS_FOR_STATE' samples.
@ -836,7 +837,7 @@ func TestKeepFiringFor(t *testing.T) {
require.Equal(t, result, filteredRes)
}
evalTime := baseTime.Add(time.Duration(len(results)) * time.Minute)
res, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
res, err := rule.Eval(suite.Context(), 0, evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
require.NoError(t, err)
require.Equal(t, 0, len(res))
}
@ -876,7 +877,7 @@ func TestPendingAndKeepFiringFor(t *testing.T) {
baseTime := time.Unix(0, 0)
result.T = timestamp.FromTime(baseTime)
res, err := rule.Eval(suite.Context(), baseTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
res, err := rule.Eval(suite.Context(), 0, baseTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
require.NoError(t, err)
require.Len(t, res, 2)
@ -891,7 +892,7 @@ func TestPendingAndKeepFiringFor(t *testing.T) {
}
evalTime := baseTime.Add(time.Minute)
res, err = rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
res, err = rule.Eval(suite.Context(), 0, evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
require.NoError(t, err)
require.Equal(t, 0, len(res))
}
@ -925,7 +926,7 @@ func TestAlertingEvalWithOrigin(t *testing.T) {
true, log.NewNopLogger(),
)
_, err = rule.Eval(ctx, now, func(ctx context.Context, qs string, _ time.Time) (promql.Vector, error) {
_, err = rule.Eval(ctx, 0, now, func(ctx context.Context, qs string, _ time.Time) (promql.Vector, error) {
detail = FromOriginContext(ctx)
return nil, nil
}, nil, 0)

View file

@ -0,0 +1,5 @@
groups:
- name: test
rules:
- alert: test
expr: sum by (job)(rate(http_requests_total[5m]))

View file

@ -0,0 +1,5 @@
groups:
- name: test
rules:
- alert: test_2
expr: sum by (job)(rate(http_requests_total[5m]))

View file

@ -0,0 +1,27 @@
groups:
- name: aligned
align_evaluation_time_on_interval: true
interval: 5m
rules:
- record: job:http_requests:rate5m
expr: sum by (job)(rate(http_requests_total[5m]))
- name: aligned_with_crazy_interval
align_evaluation_time_on_interval: true
interval: 1m27s
rules:
- record: job:http_requests:rate5m
expr: sum by (job)(rate(http_requests_total[5m]))
- name: unaligned_default
interval: 5m
rules:
- record: job:http_requests:rate5m
expr: sum by (job)(rate(http_requests_total[5m]))
- name: unaligned_explicit
interval: 5m
align_evaluation_time_on_interval: false
rules:
- record: job:http_requests:rate5m
expr: sum by (job)(rate(http_requests_total[5m]))

View file

@ -0,0 +1,6 @@
groups:
- name: test
rules:
- record: job:http_requests:rate5m
expr: sum by (job)(rate(http_requests_total[5m]))
source_tenants: [tenant-1, tenant-2]

View file

@ -19,6 +19,7 @@ import (
"fmt"
"math"
"net/url"
"sort"
"sync"
"time"
@ -218,8 +219,9 @@ type Rule interface {
Name() string
// Labels of the rule.
Labels() labels.Labels
// eval evaluates the rule, including any associated recording or alerting actions.
Eval(context.Context, time.Time, QueryFunc, *url.URL, int) (promql.Vector, error)
// Eval evaluates the rule, including any associated recording or alerting actions.
// The duration passed is the evaluation delay.
Eval(context.Context, time.Duration, time.Time, QueryFunc, *url.URL, int) (promql.Vector, error)
// String returns a human-readable string representation of the rule.
String() string
// Query returns the rule query expression.
@ -247,8 +249,10 @@ type Group struct {
name string
file string
interval time.Duration
evaluationDelay *time.Duration
limit int
rules []Rule
sourceTenants []string
seriesInPreviousEval []map[string]labels.Labels // One per Rule.
staleSeries []labels.Labels
opts *ManagerOptions
@ -271,6 +275,8 @@ type Group struct {
// Rule group evaluation iteration function,
// defaults to DefaultEvalIterationFunc.
evalIterationFunc GroupEvalIterationFunc
alignEvaluationTimeOnInterval bool
}
// GroupEvalIterationFunc is used to implement and extend rule group
@ -281,14 +287,17 @@ type Group struct {
type GroupEvalIterationFunc func(ctx context.Context, g *Group, evalTimestamp time.Time)
type GroupOptions struct {
Name, File string
Interval time.Duration
Limit int
Rules []Rule
ShouldRestore bool
Opts *ManagerOptions
done chan struct{}
EvalIterationFunc GroupEvalIterationFunc
Name, File string
Interval time.Duration
Limit int
Rules []Rule
SourceTenants []string
ShouldRestore bool
Opts *ManagerOptions
EvaluationDelay *time.Duration
done chan struct{}
EvalIterationFunc GroupEvalIterationFunc
AlignEvaluationTimeOnInterval bool
}
// NewGroup makes a new Group with the given name, options, and rules.
@ -315,20 +324,23 @@ func NewGroup(o GroupOptions) *Group {
}
return &Group{
name: o.Name,
file: o.File,
interval: o.Interval,
limit: o.Limit,
rules: o.Rules,
shouldRestore: o.ShouldRestore,
opts: o.Opts,
seriesInPreviousEval: make([]map[string]labels.Labels, len(o.Rules)),
done: make(chan struct{}),
managerDone: o.done,
terminated: make(chan struct{}),
logger: log.With(o.Opts.Logger, "file", o.File, "group", o.Name),
metrics: metrics,
evalIterationFunc: evalIterationFunc,
name: o.Name,
file: o.File,
interval: o.Interval,
evaluationDelay: o.EvaluationDelay,
limit: o.Limit,
rules: o.Rules,
shouldRestore: o.ShouldRestore,
opts: o.Opts,
sourceTenants: o.SourceTenants,
seriesInPreviousEval: make([]map[string]labels.Labels, len(o.Rules)),
done: make(chan struct{}),
managerDone: o.done,
terminated: make(chan struct{}),
logger: log.With(o.Opts.Logger, "file", o.File, "group", o.Name),
metrics: metrics,
evalIterationFunc: evalIterationFunc,
alignEvaluationTimeOnInterval: o.AlignEvaluationTimeOnInterval,
}
}
@ -353,6 +365,10 @@ func (g *Group) Interval() time.Duration { return g.interval }
// Limit returns the group's limit.
func (g *Group) Limit() int { return g.limit }
// SourceTenants returns the source tenants for the group.
// If it's empty or nil, then the owning user/tenant is considered to be the source tenant.
func (g *Group) SourceTenants() []string { return g.sourceTenants }
func (g *Group) Logger() log.Logger { return g.logger }
func (g *Group) run(ctx context.Context) {
@ -558,9 +574,11 @@ func (g *Group) setLastEvalTimestamp(ts time.Time) {
// EvalTimestamp returns the immediately preceding consistently slotted evaluation time.
func (g *Group) EvalTimestamp(startTime int64) time.Time {
var (
var offset int64
if !g.alignEvaluationTimeOnInterval {
offset = int64(g.hash() % uint64(g.interval))
}
var (
// This group's evaluation times differ from the perfect time intervals by `offset` nanoseconds.
// But we can only use `% interval` to align with the interval. And `% interval` will always
// align with the perfect time intervals, instead of this group's. Because of this we add
@ -642,6 +660,7 @@ func (g *Group) CopyState(from *Group) {
// Eval runs a single evaluation cycle in which all rules are evaluated sequentially.
func (g *Group) Eval(ctx context.Context, ts time.Time) {
var samplesTotal float64
evaluationDelay := g.EvaluationDelay()
for i, rule := range g.rules {
select {
case <-g.done:
@ -663,7 +682,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
g.metrics.EvalTotal.WithLabelValues(GroupKey(g.File(), g.Name())).Inc()
vector, err := rule.Eval(ctx, ts, g.opts.QueryFunc, g.opts.ExternalURL, g.Limit())
vector, err := rule.Eval(ctx, evaluationDelay, ts, g.opts.QueryFunc, g.opts.ExternalURL, g.Limit())
if err != nil {
rule.SetHealth(HealthBad)
rule.SetLastError(err)
@ -752,7 +771,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
for metric, lset := range g.seriesInPreviousEval[i] {
if _, ok := seriesReturned[metric]; !ok {
// Series no longer exposed, mark it stale.
_, err = app.Append(0, lset, timestamp.FromTime(ts), math.Float64frombits(value.StaleNaN))
_, err = app.Append(0, lset, timestamp.FromTime(ts.Add(-evaluationDelay)), math.Float64frombits(value.StaleNaN))
unwrappedErr := errors.Unwrap(err)
if unwrappedErr == nil {
unwrappedErr = err
@ -777,14 +796,25 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
g.cleanupStaleSeries(ctx, ts)
}
func (g *Group) EvaluationDelay() time.Duration {
if g.evaluationDelay != nil {
return *g.evaluationDelay
}
if g.opts.DefaultEvaluationDelay != nil {
return g.opts.DefaultEvaluationDelay()
}
return time.Duration(0)
}
func (g *Group) cleanupStaleSeries(ctx context.Context, ts time.Time) {
if len(g.staleSeries) == 0 {
return
}
app := g.opts.Appendable.Appender(ctx)
evaluationDelay := g.EvaluationDelay()
for _, s := range g.staleSeries {
// Rule that produced series no longer configured, mark it stale.
_, err := app.Append(0, s, timestamp.FromTime(ts), math.Float64frombits(value.StaleNaN))
_, err := app.Append(0, s, timestamp.FromTime(ts.Add(-evaluationDelay)), math.Float64frombits(value.StaleNaN))
unwrappedErr := errors.Unwrap(err)
if unwrappedErr == nil {
unwrappedErr = err
@ -940,11 +970,37 @@ func (g *Group) Equals(ng *Group) bool {
return false
}
if g.alignEvaluationTimeOnInterval != ng.alignEvaluationTimeOnInterval {
return false
}
for i, gr := range g.rules {
if gr.String() != ng.rules[i].String() {
return false
}
}
{
// compare source tenants
if len(g.sourceTenants) != len(ng.sourceTenants) {
return false
}
copyAndSort := func(x []string) []string {
copied := make([]string, len(x))
copy(copied, x)
sort.Strings(copied)
return copied
}
ngSourceTenantsCopy := copyAndSort(ng.sourceTenants)
gSourceTenantsCopy := copyAndSort(g.sourceTenants)
for i := range ngSourceTenantsCopy {
if gSourceTenantsCopy[i] != ngSourceTenantsCopy[i] {
return false
}
}
}
return true
}
@ -964,20 +1020,30 @@ type Manager struct {
// NotifyFunc sends notifications about a set of alerts generated by the given expression.
type NotifyFunc func(ctx context.Context, expr string, alerts ...*Alert)
type ContextWrapFunc func(ctx context.Context, g *Group) context.Context
// ManagerOptions bundles options for the Manager.
type ManagerOptions struct {
ExternalURL *url.URL
QueryFunc QueryFunc
NotifyFunc NotifyFunc
Context context.Context
Appendable storage.Appendable
Queryable storage.Queryable
Logger log.Logger
Registerer prometheus.Registerer
OutageTolerance time.Duration
ForGracePeriod time.Duration
ResendDelay time.Duration
GroupLoader GroupLoader
ExternalURL *url.URL
QueryFunc QueryFunc
NotifyFunc NotifyFunc
Context context.Context
// GroupEvaluationContextFunc will be called to wrap Context based on the group being evaluated.
// Will be skipped if nil.
GroupEvaluationContextFunc ContextWrapFunc
Appendable storage.Appendable
Queryable storage.Queryable
Logger log.Logger
Registerer prometheus.Registerer
OutageTolerance time.Duration
ForGracePeriod time.Duration
ResendDelay time.Duration
GroupLoader GroupLoader
DefaultEvaluationDelay func() time.Duration
// AlwaysRestoreAlertState forces all new or changed groups in calls to Update to restore.
// Useful when you know you will be adding alerting rules after the manager has already started.
AlwaysRestoreAlertState bool
Metrics *Metrics
}
@ -1071,11 +1137,16 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels
newg.CopyState(oldg)
}
wg.Done()
ctx := m.opts.Context
if m.opts.GroupEvaluationContextFunc != nil {
ctx = m.opts.GroupEvaluationContextFunc(ctx, newg)
}
// Wait with starting evaluation until the rule manager
// is told to run. This is necessary to avoid running
// queries against a bootstrapping storage.
<-m.block
newg.run(m.opts.Context)
newg.run(ctx)
}(newg)
}
@ -1128,7 +1199,7 @@ func (m *Manager) LoadGroups(
) (map[string]*Group, []error) {
groups := make(map[string]*Group)
shouldRestore := !m.restored
shouldRestore := !m.restored || m.opts.AlwaysRestoreAlertState
for _, fn := range filenames {
rgs, errs := m.opts.GroupLoader.Load(fn)
@ -1159,7 +1230,7 @@ func (m *Manager) LoadGroups(
labels.FromMap(r.Annotations),
externalLabels,
externalURL,
m.restored,
!shouldRestore,
log.With(m.logger, "alert", r.Alert),
))
continue
@ -1172,15 +1243,18 @@ func (m *Manager) LoadGroups(
}
groups[GroupKey(fn, rg.Name)] = NewGroup(GroupOptions{
Name: rg.Name,
File: fn,
Interval: itv,
Limit: rg.Limit,
Rules: rules,
ShouldRestore: shouldRestore,
Opts: m.opts,
done: m.done,
EvalIterationFunc: groupEvalIterationFunc,
Name: rg.Name,
File: fn,
Interval: itv,
Limit: rg.Limit,
Rules: rules,
SourceTenants: rg.SourceTenants,
ShouldRestore: shouldRestore,
Opts: m.opts,
EvaluationDelay: (*time.Duration)(rg.EvaluationDelay),
done: m.done,
EvalIterationFunc: groupEvalIterationFunc,
AlignEvaluationTimeOnInterval: rg.AlignEvaluationTimeOnInterval,
})
}
}

File diff suppressed because it is too large Load diff

View file

@ -30,7 +30,7 @@ type unknownRule struct{}
func (u unknownRule) Name() string { return "" }
func (u unknownRule) Labels() labels.Labels { return labels.EmptyLabels() }
func (u unknownRule) Eval(context.Context, time.Time, QueryFunc, *url.URL, int) (promql.Vector, error) {
func (u unknownRule) Eval(context.Context, time.Duration, time.Time, QueryFunc, *url.URL, int) (promql.Vector, error) {
return nil, nil
}
func (u unknownRule) String() string { return "" }

View file

@ -72,10 +72,10 @@ func (rule *RecordingRule) Labels() labels.Labels {
}
// Eval evaluates the rule and then overrides the metric names and labels accordingly.
func (rule *RecordingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, _ *url.URL, limit int) (promql.Vector, error) {
func (rule *RecordingRule) Eval(ctx context.Context, evalDelay time.Duration, ts time.Time, query QueryFunc, _ *url.URL, limit int) (promql.Vector, error) {
ctx = NewOriginContext(ctx, NewRuleDetail(rule))
vector, err := query(ctx, rule.vector.String(), ts)
vector, err := query(ctx, rule.vector.String(), ts.Add(-evalDelay))
if err != nil {
return nil, err
}

View file

@ -129,7 +129,7 @@ func TestRuleEval(t *testing.T) {
for _, scenario := range ruleEvalTestScenarios {
t.Run(scenario.name, func(t *testing.T) {
rule := NewRecordingRule("test_rule", scenario.expr, scenario.ruleLabels)
result, err := rule.Eval(suite.Context(), ruleEvaluationTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
result, err := rule.Eval(suite.Context(), 0, ruleEvaluationTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
require.NoError(t, err)
require.Equal(t, scenario.expected, result)
})
@ -149,7 +149,7 @@ func BenchmarkRuleEval(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, err := rule.Eval(suite.Context(), ruleEvaluationTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
_, err := rule.Eval(suite.Context(), 0, ruleEvaluationTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, 0)
if err != nil {
require.NoError(b, err)
}
@ -178,7 +178,7 @@ func TestRuleEvalDuplicate(t *testing.T) {
expr, _ := parser.ParseExpr(`vector(0) or label_replace(vector(0),"test","x","","")`)
rule := NewRecordingRule("foo", expr, labels.FromStrings("test", "test"))
_, err := rule.Eval(ctx, now, EngineQueryFunc(engine, storage), nil, 0)
_, err := rule.Eval(ctx, 0, now, EngineQueryFunc(engine, storage), nil, 0)
require.Error(t, err)
require.EqualError(t, err, "vector contains metrics with the same labelset after applying rule labels")
}
@ -223,7 +223,7 @@ func TestRecordingRuleLimit(t *testing.T) {
evalTime := time.Unix(0, 0)
for _, test := range tests {
switch _, err := rule.Eval(suite.Context(), evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, test.limit); {
switch _, err := rule.Eval(suite.Context(), 0, evalTime, EngineQueryFunc(suite.QueryEngine(), suite.Storage()), nil, test.limit); {
case err != nil:
require.EqualError(t, err, test.err)
case test.err != "":
@ -251,7 +251,7 @@ func TestRecordingEvalWithOrigin(t *testing.T) {
require.NoError(t, err)
rule := NewRecordingRule(name, expr, lbs)
_, err = rule.Eval(ctx, now, func(ctx context.Context, qs string, _ time.Time) (promql.Vector, error) {
_, err = rule.Eval(ctx, 0, now, func(ctx context.Context, qs string, _ time.Time) (promql.Vector, error) {
detail = FromOriginContext(ctx)
return nil, nil
}, nil, 0)

View file

@ -193,6 +193,9 @@ type SelectHints struct {
By bool // Indicate whether it is without or by.
Range int64 // Range vector selector range in milliseconds.
ShardIndex uint64 // Current shard index (starts from 0 and up to ShardCount-1).
ShardCount uint64 // Total number of shards (0 means sharding is disabled).
// DisableTrimming allows to disable trimming of matching series chunks based on query Start and End time.
// When disabled, the result may contain samples outside the queried time range but Select() performances
// may be improved.
@ -412,6 +415,12 @@ type ChunkSeriesSet interface {
type ChunkSeries interface {
Labels
ChunkIterable
// ChunkCount returns the number of chunks available from this ChunkSeries.
//
// This value is used by Mimir's ingesters to report the number of chunks expected to be returned by a query,
// which is used by queriers to enforce the 'max chunks per query' limit.
ChunkCount() (int, error)
}
// Labels represents an item that has labels e.g. time series.

View file

@ -658,22 +658,42 @@ func NewCompactingChunkSeriesMerger(mergeFunc VerticalSeriesMergeFunc) VerticalC
if len(series) == 0 {
return nil
}
chunkIteratorFn := func(chunks.Iterator) chunks.Iterator {
iterators := make([]chunks.Iterator, 0, len(series))
for _, s := range series {
iterators = append(iterators, s.Iterator(nil))
}
return &compactChunkIterator{
mergeFunc: mergeFunc,
iterators: iterators,
}
}
return &ChunkSeriesEntry{
Lset: series[0].Labels(),
ChunkIteratorFn: func(chunks.Iterator) chunks.Iterator {
iterators := make([]chunks.Iterator, 0, len(series))
for _, s := range series {
iterators = append(iterators, s.Iterator(nil))
}
return &compactChunkIterator{
mergeFunc: mergeFunc,
iterators: iterators,
}
Lset: series[0].Labels(),
ChunkIteratorFn: chunkIteratorFn,
ChunkCountFn: func() (int, error) {
// This method is expensive, but we don't expect to ever actually use this on the ingester query path in Mimir -
// it's just here to ensure things don't break if this assumption ever changes.
// Ingesters return uncompacted chunks to queriers, so this method is never called.
return countChunks(chunkIteratorFn)
},
}
}
}
func countChunks(chunkIteratorFn func(chunks.Iterator) chunks.Iterator) (int, error) {
chunkCount := 0
it := chunkIteratorFn(nil)
for it.Next() {
chunkCount++
}
return chunkCount, it.Err()
}
// compactChunkIterator is responsible to compact chunks from different iterators of the same time series into single chainSeries.
// If time-overlapping chunks are found, they are encoded and passed to series merge and encoded again into one bigger chunk.
// TODO(bwplotka): Currently merge will compact overlapping chunks with bigger chunk, without limit. Split it: https://github.com/prometheus/tsdb/issues/670
@ -801,6 +821,7 @@ func NewConcatenatingChunkSeriesMerger() VerticalChunkSeriesMergeFunc {
if len(series) == 0 {
return nil
}
return &ChunkSeriesEntry{
Lset: series[0].Labels(),
ChunkIteratorFn: func(chunks.Iterator) chunks.Iterator {
@ -812,6 +833,20 @@ func NewConcatenatingChunkSeriesMerger() VerticalChunkSeriesMergeFunc {
iterators: iterators,
}
},
ChunkCountFn: func() (int, error) {
chunkCount := 0
for _, series := range series {
c, err := series.ChunkCount()
if err != nil {
return 0, err
}
chunkCount += c
}
return chunkCount, nil
},
}
}
}

View file

@ -428,6 +428,14 @@ func TestCompactingChunkSeriesMerger(t *testing.T) {
},
expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{fSample{1, 1}, fSample{2, 2}}, []tsdbutil.Sample{fSample{3, 3}, fSample{5, 5}}, []tsdbutil.Sample{fSample{7, 7}, fSample{9, 9}}, []tsdbutil.Sample{fSample{10, 10}}),
},
{
name: "two non overlapping in reverse order",
input: []ChunkSeries{
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{fSample{7, 7}, fSample{9, 9}}, []tsdbutil.Sample{fSample{10, 10}}),
NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{fSample{1, 1}, fSample{2, 2}}, []tsdbutil.Sample{fSample{3, 3}, fSample{5, 5}}),
},
expected: NewListChunkSeriesFromSamples(labels.FromStrings("bar", "baz"), []tsdbutil.Sample{fSample{1, 1}, fSample{2, 2}}, []tsdbutil.Sample{fSample{3, 3}, fSample{5, 5}}, []tsdbutil.Sample{fSample{7, 7}, fSample{9, 9}}, []tsdbutil.Sample{fSample{10, 10}}),
},
{
name: "two overlapping",
input: []ChunkSeries{
@ -570,6 +578,10 @@ func TestCompactingChunkSeriesMerger(t *testing.T) {
require.Equal(t, expErr, actErr)
require.Equal(t, expChks, actChks)
count, err := merged.ChunkCount()
require.NoError(t, err)
require.Len(t, actChks, count)
})
}
}
@ -704,6 +716,10 @@ func TestConcatenatingChunkSeriesMerger(t *testing.T) {
require.Equal(t, expErr, actErr)
require.Equal(t, expChks, actChks)
count, err := merged.ChunkCount()
require.NoError(t, err)
require.Equal(t, len(expChks), count)
})
}
}

View file

@ -35,11 +35,13 @@ func (s *SeriesEntry) Iterator(it chunkenc.Iterator) chunkenc.Iterator { return
type ChunkSeriesEntry struct {
Lset labels.Labels
ChunkCountFn func() (int, error)
ChunkIteratorFn func(chunks.Iterator) chunks.Iterator
}
func (s *ChunkSeriesEntry) Labels() labels.Labels { return s.Lset }
func (s *ChunkSeriesEntry) Iterator(it chunks.Iterator) chunks.Iterator { return s.ChunkIteratorFn(it) }
func (s *ChunkSeriesEntry) ChunkCount() (int, error) { return s.ChunkCountFn() }
// NewListSeries returns series entry with iterator that allows to iterate over provided samples.
func NewListSeries(lset labels.Labels, s []tsdbutil.Sample) *SeriesEntry {
@ -90,6 +92,7 @@ func NewListChunkSeriesFromSamples(lset labels.Labels, samples ...[]tsdbutil.Sam
}
return NewListChunkSeriesIterator(chks...)
},
ChunkCountFn: func() (int, error) { return len(samples), nil }, // We create one chunk per slice of samples.
}
}
@ -390,6 +393,20 @@ func (s *seriesToChunkEncoder) Iterator(it chunks.Iterator) chunks.Iterator {
return NewListChunkSeriesIterator(chks...)
}
func (s *seriesToChunkEncoder) ChunkCount() (int, error) {
// This method is expensive, but we don't expect to ever actually use this on the ingester query path in Mimir -
// it's just here to ensure things don't break if this assumption ever changes.
chunkCount := 0
it := s.Iterator(nil)
for it.Next() {
chunkCount++
}
return chunkCount, it.Err()
}
func appendChunk(chks []chunks.Meta, mint, maxt int64, chk chunkenc.Chunk) []chunks.Meta {
if chk != nil {
chks = append(chks, chunks.Meta{

View file

@ -73,6 +73,39 @@ func TestListSeriesIterator(t *testing.T) {
require.Equal(t, chunkenc.ValNone, it.Seek(2))
}
func TestNewListChunkSeriesFromSamples(t *testing.T) {
lbls := labels.FromStrings("__name__", "the_series")
series := NewListChunkSeriesFromSamples(
lbls,
samples{
fSample{0, 0},
fSample{1, 1},
fSample{1, 1.5},
fSample{2, 2},
fSample{3, 3},
},
samples{
fSample{4, 5},
},
)
require.Equal(t, lbls, series.Labels())
it := series.Iterator(nil)
chks := []chunks.Meta{}
for it.Next() {
chks = append(chks, it.At())
}
require.NoError(t, it.Err())
require.Len(t, chks, 2)
count, err := series.ChunkCount()
require.NoError(t, err)
require.Equal(t, len(chks), count, "should have one chunk per group of samples")
}
// TestSeriesSetToChunkSet test the property of SeriesSet that says
// returned series should be iterable even after Next is called.
func TestChunkSeriesSetToSeriesSet(t *testing.T) {
@ -125,6 +158,82 @@ func TestChunkSeriesSetToSeriesSet(t *testing.T) {
}
}
// TestSeriesToChunks checks how many chunks seriesToChunkEncoder produces for
// various sample inputs, that ChunkCount agrees with the expanded chunk list,
// and that decoding the chunks yields the original samples unchanged.
func TestSeriesToChunks(t *testing.T) {
	// generateSamples builds count float samples with t=i, f=i*10.
	generateSamples := func(count int) []tsdbutil.Sample {
		s := make([]tsdbutil.Sample, count)
		for i := 0; i < count; i++ {
			s[i] = fSample{t: int64(i), f: float64(i) * 10.0}
		}
		return s
	}

	h := &histogram.Histogram{
		Count:         0,
		ZeroThreshold: 0.001,
		Schema:        0,
	}

	testCases := map[string]struct {
		samples            []tsdbutil.Sample
		expectedChunkCount int
	}{
		"no samples": {
			samples:            []tsdbutil.Sample{},
			expectedChunkCount: 0,
		},
		"single sample": {
			samples:            generateSamples(1),
			expectedChunkCount: 1,
		},
		// The 120/121 and 240/241 pairs pin the encoder's cut-over point:
		// a new chunk is expected after every 120 samples.
		"120 samples": {
			samples:            generateSamples(120),
			expectedChunkCount: 1,
		},
		"121 samples": {
			samples:            generateSamples(121),
			expectedChunkCount: 2,
		},
		"240 samples": {
			samples:            generateSamples(240),
			expectedChunkCount: 2,
		},
		"241 samples": {
			samples:            generateSamples(241),
			expectedChunkCount: 3,
		},
		// Mixing sample types: 3 chunks are expected for these 4 samples
		// (float, histogram, float again).
		"float samples and histograms": {
			samples: []tsdbutil.Sample{
				fSample{t: 1, f: 10},
				fSample{t: 2, f: 20},
				hSample{t: 3, h: h},
				fSample{t: 4, f: 40},
			},
			expectedChunkCount: 3,
		},
	}

	for name, testCase := range testCases {
		t.Run(name, func(t *testing.T) {
			lset := labels.FromStrings("__name__", "test_series")
			series := NewListSeries(lset, testCase.samples)
			encoder := NewSeriesToChunkEncoder(series)
			require.Equal(t, lset, encoder.Labels())

			chks, err := ExpandChunks(encoder.Iterator(nil))
			require.NoError(t, err)
			require.Len(t, chks, testCase.expectedChunkCount)

			// ChunkCount must agree with the expanded chunk list.
			count, err := encoder.ChunkCount()
			require.NoError(t, err)
			require.Equal(t, testCase.expectedChunkCount, count)

			// Decode all chunks back and confirm the round trip is lossless.
			encodedSamples := expandChunks(chks)
			require.Equal(t, testCase.samples, encodedSamples)
		})
	}
}
type histogramTest struct {
samples []tsdbutil.Sample
expectedCounterResetHeaders []chunkenc.CounterResetHeader
@ -430,8 +539,12 @@ func testHistogramsSeriesToChunks(t *testing.T, test histogramTest) {
require.NoError(t, err)
require.Equal(t, len(test.expectedCounterResetHeaders), len(chks))
count, err := encoder.ChunkCount()
require.NoError(t, err)
require.Len(t, chks, count)
// Decode all encoded samples and assert they are equal to the original ones.
encodedSamples := expandHistogramSamples(chks)
encodedSamples := expandChunks(chks)
require.Equal(t, len(test.samples), len(encodedSamples))
for i, s := range test.samples {
@ -470,9 +583,9 @@ func testHistogramsSeriesToChunks(t *testing.T, test histogramTest) {
}
}
func expandHistogramSamples(chunks []chunks.Meta) (result []tsdbutil.Sample) {
func expandChunks(chunks []chunks.Meta) (result []tsdbutil.Sample) {
if len(chunks) == 0 {
return
return []tsdbutil.Sample{}
}
for _, chunk := range chunks {
@ -485,6 +598,9 @@ func expandHistogramSamples(chunks []chunks.Meta) (result []tsdbutil.Sample) {
case chunkenc.ValFloatHistogram:
t, fh := it.AtFloatHistogram()
result = append(result, fhSample{t: t, fh: fh})
case chunkenc.ValFloat:
t, f := it.At()
result = append(result, fSample{t: t, f: f})
default:
panic("unexpected value type")
}

10
tsdb/addsymbol.go Normal file
View file

@ -0,0 +1,10 @@
//go:build !stringlabels
// Split out function which needs to be coded differently for stringlabels case.
package tsdb
// addSymbol records sym in the in-memory buffer and gives flushSymbols a
// chance to spill the buffer to disk (force=false — presumably it only
// flushes once the buffer reaches its size limit; TODO confirm against
// flushSymbols, which is not visible here).
func (sw *symbolsBatcher) addSymbol(sym string) error {
	sw.buffer[sym] = struct{}{}
	return sw.flushSymbols(false)
}

View file

@ -0,0 +1,15 @@
//go:build stringlabels
// Split out function which needs to be coded differently for stringlabels case.
package tsdb
import "strings"
// addSymbol records sym in the in-memory buffer and gives flushSymbols a
// chance to spill the buffer to disk. Under the stringlabels build the symbol
// is cloned before being stored, because it may alias memory inside a larger
// labels block.
func (sw *symbolsBatcher) addSymbol(sym string) error {
	if _, found := sw.buffer[sym]; !found {
		sym = strings.Clone(sym) // So we don't retain reference to the entire labels block.
		sw.buffer[sym] = struct{}{}
	}
	return sw.flushSymbols(false)
}

175
tsdb/async_block_writer.go Normal file
View file

@ -0,0 +1,175 @@
package tsdb
import (
"context"
"github.com/pkg/errors"
"go.uber.org/atomic"
"golang.org/x/sync/semaphore"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/tsdb/chunkenc"
"github.com/prometheus/prometheus/tsdb/chunks"
)
var errAsyncBlockWriterNotRunning = errors.New("asyncBlockWriter doesn't run anymore")
// asyncBlockWriter runs a background goroutine that writes series and chunks to the block asynchronously.
// All calls on asyncBlockWriter must be done from single goroutine, it is not safe for concurrent usage from multiple goroutines.
type asyncBlockWriter struct {
	chunkPool chunkenc.Pool // Where to return chunks after writing.

	chunkw ChunkWriter // Destination for chunk data.
	indexw IndexWriter // Destination for series index entries.

	// closeSemaphore bounds how many writers may close their chunk/index
	// writers concurrently (acquired at the end of loop()).
	closeSemaphore *semaphore.Weighted

	seriesChan chan seriesToWrite           // Work queue consumed by loop().
	finishedCh chan asyncBlockWriterResult  // Carries loop()'s final result; closed afterwards.

	closed bool                   // Set by closeAsync to make it idempotent.
	result asyncBlockWriterResult // Cached copy of the loop result once received.
}

// asyncBlockWriterResult is the terminal outcome of the background loop:
// either accumulated block stats or the error that stopped it.
type asyncBlockWriterResult struct {
	stats BlockStats
	err   error
}

// seriesToWrite is a single unit of work: one series' labels plus its chunks.
type seriesToWrite struct {
	lbls labels.Labels
	chks []chunks.Meta
}
// newAsyncBlockWriter constructs an asyncBlockWriter and immediately starts
// its background goroutine. Series queued via addSeries are written to chunkw
// and indexw by that goroutine; closeSema limits how many writers may close
// concurrently.
func newAsyncBlockWriter(chunkPool chunkenc.Pool, chunkw ChunkWriter, indexw IndexWriter, closeSema *semaphore.Weighted) *asyncBlockWriter {
	writer := &asyncBlockWriter{
		chunkPool:      chunkPool,
		chunkw:         chunkw,
		indexw:         indexw,
		closeSemaphore: closeSema,
		seriesChan:     make(chan seriesToWrite, 64),
		finishedCh:     make(chan asyncBlockWriterResult, 1),
	}

	go writer.loop()
	return writer
}
// loop doing the writes. Return value is only used by defer statement, and is sent to the channel,
// before closing it.
func (bw *asyncBlockWriter) loop() (res asyncBlockWriterResult) {
	// Deliver the final result exactly once, then close finishedCh so that
	// addSeries and waitFinished can observe the loop has exited.
	defer func() {
		bw.finishedCh <- res
		close(bw.finishedCh)
	}()

	stats := BlockStats{}
	ref := storage.SeriesRef(0)
	// Consume queued series until closeAsync closes the channel.
	for sw := range bw.seriesChan {
		if err := bw.chunkw.WriteChunks(sw.chks...); err != nil {
			return asyncBlockWriterResult{err: errors.Wrap(err, "write chunks")}
		}
		if err := bw.indexw.AddSeries(ref, sw.lbls, sw.chks...); err != nil {
			return asyncBlockWriterResult{err: errors.Wrap(err, "add series")}
		}

		stats.NumChunks += uint64(len(sw.chks))
		stats.NumSeries++
		for _, chk := range sw.chks {
			stats.NumSamples += uint64(chk.Chunk.NumSamples())
		}

		// Return written chunks to the pool for reuse.
		for _, chk := range sw.chks {
			if err := bw.chunkPool.Put(chk.Chunk); err != nil {
				return asyncBlockWriterResult{err: errors.Wrap(err, "put chunk")}
			}
		}
		ref++
	}

	// Limit how many writers may close at once via the shared semaphore.
	err := bw.closeSemaphore.Acquire(context.Background(), 1)
	if err != nil {
		return asyncBlockWriterResult{err: errors.Wrap(err, "failed to acquire semaphore before closing writers")}
	}
	defer bw.closeSemaphore.Release(1)

	// If everything went fine with writing so far, close writers.
	if err := bw.chunkw.Close(); err != nil {
		return asyncBlockWriterResult{err: errors.Wrap(err, "closing chunk writer")}
	}
	if err := bw.indexw.Close(); err != nil {
		return asyncBlockWriterResult{err: errors.Wrap(err, "closing index writer")}
	}

	return asyncBlockWriterResult{stats: stats}
}
// addSeries queues the given series and its chunks for writing by the
// background loop. If the loop has already exited (its result arrives on
// finishedCh instead of the send succeeding), the loop's error — wrapped with
// errAsyncBlockWriterNotRunning — or errAsyncBlockWriterNotRunning itself is
// returned.
func (bw *asyncBlockWriter) addSeries(lbls labels.Labels, chks []chunks.Meta) error {
	select {
	case bw.seriesChan <- seriesToWrite{lbls: lbls, chks: chks}:
		return nil
	case result, ok := <-bw.finishedCh:
		// ok is false when finishedCh was already drained and closed; in that
		// case bw.result was populated by an earlier receive.
		if ok {
			bw.result = result
		}

		// If the writer isn't running anymore because of an error occurred in loop()
		// then we should return that error too, otherwise it may be never reported
		// and we'll never know the actual root cause.
		if bw.result.err != nil {
			return errors.Wrap(bw.result.err, errAsyncBlockWriterNotRunning.Error())
		}
		return errAsyncBlockWriterNotRunning
	}
}
// closeAsync signals the background loop to finish by closing the series
// channel. It is idempotent: only the first call actually closes the channel.
func (bw *asyncBlockWriter) closeAsync() {
	if bw.closed {
		return
	}
	bw.closed = true
	close(bw.seriesChan)
}
// waitFinished blocks until the background loop has delivered its result and
// returns the final block stats together with any error the loop hit.
func (bw *asyncBlockWriter) waitFinished() (BlockStats, error) {
	// When the channel was already drained (ok == false), fall back to the
	// result cached earlier.
	if result, ok := <-bw.finishedCh; ok {
		bw.result = result
	}

	return bw.result.stats, bw.result.err
}
// preventDoubleCloseIndexWriter wraps an IndexWriter and makes Close
// idempotent: only the first call is forwarded to the wrapped writer.
type preventDoubleCloseIndexWriter struct {
	IndexWriter
	closed atomic.Bool
}

// newPreventDoubleCloseIndexWriter wraps iw so that Close may safely be
// called more than once.
func newPreventDoubleCloseIndexWriter(iw IndexWriter) *preventDoubleCloseIndexWriter {
	return &preventDoubleCloseIndexWriter{IndexWriter: iw}
}

// Close closes the underlying IndexWriter exactly once; later calls return nil.
func (p *preventDoubleCloseIndexWriter) Close() error {
	if !p.closed.CompareAndSwap(false, true) {
		return nil
	}
	return p.IndexWriter.Close()
}
// preventDoubleCloseChunkWriter wraps a ChunkWriter and makes Close
// idempotent: only the first call is forwarded to the wrapped writer.
type preventDoubleCloseChunkWriter struct {
	ChunkWriter
	closed atomic.Bool
}

// newPreventDoubleCloseChunkWriter wraps cw so that Close may safely be
// called more than once.
func newPreventDoubleCloseChunkWriter(cw ChunkWriter) *preventDoubleCloseChunkWriter {
	return &preventDoubleCloseChunkWriter{ChunkWriter: cw}
}

// Close closes the underlying ChunkWriter exactly once; later calls return nil.
func (p *preventDoubleCloseChunkWriter) Close() error {
	if !p.closed.CompareAndSwap(false, true) {
		return nil
	}
	return p.ChunkWriter.Close()
}

View file

@ -20,6 +20,7 @@ import (
"os"
"path/filepath"
"sync"
"time"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
@ -75,10 +76,21 @@ type IndexReader interface {
// during background garbage collections.
Postings(name string, values ...string) (index.Postings, error)
// PostingsForMatchers assembles a single postings iterator based on the given matchers.
// The resulting postings are not ordered by series.
// If concurrent hint is set to true, call will be optimized for a (most likely) concurrent call with same matchers,
// avoiding same calculations twice, however this implementation may lead to a worse performance when called once.
PostingsForMatchers(concurrent bool, ms ...*labels.Matcher) (index.Postings, error)
// SortedPostings returns a postings list that is reordered to be sorted
// by the label set of the underlying series.
SortedPostings(index.Postings) index.Postings
// ShardedPostings returns a postings list filtered by the provided shardIndex
// out of shardCount. For a given posting, its shard MUST be computed hashing
// the series labels mod shardCount, using a hash function which is consistent over time.
ShardedPostings(p index.Postings, shardIndex, shardCount uint64) index.Postings
// Series populates the given builder and chunk metas for the series identified
// by the reference.
// Returns storage.ErrNotFound if the ref does not resolve to a known series.
@ -158,6 +170,9 @@ type BlockMeta struct {
// Version of the index format.
Version int `json:"version"`
// OutOfOrder is true if the block was directly created from out-of-order samples.
OutOfOrder bool `json:"out_of_order"`
}
// BlockStats contains stats about contents of a block.
@ -308,6 +323,11 @@ type Block struct {
// OpenBlock opens the block in the directory. It can be passed a chunk pool, which is used
// to instantiate chunk structs.
func OpenBlock(logger log.Logger, dir string, pool chunkenc.Pool) (pb *Block, err error) {
return OpenBlockWithOptions(logger, dir, pool, nil, defaultPostingsForMatchersCacheTTL, defaultPostingsForMatchersCacheSize, false)
}
// OpenBlockWithOptions is like OpenBlock but allows to pass a cache provider and sharding function.
func OpenBlockWithOptions(logger log.Logger, dir string, pool chunkenc.Pool, cache index.ReaderCacheProvider, postingsCacheTTL time.Duration, postingsCacheSize int, postingsCacheForce bool) (pb *Block, err error) {
if logger == nil {
logger = log.NewNopLogger()
}
@ -328,10 +348,12 @@ func OpenBlock(logger log.Logger, dir string, pool chunkenc.Pool) (pb *Block, er
}
closers = append(closers, cr)
ir, err := index.NewFileReader(filepath.Join(dir, indexFilename))
indexReader, err := index.NewFileReaderWithOptions(filepath.Join(dir, indexFilename), cache)
if err != nil {
return nil, err
}
pfmc := NewPostingsForMatchersCache(postingsCacheTTL, postingsCacheSize, postingsCacheForce)
ir := indexReaderWithPostingsForMatchers{indexReader, pfmc}
closers = append(closers, ir)
tr, sizeTomb, err := tombstones.ReadTombstones(dir)
@ -495,10 +517,18 @@ func (r blockIndexReader) Postings(name string, values ...string) (index.Posting
return p, nil
}
// PostingsForMatchers delegates to the underlying IndexReader. See the
// IndexReader interface for the meaning of the concurrent hint.
func (r blockIndexReader) PostingsForMatchers(concurrent bool, ms ...*labels.Matcher) (index.Postings, error) {
	return r.ir.PostingsForMatchers(concurrent, ms...)
}
// SortedPostings delegates to the underlying IndexReader.
func (r blockIndexReader) SortedPostings(p index.Postings) index.Postings {
	return r.ir.SortedPostings(p)
}
// ShardedPostings delegates to the underlying IndexReader.
func (r blockIndexReader) ShardedPostings(p index.Postings, shardIndex, shardCount uint64) index.Postings {
	return r.ir.ShardedPostings(p, shardIndex, shardCount)
}
func (r blockIndexReader) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error {
if err := r.ir.Series(ref, builder, chks); err != nil {
return errors.Wrapf(err, "block: %s", r.b.Meta().ULID)
@ -551,7 +581,7 @@ func (pb *Block) Delete(mint, maxt int64, ms ...*labels.Matcher) error {
return ErrClosing
}
p, err := PostingsForMatchers(pb.indexr, ms...)
p, err := pb.indexr.PostingsForMatchers(false, ms...)
if err != nil {
return errors.Wrap(err, "select series")
}

View file

@ -312,7 +312,7 @@ func TestBlockSize(t *testing.T) {
require.NoError(t, err)
require.Equal(t, expAfterDelete, actAfterDelete, "after a delete reported block size doesn't match actual disk size")
c, err := NewLeveledCompactor(context.Background(), nil, log.NewNopLogger(), []int64{0}, nil, nil)
c, err := NewLeveledCompactor(context.Background(), nil, log.NewNopLogger(), []int64{0}, nil, nil, true)
require.NoError(t, err)
blockDirAfterCompact, err := c.Compact(tmpdir, []string{blockInit.Dir()}, nil)
require.NoError(t, err)
@ -349,6 +349,9 @@ func TestReadIndexFormatV1(t *testing.T) {
blockDir := filepath.Join("testdata", "index_format_v1")
block, err := OpenBlock(nil, blockDir, nil)
require.NoError(t, err)
t.Cleanup(func() {
require.NoError(t, block.Close())
})
q, err := NewBlockQuerier(block, 0, 1000)
require.NoError(t, err)
@ -487,7 +490,7 @@ func createBlock(tb testing.TB, dir string, series []storage.Series) string {
}
func createBlockFromHead(tb testing.TB, dir string, head *Head) string {
compactor, err := NewLeveledCompactor(context.Background(), nil, log.NewNopLogger(), []int64{1000000}, nil, nil)
compactor, err := NewLeveledCompactor(context.Background(), nil, log.NewNopLogger(), []int64{1000000}, nil, nil, true)
require.NoError(tb, err)
require.NoError(tb, os.MkdirAll(dir, 0o777))

View file

@ -100,7 +100,7 @@ func (w *BlockWriter) Flush(ctx context.Context) (ulid.ULID, error) {
nil,
w.logger,
[]int64{w.blockSize},
chunkenc.NewPool(), nil)
chunkenc.NewPool(), nil, true)
if err != nil {
return ulid.ULID{}, errors.Wrap(err, "create leveled compactor")
}

View file

@ -490,7 +490,7 @@ func counterResetHint(crh CounterResetHeader, numRead uint16) histogram.CounterR
// Handle pathological case of empty span when advancing span idx.
func nextNonEmptySpanSliceIdx(idx int, spans []histogram.Span) (newIdx int) {
for idx++; idx < len(spans) && spans[idx].Length == 0; idx++ {
for idx++; idx < len(spans) && spans[idx].Length == 0; idx++ { //nolint:revive // This "empty" block is intentional
}
return idx
}

View file

@ -249,7 +249,7 @@ func (c *chunkWriteQueue) queueIsEmpty() bool {
}
func (c *chunkWriteQueue) queueIsFull() bool {
// When the queue is full and blocked on the writer the chunkRefMap has one more job than the cap of the jobCh
// When the queue is full and blocked on the writer the chunkRefMap has one more job than the capacity of the queue
// because one job is currently being processed and blocked in the writer.
return c.queueSize() == c.jobs.maxSize+1
}
@ -258,7 +258,7 @@ func (c *chunkWriteQueue) queueSize() int {
c.chunkRefMapMtx.Lock()
defer c.chunkRefMapMtx.Unlock()
// Looking at chunkRefMap instead of jobCh because the job is popped from the chan before it has
// been fully processed, it remains in the chunkRefMap until the processing is complete.
// Looking at chunkRefMap instead of jobs queue because the job is popped from the queue before it has
// been fully processed, but it remains in the chunkRefMap until the processing is complete.
return len(c.chunkRefMap)
}

View file

@ -178,7 +178,6 @@ func TestChunkWriteQueue_WrappingAroundSizeLimit(t *testing.T) {
// Wait until all jobs have been processed.
callbackWg.Wait()
require.Eventually(t, q.queueIsEmpty, 500*time.Millisecond, 50*time.Millisecond)
}

View file

@ -534,11 +534,12 @@ func (cdm *ChunkDiskMapper) writeChunk(seriesRef HeadSeriesRef, mint, maxt int64
}
// CutNewFile makes that a new file will be created the next time a chunk is written.
func (cdm *ChunkDiskMapper) CutNewFile() {
func (cdm *ChunkDiskMapper) CutNewFile() error {
cdm.evtlPosMtx.Lock()
defer cdm.evtlPosMtx.Unlock()
cdm.evtlPos.cutFileOnNextChunk()
return nil
}
func (cdm *ChunkDiskMapper) IsQueueEmpty() bool {
@ -940,7 +941,7 @@ func (cdm *ChunkDiskMapper) Truncate(fileNo uint32) error {
// There is a known race condition here because between the check of curFileSize() and the call to CutNewFile()
// a new file could already be cut, this is acceptable because it will simply result in an empty file which
// won't do any harm.
cdm.CutNewFile()
errs.Add(cdm.CutNewFile())
}
pendingDeletes, err := cdm.deleteFiles(removedFiles)
errs.Add(err)

View file

@ -15,7 +15,6 @@ package chunks
import (
"encoding/binary"
"errors"
"math/rand"
"os"
"strconv"
@ -23,6 +22,7 @@ import (
"testing"
"time"
"github.com/pkg/errors"
"github.com/stretchr/testify/require"
"github.com/prometheus/prometheus/tsdb/chunkenc"
@ -123,7 +123,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) {
}
}
addChunks(100)
hrw.CutNewFile()
require.NoError(t, hrw.CutNewFile())
addChunks(10) // For chunks in in-memory buffer.
}
@ -161,7 +161,7 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) {
expData := expectedData[idx]
require.Equal(t, expData.seriesRef, seriesRef)
require.Equal(t, expData.chunkRef, chunkRef)
require.Equal(t, expData.maxt, maxt)
require.Equal(t, expData.mint, mint)
require.Equal(t, expData.maxt, maxt)
require.Equal(t, expData.numSamples, numSamples)
require.Equal(t, expData.isOOO, isOOO)
@ -176,6 +176,44 @@ func TestChunkDiskMapper_WriteChunk_Chunk_IterateChunks(t *testing.T) {
require.Equal(t, len(expectedData), idx)
}
// TestChunkDiskMapper_WriteUnsupportedChunk_Chunk_IterateChunks writes a chunk
// with an unsupported encoding, reopens the mapper, and verifies that
// IterateAllChunks still surfaces the chunk's metadata (including the
// unsupported encoding) while Chunk() fails to decode it.
func TestChunkDiskMapper_WriteUnsupportedChunk_Chunk_IterateChunks(t *testing.T) {
	hrw := createChunkDiskMapper(t, "")
	defer func() {
		require.NoError(t, hrw.Close())
	}()

	ucSeriesRef, ucChkRef, ucMint, ucMaxt, uchunk := writeUnsupportedChunk(t, 0, hrw)

	// Checking on-disk bytes for the first file.
	require.Equal(t, 1, len(hrw.mmappedChunkFiles), "expected 1 mmapped file, got %d", len(hrw.mmappedChunkFiles))
	require.Equal(t, len(hrw.mmappedChunkFiles), len(hrw.closers))

	// Testing IterateAllChunks method.
	dir := hrw.dir.Name()
	require.NoError(t, hrw.Close())
	hrw = createChunkDiskMapper(t, dir)

	require.NoError(t, hrw.IterateAllChunks(func(seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding, isOOO bool) error {
		t.Helper()

		require.Equal(t, ucSeriesRef, seriesRef)
		require.Equal(t, ucChkRef, chunkRef)
		require.Equal(t, ucMint, mint)
		require.Equal(t, ucMaxt, maxt)
		require.Equal(t, uchunk.Encoding(), encoding) // Asserts that the encoding is EncUnsupportedXOR

		actChunk, err := hrw.Chunk(chunkRef)
		// The chunk encoding is unknown, so Chunk() is expected to fail; we,
		// the callers, are OK with that — above we already asserted that the
		// encoding read back is EncUnsupportedXOR.
		// (require.Error is the idiomatic testify assertion for non-nil errors.)
		require.Error(t, err)
		require.Contains(t, err.Error(), "invalid chunk encoding \"<unknown>\"")
		require.Nil(t, actChunk)

		return nil
	}))
}
// TestChunkDiskMapper_Truncate tests
// * If truncation is happening properly based on the time passed.
// * The active file is not deleted even if the passed time makes it eligible to be deleted.
@ -220,7 +258,7 @@ func TestChunkDiskMapper_Truncate(t *testing.T) {
// Create segments 1 to 7.
for i := 1; i <= 7; i++ {
hrw.CutNewFile()
require.NoError(t, hrw.CutNewFile())
addChunk()
}
verifyFiles([]int{1, 2, 3, 4, 5, 6, 7})
@ -463,7 +501,7 @@ func TestHeadReadWriter_ReadRepairOnEmptyLastFile(t *testing.T) {
nonEmptyFile := func() {
t.Helper()
hrw.CutNewFile()
require.NoError(t, hrw.CutNewFile())
addChunk()
}
@ -564,6 +602,17 @@ func randomChunk(t *testing.T) chunkenc.Chunk {
return chunk
}
// randomUnsupportedChunk returns an unsupportedChunk filled with up to 119
// random float samples (possibly zero).
func randomUnsupportedChunk(t *testing.T) chunkenc.Chunk {
	chunk := newUnsupportedChunk()
	// rand.Intn(120) is the idiomatic bounded draw and avoids the slight
	// modulo bias of rand.Int()%120.
	length := rand.Intn(120)

	app, err := chunk.Appender()
	require.NoError(t, err)
	for i := 0; i < length; i++ {
		app.Append(rand.Int63(), rand.Float64())
	}
	return chunk
}
func createChunk(t *testing.T, idx int, hrw *ChunkDiskMapper) (seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, chunk chunkenc.Chunk, isOOO bool) {
var err error
seriesRef = HeadSeriesRef(rand.Int63())
@ -581,3 +630,36 @@ func createChunk(t *testing.T, idx int, hrw *ChunkDiskMapper) (seriesRef HeadSer
<-awaitCb
return
}
// writeUnsupportedChunk writes a randomly generated chunk with an unsupported
// encoding to hrw and returns its identifying metadata. idx selects the
// (mint, maxt) time range of the chunk, and the write is awaited before
// returning.
func writeUnsupportedChunk(t *testing.T, idx int, hrw *ChunkDiskMapper) (seriesRef HeadSeriesRef, chunkRef ChunkDiskMapperRef, mint, maxt int64, chunk chunkenc.Chunk) {
	seriesRef = HeadSeriesRef(rand.Int63())
	mint = int64((idx)*1000 + 1)
	maxt = int64((idx + 1) * 1000)
	chunk = randomUnsupportedChunk(t)
	awaitCb := make(chan struct{})
	chunkRef = hrw.WriteChunk(seriesRef, mint, maxt, chunk, false, func(cbErr error) {
		// Bug fix: previously this asserted the (always-nil) outer `err`
		// variable instead of the callback's own error, so asynchronous write
		// failures were silently ignored.
		require.NoError(t, cbErr)
		close(awaitCb)
	})
	<-awaitCb
	return
}
const (
	// UnsupportedMask sets a bit that, ORed into a real encoding, yields a
	// value the chunk readers reject (the tests above assert Chunk() fails
	// with `invalid chunk encoding "<unknown>"`).
	UnsupportedMask   = 0b01000000
	EncUnsupportedXOR = chunkenc.EncXOR | UnsupportedMask
)

// unsupportedChunk holds a XORChunk and overrides the Encoding() method.
type unsupportedChunk struct {
	*chunkenc.XORChunk
}

// newUnsupportedChunk returns a fresh XOR chunk that reports an unsupported encoding.
func newUnsupportedChunk() *unsupportedChunk {
	return &unsupportedChunk{chunkenc.NewXORChunk()}
}

// Encoding reports EncUnsupportedXOR instead of the embedded chunk's real encoding.
func (c *unsupportedChunk) Encoding() chunkenc.Encoding {
	return EncUnsupportedXOR
}

File diff suppressed because it is too large Load diff

View file

@ -15,6 +15,7 @@ package tsdb
import (
"context"
crand "crypto/rand"
"fmt"
"math"
"math/rand"
@ -30,6 +31,7 @@ import (
"github.com/pkg/errors"
prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"
"golang.org/x/sync/semaphore"
"github.com/prometheus/prometheus/model/histogram"
"github.com/prometheus/prometheus/model/labels"
@ -37,6 +39,7 @@ import (
"github.com/prometheus/prometheus/tsdb/chunkenc"
"github.com/prometheus/prometheus/tsdb/chunks"
"github.com/prometheus/prometheus/tsdb/fileutil"
"github.com/prometheus/prometheus/tsdb/index"
"github.com/prometheus/prometheus/tsdb/tombstones"
"github.com/prometheus/prometheus/tsdb/tsdbutil"
"github.com/prometheus/prometheus/tsdb/wlog"
@ -164,7 +167,7 @@ func TestNoPanicFor0Tombstones(t *testing.T) {
},
}
c, err := NewLeveledCompactor(context.Background(), nil, nil, []int64{50}, nil, nil)
c, err := NewLeveledCompactor(context.Background(), nil, nil, []int64{50}, nil, nil, true)
require.NoError(t, err)
c.plan(metas)
@ -178,7 +181,7 @@ func TestLeveledCompactor_plan(t *testing.T) {
180,
540,
1620,
}, nil, nil)
}, nil, nil, true)
require.NoError(t, err)
cases := map[string]struct {
@ -387,7 +390,7 @@ func TestRangeWithFailedCompactionWontGetSelected(t *testing.T) {
240,
720,
2160,
}, nil, nil)
}, nil, nil, true)
require.NoError(t, err)
cases := []struct {
@ -431,20 +434,35 @@ func TestRangeWithFailedCompactionWontGetSelected(t *testing.T) {
}
func TestCompactionFailWillCleanUpTempDir(t *testing.T) {
compactor, err := NewLeveledCompactor(context.Background(), nil, log.NewNopLogger(), []int64{
compactor, err := NewLeveledCompactorWithChunkSize(context.Background(), nil, log.NewNopLogger(), []int64{
20,
60,
240,
720,
2160,
}, nil, nil)
}, nil, chunks.DefaultChunkSegmentSize, nil, true)
require.NoError(t, err)
tmpdir := t.TempDir()
require.Error(t, compactor.write(tmpdir, &BlockMeta{}, DefaultBlockPopulator{}, erringBReader{}))
_, err = os.Stat(filepath.Join(tmpdir, BlockMeta{}.ULID.String()) + tmpForCreationBlockDirSuffix)
require.True(t, os.IsNotExist(err), "directory is not cleaned up")
shardedBlocks := []shardedBlock{
{meta: &BlockMeta{ULID: ulid.MustNew(ulid.Now(), crand.Reader)}},
{meta: &BlockMeta{ULID: ulid.MustNew(ulid.Now(), crand.Reader)}},
{meta: &BlockMeta{ULID: ulid.MustNew(ulid.Now(), crand.Reader)}},
}
require.Error(t, compactor.write(tmpdir, shardedBlocks, DefaultBlockPopulator{}, erringBReader{}))
// We rely on the fact that blockDir and tmpDir will be updated by compactor.write.
for _, b := range shardedBlocks {
require.NotEmpty(t, b.tmpDir)
_, err = os.Stat(b.tmpDir)
require.True(t, os.IsNotExist(err), "tmp directory is not cleaned up")
require.NotEmpty(t, b.blockDir)
_, err = os.Stat(b.blockDir)
require.True(t, os.IsNotExist(err), "block directory is not cleaned up")
}
}
func metaRange(name string, mint, maxt int64, stats *BlockStats) dirMeta {
@ -486,6 +504,189 @@ func samplesForRange(minTime, maxTime int64, maxSamplesPerChunk int) (ret [][]sa
return ret
}
// TestCompaction_CompactWithSplitting compacts the same four overlapping
// blocks under different series counts and shard counts, and verifies that:
//  1. every series in an output block hashes (via labels.StableHash) to that
//     block's shard index,
//  2. the total number of series across all output blocks equals the input,
//  3. each output block's symbol table contains exactly the symbols used by
//     its series (plus the always-present "" symbol),
//  4. source blocks are not marked deletable.
func TestCompaction_CompactWithSplitting(t *testing.T) {
	seriesCounts := []int{10, 1234}
	shardCounts := []uint64{1, 13}

	for _, series := range seriesCounts {
		dir, err := os.MkdirTemp("", "compact")
		require.NoError(t, err)
		// NOTE(review): these defers (and the ones below) run at the end of
		// the whole test function, not per loop iteration.
		defer func() {
			require.NoError(t, os.RemoveAll(dir))
		}()

		ranges := [][2]int64{{0, 5000}, {3000, 8000}, {6000, 11000}, {9000, 14000}}

		// Generate blocks.
		var blockDirs []string
		var openBlocks []*Block

		for _, r := range ranges {
			block, err := OpenBlock(nil, createBlock(t, dir, genSeries(series, 10, r[0], r[1])), nil)
			require.NoError(t, err)
			defer func() {
				require.NoError(t, block.Close())
			}()

			openBlocks = append(openBlocks, block)
			blockDirs = append(blockDirs, block.Dir())
		}

		for _, shardCount := range shardCounts {
			t.Run(fmt.Sprintf("series=%d, shards=%d", series, shardCount), func(t *testing.T) {
				c, err := NewLeveledCompactorWithChunkSize(context.Background(), nil, log.NewNopLogger(), []int64{0}, nil, chunks.DefaultChunkSegmentSize, nil, true)
				require.NoError(t, err)

				blockIDs, err := c.CompactWithSplitting(dir, blockDirs, openBlocks, shardCount)
				require.NoError(t, err)
				require.Equal(t, shardCount, uint64(len(blockIDs)))

				// Verify resulting blocks. We will iterate over all series in all blocks, and check two things:
				// 1) Make sure that each series in the block belongs to the block (based on sharding).
				// 2) Verify that total number of series over all blocks is correct.
				totalSeries := uint64(0)

				ts := uint64(0)
				for shardIndex, blockID := range blockIDs {
					// Some blocks may be empty, they will have zero block ID.
					if blockID == (ulid.ULID{}) {
						continue
					}

					// All blocks have the same timestamp.
					if ts == 0 {
						ts = blockID.Time()
					} else {
						require.Equal(t, ts, blockID.Time())
					}

					// Symbols found in series.
					seriesSymbols := map[string]struct{}{}

					// We always expect to find "" symbol in the symbols table even if it's not in the series.
					// Head compaction always includes it, and then it survives additional non-sharded compactions.
					// Our splitting compaction preserves it too.
					seriesSymbols[""] = struct{}{}

					block, err := OpenBlock(log.NewNopLogger(), filepath.Join(dir, blockID.String()), nil)
					require.NoError(t, err)

					defer func() {
						require.NoError(t, block.Close())
					}()

					totalSeries += block.Meta().Stats.NumSeries

					idxr, err := block.Index()
					require.NoError(t, err)

					defer func() {
						require.NoError(t, idxr.Close())
					}()

					k, v := index.AllPostingsKey()
					p, err := idxr.Postings(k, v)
					require.NoError(t, err)

					var lbls labels.ScratchBuilder
					for p.Next() {
						ref := p.At()
						require.NoError(t, idxr.Series(ref, &lbls, nil))

						// The series must hash to this block's shard.
						require.Equal(t, uint64(shardIndex), labels.StableHash(lbls.Labels())%shardCount)

						// Collect all symbols used by series.
						lbls.Labels().Range(func(l labels.Label) {
							seriesSymbols[l.Name] = struct{}{}
							seriesSymbols[l.Value] = struct{}{}
						})
					}
					require.NoError(t, p.Err())

					// Check that all symbols in symbols table are actually used by series.
					symIt := idxr.Symbols()
					for symIt.Next() {
						w := symIt.At()
						_, ok := seriesSymbols[w]
						require.True(t, ok, "not found in series: '%s'", w)
						delete(seriesSymbols, w)
					}

					// Check that symbols table covered all symbols found from series.
					require.Equal(t, 0, len(seriesSymbols))
				}
				require.Equal(t, uint64(series), totalSeries)

				// Source blocks are *not* deletable.
				for _, b := range openBlocks {
					require.False(t, b.meta.Compaction.Deletable)
				}
			})
		}
	}
}
func TestCompaction_CompactEmptyBlocks(t *testing.T) {
dir, err := os.MkdirTemp("", "compact")
require.NoError(t, err)
defer func() {
require.NoError(t, os.RemoveAll(dir))
}()
ranges := [][2]int64{{0, 5000}, {3000, 8000}, {6000, 11000}, {9000, 14000}}
// Generate blocks.
var blockDirs []string
for _, r := range ranges {
// Generate blocks using index and chunk writer. CreateBlock would not return valid block for 0 series.
id := ulid.MustNew(ulid.Now(), crand.Reader)
m := &BlockMeta{
ULID: id,
MinTime: r[0],
MaxTime: r[1],
Compaction: BlockMetaCompaction{Level: 1, Sources: []ulid.ULID{id}},
Version: metaVersion1,
}
bdir := filepath.Join(dir, id.String())
require.NoError(t, os.Mkdir(bdir, 0o777))
require.NoError(t, os.Mkdir(chunkDir(bdir), 0o777))
_, err := writeMetaFile(log.NewNopLogger(), bdir, m)
require.NoError(t, err)
iw, err := index.NewWriter(context.Background(), filepath.Join(bdir, indexFilename))
require.NoError(t, err)
require.NoError(t, iw.AddSymbol("hello"))
require.NoError(t, iw.AddSymbol("world"))
require.NoError(t, iw.Close())
blockDirs = append(blockDirs, bdir)
}
c, err := NewLeveledCompactorWithChunkSize(context.Background(), nil, log.NewNopLogger(), []int64{0}, nil, chunks.DefaultChunkSegmentSize, nil, true)
require.NoError(t, err)
blockIDs, err := c.CompactWithSplitting(dir, blockDirs, nil, 5)
require.NoError(t, err)
// There are no output blocks.
for _, b := range blockIDs {
require.Equal(t, ulid.ULID{}, b)
}
// All source blocks are now marked for deletion.
for _, b := range blockDirs {
meta, _, err := readMetaFile(b)
require.NoError(t, err)
require.True(t, meta.Compaction.Deletable)
}
}
func TestCompaction_populateBlock(t *testing.T) {
for _, tc := range []struct {
title string
@ -498,7 +699,7 @@ func TestCompaction_populateBlock(t *testing.T) {
{
title: "Populate block from empty input should return error.",
inputSeriesSamples: [][]seriesSamples{},
expErr: errors.New("cannot populate block from no readers"),
expErr: errors.New("cannot populate block(s) from no readers"),
},
{
// Populate from single block without chunks. We expect these kind of series being ignored.
@ -942,7 +1143,7 @@ func TestCompaction_populateBlock(t *testing.T) {
blocks = append(blocks, &mockBReader{ir: ir, cr: cr, mint: mint, maxt: maxt})
}
c, err := NewLeveledCompactor(context.Background(), nil, nil, []int64{0}, nil, nil)
c, err := NewLeveledCompactorWithChunkSize(context.Background(), nil, nil, []int64{0}, nil, chunks.DefaultChunkSegmentSize, nil, true)
require.NoError(t, err)
meta := &BlockMeta{
@ -954,8 +1155,10 @@ func TestCompaction_populateBlock(t *testing.T) {
}
iw := &mockIndexWriter{}
ob := shardedBlock{meta: meta, indexw: iw, chunkw: nopChunkWriter{}}
blockPopulator := DefaultBlockPopulator{}
err = blockPopulator.PopulateBlock(c.ctx, c.metrics, c.logger, c.chunkPool, c.mergeFunc, blocks, meta, iw, nopChunkWriter{})
err = blockPopulator.PopulateBlock(c.ctx, c.metrics, c.logger, c.chunkPool, c.mergeFunc, c.concurrencyOpts, blocks, meta.MinTime, meta.MaxTime, []shardedBlock{ob})
if tc.expErr != nil {
require.Error(t, err)
require.Equal(t, tc.expErr.Error(), err.Error())
@ -1064,7 +1267,7 @@ func BenchmarkCompaction(b *testing.B) {
blockDirs = append(blockDirs, block.Dir())
}
c, err := NewLeveledCompactor(context.Background(), nil, log.NewNopLogger(), []int64{0}, nil, nil)
c, err := NewLeveledCompactor(context.Background(), nil, log.NewNopLogger(), []int64{0}, nil, nil, true)
require.NoError(b, err)
b.ResetTimer()
@ -1304,6 +1507,121 @@ func TestDeleteCompactionBlockAfterFailedReload(t *testing.T) {
}
}
// TestOpenBlocksForCompaction verifies that openBlocksForCompaction returns
// all requested blocks as "opened" but only the blocks it opened itself in
// "toClose" — blocks supplied by the caller stay under the caller's ownership.
func TestOpenBlocksForCompaction(t *testing.T) {
	dir := t.TempDir()

	const blocks = 5

	var blockDirs []string
	for ix := 0; ix < blocks; ix++ {
		d := createBlock(t, dir, genSeries(100, 10, 0, 5000))
		blockDirs = append(blockDirs, d)
	}

	// Open subset of blocks first.
	const blocksToOpen = 2
	opened, toClose, err := openBlocksForCompaction(blockDirs[:blocksToOpen], nil, log.NewNopLogger(), nil, 10)
	for _, b := range toClose {
		defer func(b *Block) { require.NoError(t, b.Close()) }(b)
	}
	require.NoError(t, err)
	checkBlocks(t, opened, blockDirs[:blocksToOpen]...)
	checkBlocks(t, toClose, blockDirs[:blocksToOpen]...)

	// Open all blocks, but provide previously opened blocks.
	// Only the blocks not already provided should end up in toClose2.
	opened2, toClose2, err := openBlocksForCompaction(blockDirs, opened, log.NewNopLogger(), nil, 10)
	for _, b := range toClose2 {
		defer func(b *Block) { require.NoError(t, b.Close()) }(b)
	}
	require.NoError(t, err)
	checkBlocks(t, opened2, blockDirs...)
	checkBlocks(t, toClose2, blockDirs[blocksToOpen:]...)
}
// TestOpenBlocksForCompactionErrorsNoMeta checks that a directory without a
// meta file makes openBlocksForCompaction fail before opening any block, so
// there is nothing for the caller to close.
func TestOpenBlocksForCompactionErrorsNoMeta(t *testing.T) {
	dir := t.TempDir()

	const blocks = 5

	var blockDirs []string
	for ix := 0; ix < blocks; ix++ {
		d := createBlock(t, dir, genSeries(100, 10, 0, 5000))
		blockDirs = append(blockDirs, d)

		// Insert a directory that is not a valid block (no meta file).
		if ix == 3 {
			blockDirs = append(blockDirs, path.Join(dir, "invalid-block"))
		}
	}

	// open block[0]
	b0, err := OpenBlock(log.NewNopLogger(), blockDirs[0], nil)
	require.NoError(t, err)
	defer func() { require.NoError(t, b0.Close()) }()

	_, toClose, err := openBlocksForCompaction(blockDirs, []*Block{b0}, log.NewNopLogger(), nil, 10)
	require.Error(t, err)

	// We didn't get to opening more blocks, because we found invalid dir, so there is nothing to close.
	require.Empty(t, toClose)
}
// TestOpenBlocksForCompactionErrorsMissingIndex verifies that, with
// concurrency 1, blocks opened before hitting a corrupted block are returned
// for closing, while blocks after the corrupted one are never opened.
func TestOpenBlocksForCompactionErrorsMissingIndex(t *testing.T) {
	dir := t.TempDir()
	const blocks = 5
	var blockDirs []string
	for ix := 0; ix < blocks; ix++ {
		d := createBlock(t, dir, genSeries(100, 10, 0, 5000))
		blockDirs = append(blockDirs, d)
		if ix == 3 {
			// Corrupt block[3] by removing its index file.
			require.NoError(t, os.Remove(path.Join(d, indexFilename)))
		}
	}
	// open block[1]
	b1, err := OpenBlock(log.NewNopLogger(), blockDirs[1], nil)
	require.NoError(t, err)
	defer func() { require.NoError(t, b1.Close()) }()
	// We use concurrency = 1 to simplify the test.
	// Block[0] will be opened correctly.
	// Block[1] is already opened.
	// Block[2] will be opened correctly.
	// Block[3] is invalid and will cause error.
	// Block[4] will not be opened at all.
	opened, toClose, err := openBlocksForCompaction(blockDirs, []*Block{b1}, log.NewNopLogger(), nil, 1)
	for _, b := range toClose {
		defer func(b *Block) { require.NoError(t, b.Close()) }(b)
	}
	require.Error(t, err)
	checkBlocks(t, opened, blockDirs[0:3]...)
	checkBlocks(t, toClose, blockDirs[0], blockDirs[2])
}
// checkBlocks asserts that the set of ULIDs of the given blocks is exactly the
// set of ULIDs read from the meta files of the given directories.
func checkBlocks(t *testing.T, blocks []*Block, dirs ...string) {
	t.Helper()

	fromBlocks := make(map[string]struct{}, len(blocks))
	for _, blk := range blocks {
		fromBlocks[blk.Meta().ULID.String()] = struct{}{}
	}

	fromDirs := make(map[string]struct{}, len(dirs))
	for _, dir := range dirs {
		meta, _, err := readMetaFile(dir)
		require.NoError(t, err)
		fromDirs[meta.ULID.String()] = struct{}{}
	}

	require.Equal(t, fromBlocks, fromDirs)
}
func TestHeadCompactionWithHistograms(t *testing.T) {
for _, floatTest := range []bool{true, false} {
t.Run(fmt.Sprintf("float=%t", floatTest), func(t *testing.T) {
@ -1402,7 +1720,7 @@ func TestHeadCompactionWithHistograms(t *testing.T) {
// Compaction.
mint := head.MinTime()
maxt := head.MaxTime() + 1 // Block intervals are half-open: [b.MinTime, b.MaxTime).
compactor, err := NewLeveledCompactor(context.Background(), nil, nil, []int64{DefaultBlockDuration}, chunkenc.NewPool(), nil)
compactor, err := NewLeveledCompactor(context.Background(), nil, nil, []int64{DefaultBlockDuration}, chunkenc.NewPool(), nil, true)
require.NoError(t, err)
id, err := compactor.Write(head.opts.ChunkDirRoot, head, mint, maxt, nil)
require.NoError(t, err)
@ -1544,7 +1862,7 @@ func TestSparseHistogramSpaceSavings(t *testing.T) {
// Sparse head compaction.
mint := sparseHead.MinTime()
maxt := sparseHead.MaxTime() + 1 // Block intervals are half-open: [b.MinTime, b.MaxTime).
compactor, err := NewLeveledCompactor(context.Background(), nil, nil, []int64{DefaultBlockDuration}, chunkenc.NewPool(), nil)
compactor, err := NewLeveledCompactor(context.Background(), nil, nil, []int64{DefaultBlockDuration}, chunkenc.NewPool(), nil, true)
require.NoError(t, err)
sparseULID, err = compactor.Write(sparseHead.opts.ChunkDirRoot, sparseHead, mint, maxt, nil)
require.NoError(t, err)
@ -1595,7 +1913,7 @@ func TestSparseHistogramSpaceSavings(t *testing.T) {
// Old head compaction.
mint := oldHead.MinTime()
maxt := oldHead.MaxTime() + 1 // Block intervals are half-open: [b.MinTime, b.MaxTime).
compactor, err := NewLeveledCompactor(context.Background(), nil, nil, []int64{DefaultBlockDuration}, chunkenc.NewPool(), nil)
compactor, err := NewLeveledCompactor(context.Background(), nil, nil, []int64{DefaultBlockDuration}, chunkenc.NewPool(), nil, true)
require.NoError(t, err)
oldULID, err = compactor.Write(oldHead.opts.ChunkDirRoot, oldHead, mint, maxt, nil)
require.NoError(t, err)
@ -1766,3 +2084,306 @@ func TestCompactBlockMetas(t *testing.T) {
}
require.Equal(t, expected, output)
}
// TestLeveledCompactor_plan_overlapping_disabled exercises the compaction
// planner with vertical (overlapping-block) compaction disabled: overlapping
// inputs must never be selected for compaction, while the regular horizontal
// planning behaviour is preserved.
func TestLeveledCompactor_plan_overlapping_disabled(t *testing.T) {
	// This mimics our default ExponentialBlockRanges with min block size equals to 20.
	compactor, err := NewLeveledCompactor(context.Background(), nil, nil, []int64{
		20,
		60,
		180,
		540,
		1620,
	}, nil, nil, false)
	require.NoError(t, err)
	cases := map[string]struct {
		metas    []dirMeta
		expected []string
	}{
		"Outside Range": {
			metas: []dirMeta{
				metaRange("1", 0, 20, nil),
			},
			expected: nil,
		},
		"We should wait for four blocks of size 20 to appear before compacting.": {
			metas: []dirMeta{
				metaRange("1", 0, 20, nil),
				metaRange("2", 20, 40, nil),
			},
			expected: nil,
		},
		`We should wait for a next block of size 20 to appear before compacting
		the existing ones. We have three, but we ignore the fresh one from WAl`: {
			metas: []dirMeta{
				metaRange("1", 0, 20, nil),
				metaRange("2", 20, 40, nil),
				metaRange("3", 40, 60, nil),
			},
			expected: nil,
		},
		"Block to fill the entire parent range appeared  should be compacted": {
			metas: []dirMeta{
				metaRange("1", 0, 20, nil),
				metaRange("2", 20, 40, nil),
				metaRange("3", 40, 60, nil),
				metaRange("4", 60, 80, nil),
			},
			expected: []string{"1", "2", "3"},
		},
		`Block for the next parent range appeared with gap with size 20. Nothing will happen in the first one
		anymore but we ignore fresh one still, so no compaction`: {
			metas: []dirMeta{
				metaRange("1", 0, 20, nil),
				metaRange("2", 20, 40, nil),
				metaRange("3", 60, 80, nil),
			},
			expected: nil,
		},
		`Block for the next parent range appeared, and we have a gap with size 20 between second and third block.
		We will not get this missed gap anymore and we should compact just these two.`: {
			metas: []dirMeta{
				metaRange("1", 0, 20, nil),
				metaRange("2", 20, 40, nil),
				metaRange("3", 60, 80, nil),
				metaRange("4", 80, 100, nil),
			},
			expected: []string{"1", "2"},
		},
		"We have 20, 20, 20, 60, 60 range blocks. '5' is marked as fresh one": {
			metas: []dirMeta{
				metaRange("1", 0, 20, nil),
				metaRange("2", 20, 40, nil),
				metaRange("3", 40, 60, nil),
				metaRange("4", 60, 120, nil),
				metaRange("5", 120, 180, nil),
			},
			expected: []string{"1", "2", "3"},
		},
		"We have 20, 60, 20, 60, 240 range blocks. We can compact 20 + 60 + 60": {
			metas: []dirMeta{
				metaRange("2", 20, 40, nil),
				metaRange("4", 60, 120, nil),
				metaRange("5", 960, 980, nil), // Fresh one.
				metaRange("6", 120, 180, nil),
				metaRange("7", 720, 960, nil),
			},
			expected: []string{"2", "4", "6"},
		},
		"Do not select large blocks that have many tombstones when there is no fresh block": {
			metas: []dirMeta{
				metaRange("1", 0, 540, &BlockStats{
					NumSeries:     10,
					NumTombstones: 3,
				}),
			},
			expected: nil,
		},
		"Select large blocks that have many tombstones when fresh appears": {
			metas: []dirMeta{
				metaRange("1", 0, 540, &BlockStats{
					NumSeries:     10,
					NumTombstones: 3,
				}),
				metaRange("2", 540, 560, nil),
			},
			expected: []string{"1"},
		},
		"For small blocks, do not compact tombstones, even when fresh appears.": {
			metas: []dirMeta{
				metaRange("1", 0, 60, &BlockStats{
					NumSeries:     10,
					NumTombstones: 3,
				}),
				metaRange("2", 60, 80, nil),
			},
			expected: nil,
		},
		`Regression test: we were stuck in a compact loop where we always recompacted
		the same block when tombstones and series counts were zero`: {
			metas: []dirMeta{
				metaRange("1", 0, 540, &BlockStats{
					NumSeries:     0,
					NumTombstones: 0,
				}),
				metaRange("2", 540, 560, nil),
			},
			expected: nil,
		},
		`Regression test: we were wrongly assuming that new block is fresh from WAL when its ULID is newest.
		We need to actually look on max time instead.
		With previous, wrong approach "8" block was ignored, so we were wrongly compacting 5 and 7 and introducing
		block overlaps`: {
			metas: []dirMeta{
				metaRange("5", 0, 360, nil),
				metaRange("6", 540, 560, nil), // Fresh one.
				metaRange("7", 360, 420, nil),
				metaRange("8", 420, 540, nil),
			},
			expected: []string{"7", "8"},
		},
		// |--------------|
		//               |----------------|
		//                                |--------------|
		"Overlapping blocks 1": {
			metas: []dirMeta{
				metaRange("1", 0, 20, nil),
				metaRange("2", 19, 40, nil),
				metaRange("3", 40, 60, nil),
			},
			expected: nil,
		},
		// |--------------|
		//                |--------------|
		//                        |--------------|
		"Overlapping blocks 2": {
			metas: []dirMeta{
				metaRange("1", 0, 20, nil),
				metaRange("2", 20, 40, nil),
				metaRange("3", 30, 50, nil),
			},
			expected: nil,
		},
		// |--------------|
		//         |---------------------|
		//                       |--------------|
		"Overlapping blocks 3": {
			metas: []dirMeta{
				metaRange("1", 0, 20, nil),
				metaRange("2", 10, 40, nil),
				metaRange("3", 30, 50, nil),
			},
			expected: nil,
		},
		// |--------------|
		//      |--------------------------------|
		//          |--------------|
		//                                  |--------------|
		"Overlapping blocks 4": {
			metas: []dirMeta{
				metaRange("5", 0, 360, nil),
				metaRange("6", 340, 560, nil),
				metaRange("7", 360, 420, nil),
				metaRange("8", 420, 540, nil),
			},
			expected: nil,
		},
		// |--------------|
		//               |--------------|
		//                                            |--------------|
		//                                                          |--------------|
		"Overlapping blocks 5": {
			metas: []dirMeta{
				metaRange("1", 0, 10, nil),
				metaRange("2", 9, 20, nil),
				metaRange("3", 30, 40, nil),
				metaRange("4", 39, 50, nil),
			},
			expected: nil,
		},
	}
	for title, c := range cases {
		if !t.Run(title, func(t *testing.T) {
			res, err := compactor.plan(c.metas)
			require.NoError(t, err)
			require.Equal(t, c.expected, res)
		}) {
			// Stop at the first failing sub-test to keep the output focused.
			return
		}
	}
}
// TestAsyncBlockWriterSuccess verifies the happy path of asyncBlockWriter:
// series can be queued, closeAsync is idempotent, and waitFinished returns the
// same stats on repeated calls.
func TestAsyncBlockWriterSuccess(t *testing.T) {
	cw, err := chunks.NewWriter(t.TempDir())
	require.NoError(t, err)
	const series = 100
	// prepare index, add all symbols
	iw, err := index.NewWriter(context.Background(), filepath.Join(t.TempDir(), indexFilename))
	require.NoError(t, err)
	require.NoError(t, iw.AddSymbol("__name__"))
	for ix := 0; ix < series; ix++ {
		s := fmt.Sprintf("s_%3d", ix)
		require.NoError(t, iw.AddSymbol(s))
	}
	// async block writer expects index writer ready to receive series.
	abw := newAsyncBlockWriter(chunkenc.NewPool(), cw, iw, semaphore.NewWeighted(int64(1)))
	for ix := 0; ix < series; ix++ {
		s := fmt.Sprintf("s_%3d", ix)
		require.NoError(t, abw.addSeries(labels.FromStrings("__name__", s), []chunks.Meta{{Chunk: randomChunk(t), MinTime: 0, MaxTime: math.MaxInt64}}))
	}
	// signal that no more series are coming
	abw.closeAsync()
	// We can do this repeatedly.
	abw.closeAsync()
	abw.closeAsync()
	// wait for result
	stats, err := abw.waitFinished()
	require.NoError(t, err)
	require.Equal(t, uint64(series), stats.NumSeries)
	require.Equal(t, uint64(series), stats.NumChunks)
	// We get the same result on subsequent calls to waitFinished.
	for i := 0; i < 5; i++ {
		newstats, err := abw.waitFinished()
		require.NoError(t, err)
		require.Equal(t, stats, newstats)
		// We can call close async again, as long as it's on the same goroutine.
		abw.closeAsync()
	}
}
// TestAsyncBlockWriterFailure verifies the error path of asyncBlockWriter: a
// failure inside the writer goroutine surfaces from waitFinished, and the same
// error is returned on every subsequent call.
func TestAsyncBlockWriterFailure(t *testing.T) {
	cw, err := chunks.NewWriter(t.TempDir())
	require.NoError(t, err)
	// We don't write symbols to this index writer, so adding series next will fail.
	iw, err := index.NewWriter(context.Background(), filepath.Join(t.TempDir(), indexFilename))
	require.NoError(t, err)
	// async block writer expects index writer ready to receive series.
	abw := newAsyncBlockWriter(chunkenc.NewPool(), cw, iw, semaphore.NewWeighted(int64(1)))
	// Adding single series doesn't fail, as it just puts it onto the queue.
	require.NoError(t, abw.addSeries(labels.FromStrings("__name__", "test"), []chunks.Meta{{Chunk: randomChunk(t), MinTime: 0, MaxTime: math.MaxInt64}}))
	// Signal that no more series are coming.
	abw.closeAsync()
	// We can do this repeatedly.
	abw.closeAsync()
	abw.closeAsync()
	// Wait for result, this time we get error due to missing symbols.
	_, err = abw.waitFinished()
	require.Error(t, err)
	require.ErrorContains(t, err, "unknown symbol")
	// We get the same error on each repeated call to waitFinished.
	for i := 0; i < 5; i++ {
		_, nerr := abw.waitFinished()
		require.Equal(t, err, nerr)
		// We can call close async again, as long as it's on the same goroutine.
		abw.closeAsync()
	}
}
// randomChunk returns an XOR chunk filled with a random number (0-119,
// possibly zero) of random samples.
func randomChunk(t *testing.T) chunkenc.Chunk {
	chunk := chunkenc.NewXORChunk()
	app, err := chunk.Appender()
	require.NoError(t, err)
	// rand.Intn is the idiomatic way to draw from [0, 120) and, unlike
	// rand.Int()%120, has no modulo bias.
	numSamples := rand.Intn(120)
	for i := 0; i < numSamples; i++ {
		app.Append(rand.Int63(), rand.Float64())
	}
	return chunk
}

View file

@ -44,6 +44,8 @@ import (
tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
"github.com/prometheus/prometheus/tsdb/fileutil"
_ "github.com/prometheus/prometheus/tsdb/goversion" // Load the package into main to make sure minium Go version is met.
"github.com/prometheus/prometheus/tsdb/hashcache"
"github.com/prometheus/prometheus/tsdb/index"
"github.com/prometheus/prometheus/tsdb/tsdbutil"
"github.com/prometheus/prometheus/tsdb/wlog"
)
@ -70,20 +72,27 @@ var ErrNotReady = errors.New("TSDB not ready")
// millisecond precision timestamps.
func DefaultOptions() *Options {
return &Options{
WALSegmentSize: wlog.DefaultSegmentSize,
MaxBlockChunkSegmentSize: chunks.DefaultChunkSegmentSize,
RetentionDuration: int64(15 * 24 * time.Hour / time.Millisecond),
MinBlockDuration: DefaultBlockDuration,
MaxBlockDuration: DefaultBlockDuration,
NoLockfile: false,
AllowOverlappingCompaction: true,
SamplesPerChunk: DefaultSamplesPerChunk,
WALCompression: wlog.CompressionNone,
StripeSize: DefaultStripeSize,
HeadChunksWriteBufferSize: chunks.DefaultWriteBufferSize,
IsolationDisabled: defaultIsolationDisabled,
HeadChunksWriteQueueSize: chunks.DefaultWriteQueueSize,
OutOfOrderCapMax: DefaultOutOfOrderCapMax,
WALSegmentSize: wlog.DefaultSegmentSize,
MaxBlockChunkSegmentSize: chunks.DefaultChunkSegmentSize,
RetentionDuration: int64(15 * 24 * time.Hour / time.Millisecond),
MinBlockDuration: DefaultBlockDuration,
MaxBlockDuration: DefaultBlockDuration,
NoLockfile: false,
AllowOverlappingCompaction: true,
SamplesPerChunk: DefaultSamplesPerChunk,
WALCompression: wlog.CompressionNone,
StripeSize: DefaultStripeSize,
HeadChunksWriteBufferSize: chunks.DefaultWriteBufferSize,
IsolationDisabled: defaultIsolationDisabled,
HeadChunksEndTimeVariance: 0,
HeadChunksWriteQueueSize: chunks.DefaultWriteQueueSize,
OutOfOrderCapMax: DefaultOutOfOrderCapMax,
HeadPostingsForMatchersCacheTTL: defaultPostingsForMatchersCacheTTL,
HeadPostingsForMatchersCacheSize: defaultPostingsForMatchersCacheSize,
HeadPostingsForMatchersCacheForce: false,
BlockPostingsForMatchersCacheTTL: defaultPostingsForMatchersCacheTTL,
BlockPostingsForMatchersCacheSize: defaultPostingsForMatchersCacheSize,
BlockPostingsForMatchersCacheForce: false,
}
}
@ -147,6 +156,10 @@ type Options struct {
// HeadChunksWriteBufferSize configures the write buffer size used by the head chunks mapper.
HeadChunksWriteBufferSize int
// HeadChunksEndTimeVariance is how much variance (between 0 and 1) should be applied to the chunk end time,
// to spread chunks writing across time. Doesn't apply to the last chunk of the chunk range. 0 to disable variance.
HeadChunksEndTimeVariance float64
// HeadChunksWriteQueueSize configures the size of the chunk write queue used in the head chunks mapper.
HeadChunksWriteQueueSize int
@ -175,6 +188,10 @@ type Options struct {
// Disables isolation between reads and in-flight appends.
IsolationDisabled bool
// SeriesHashCache specifies the series hash cache used when querying shards via Querier.Select().
// If nil, the cache won't be used.
SeriesHashCache *hashcache.SeriesHashCache
// EnableNativeHistograms enables the ingestion of native histograms.
EnableNativeHistograms bool
@ -186,6 +203,29 @@ type Options struct {
// OutOfOrderCapMax is maximum capacity for OOO chunks (in samples).
// If it is <=0, the default value is assumed.
OutOfOrderCapMax int64
// HeadPostingsForMatchersCacheTTL is the TTL of the postings for matchers cache in the Head.
// If it's 0, the cache will only deduplicate in-flight requests, deleting the results once the first request has finished.
HeadPostingsForMatchersCacheTTL time.Duration
// HeadPostingsForMatchersCacheSize is the maximum size of cached postings for matchers elements in the Head.
// It's ignored when HeadPostingsForMatchersCacheTTL is 0.
HeadPostingsForMatchersCacheSize int
// HeadPostingsForMatchersCacheForce forces the usage of postings for matchers cache for all calls on Head and OOOHead regardless of the `concurrent` param.
HeadPostingsForMatchersCacheForce bool
// BlockPostingsForMatchersCacheTTL is the TTL of the postings for matchers cache of each compacted block.
// If it's 0, the cache will only deduplicate in-flight requests, deleting the results once the first request has finished.
BlockPostingsForMatchersCacheTTL time.Duration
// BlockPostingsForMatchersCacheSize is the maximum size of cached postings for matchers elements in each compacted block.
// It's ignored when BlockPostingsForMatchersCacheTTL is 0.
BlockPostingsForMatchersCacheSize int
// BlockPostingsForMatchersCacheForce forces the usage of postings for matchers cache for all calls on compacted blocks
// regardless of the `concurrent` param.
BlockPostingsForMatchersCacheForce bool
}
type BlocksToDeleteFunc func(blocks []*Block) map[ulid.ULID]struct{}
@ -442,6 +482,7 @@ func (db *DBReadOnly) FlushWAL(dir string) (returnErr error) {
ExponentialBlockRanges(DefaultOptions().MinBlockDuration, 3, 5),
chunkenc.NewPool(),
nil,
false,
)
if err != nil {
return errors.Wrap(err, "create leveled compactor")
@ -551,7 +592,7 @@ func (db *DBReadOnly) Blocks() ([]BlockReader, error) {
return nil, ErrClosed
default:
}
loadable, corrupted, err := openBlocks(db.logger, db.dir, nil, nil)
loadable, corrupted, err := openBlocks(db.logger, db.dir, nil, nil, nil, defaultPostingsForMatchersCacheTTL, defaultPostingsForMatchersCacheSize, false)
if err != nil {
return nil, err
}
@ -691,6 +732,9 @@ func validateOpts(opts *Options, rngs []int64) (*Options, []int64) {
if opts.HeadChunksWriteBufferSize <= 0 {
opts.HeadChunksWriteBufferSize = chunks.DefaultWriteBufferSize
}
if opts.HeadChunksEndTimeVariance <= 0 {
opts.HeadChunksEndTimeVariance = 0
}
if opts.HeadChunksWriteQueueSize < 0 {
opts.HeadChunksWriteQueueSize = chunks.DefaultWriteQueueSize
}
@ -803,7 +847,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
}
ctx, cancel := context.WithCancel(context.Background())
db.compactor, err = NewLeveledCompactorWithChunkSize(ctx, r, l, rngs, db.chunkPool, opts.MaxBlockChunkSegmentSize, nil)
db.compactor, err = NewLeveledCompactorWithChunkSize(ctx, r, l, rngs, db.chunkPool, opts.MaxBlockChunkSegmentSize, nil, opts.AllowOverlappingCompaction)
if err != nil {
cancel()
return nil, errors.Wrap(err, "create leveled compactor")
@ -840,6 +884,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
headOpts.ChunkDirRoot = dir
headOpts.ChunkPool = db.chunkPool
headOpts.ChunkWriteBufferSize = opts.HeadChunksWriteBufferSize
headOpts.ChunkEndTimeVariance = opts.HeadChunksEndTimeVariance
headOpts.ChunkWriteQueueSize = opts.HeadChunksWriteQueueSize
headOpts.SamplesPerChunk = opts.SamplesPerChunk
headOpts.StripeSize = opts.StripeSize
@ -850,6 +895,9 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
headOpts.EnableNativeHistograms.Store(opts.EnableNativeHistograms)
headOpts.OutOfOrderTimeWindow.Store(opts.OutOfOrderTimeWindow)
headOpts.OutOfOrderCapMax.Store(opts.OutOfOrderCapMax)
headOpts.PostingsForMatchersCacheTTL = opts.HeadPostingsForMatchersCacheTTL
headOpts.PostingsForMatchersCacheSize = opts.HeadPostingsForMatchersCacheSize
headOpts.PostingsForMatchersCacheForce = opts.HeadPostingsForMatchersCacheForce
if opts.WALReplayConcurrency > 0 {
headOpts.WALReplayConcurrency = opts.WALReplayConcurrency
}
@ -1390,7 +1438,7 @@ func (db *DB) reloadBlocks() (err error) {
db.mtx.Lock()
defer db.mtx.Unlock()
loadable, corrupted, err := openBlocks(db.logger, db.dir, db.blocks, db.chunkPool)
loadable, corrupted, err := openBlocks(db.logger, db.dir, db.blocks, db.chunkPool, db.opts.SeriesHashCache, db.opts.BlockPostingsForMatchersCacheTTL, db.opts.BlockPostingsForMatchersCacheSize, db.opts.BlockPostingsForMatchersCacheForce)
if err != nil {
return err
}
@ -1458,7 +1506,7 @@ func (db *DB) reloadBlocks() (err error) {
blockMetas = append(blockMetas, b.Meta())
}
if overlaps := OverlappingBlocks(blockMetas); len(overlaps) > 0 {
level.Warn(db.logger).Log("msg", "Overlapping blocks found during reloadBlocks", "detail", overlaps.String())
level.Debug(db.logger).Log("msg", "Overlapping blocks found during reloadBlocks", "detail", overlaps.String())
}
// Append blocks to old, deletable blocks, so we can close them.
@ -1473,7 +1521,7 @@ func (db *DB) reloadBlocks() (err error) {
return nil
}
func openBlocks(l log.Logger, dir string, loaded []*Block, chunkPool chunkenc.Pool) (blocks []*Block, corrupted map[ulid.ULID]error, err error) {
func openBlocks(l log.Logger, dir string, loaded []*Block, chunkPool chunkenc.Pool, cache *hashcache.SeriesHashCache, postingsCacheTTL time.Duration, postingsCacheSize int, postingsCacheForce bool) (blocks []*Block, corrupted map[ulid.ULID]error, err error) {
bDirs, err := blockDirs(dir)
if err != nil {
return nil, nil, errors.Wrap(err, "find blocks")
@ -1490,7 +1538,12 @@ func openBlocks(l log.Logger, dir string, loaded []*Block, chunkPool chunkenc.Po
// See if we already have the block in memory or open it otherwise.
block, open := getBlock(loaded, meta.ULID)
if !open {
block, err = OpenBlock(l, bDir, chunkPool)
var cacheProvider index.ReaderCacheProvider
if cache != nil {
cacheProvider = cache.GetBlockCacheProvider(meta.ULID.String())
}
block, err = OpenBlockWithOptions(l, bDir, chunkPool, cacheProvider, postingsCacheTTL, postingsCacheSize, postingsCacheForce)
if err != nil {
corrupted[meta.ULID] = err
continue
@ -1739,7 +1792,7 @@ func (db *DB) inOrderBlocksMaxTime() (maxt int64, ok bool) {
maxt, ok = int64(math.MinInt64), false
// If blocks are overlapping, last block might not have the max time. So check all blocks.
for _, b := range db.Blocks() {
if !b.meta.Compaction.FromOutOfOrder() && b.meta.MaxTime > maxt {
if !b.meta.OutOfOrder && !b.meta.Compaction.FromOutOfOrder() && b.meta.MaxTime > maxt {
ok = true
maxt = b.meta.MaxTime
}
@ -1990,6 +2043,16 @@ func (db *DB) ChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQu
return storage.NewMergeChunkQuerier(blockQueriers, nil, storage.NewCompactingChunkSeriesMerger(storage.ChainedSeriesMerge)), nil
}
// UnorderedChunkQuerier returns a new chunk querier over the data partition for the given time range.
// The chunks can be overlapping and not sorted.
func (db *DB) UnorderedChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
blockQueriers, err := db.blockChunkQuerierForRange(mint, maxt)
if err != nil {
return nil, err
}
return storage.NewMergeChunkQuerier(blockQueriers, nil, storage.NewConcatenatingChunkSeriesMerger()), nil
}
func (db *DB) ExemplarQuerier(ctx context.Context) (storage.ExemplarQuerier, error) {
return db.head.exemplars.ExemplarQuerier(ctx)
}

View file

@ -62,7 +62,14 @@ func TestMain(m *testing.M) {
flag.Parse()
defaultIsolationDisabled = !isolationEnabled
goleak.VerifyTestMain(m, goleak.IgnoreTopFunction("github.com/prometheus/prometheus/tsdb.(*SegmentWAL).cut.func1"), goleak.IgnoreTopFunction("github.com/prometheus/prometheus/tsdb.(*SegmentWAL).cut.func2"))
goleak.VerifyTestMain(m,
goleak.IgnoreTopFunction("github.com/prometheus/prometheus/tsdb.(*SegmentWAL).cut.func1"),
goleak.IgnoreTopFunction("github.com/prometheus/prometheus/tsdb.(*SegmentWAL).cut.func2"),
// Ignore "ristretto" and its dependency "glog".
goleak.IgnoreTopFunction("github.com/dgraph-io/ristretto.(*defaultPolicy).processItems"),
goleak.IgnoreTopFunction("github.com/dgraph-io/ristretto.(*Cache).processItems"),
goleak.IgnoreTopFunction("github.com/golang/glog.(*fileSink).flushDaemon"),
)
}
func openTestDB(t testing.TB, opts *Options, rngs []int64) (db *DB) {
@ -5460,13 +5467,13 @@ func TestOOOMmapCorruption(t *testing.T) {
addSamples(120, 120, false)
// Second m-map file. We will corrupt this file. Sample 120 goes into this new file.
db.head.chunkDiskMapper.CutNewFile()
require.NoError(t, db.head.chunkDiskMapper.CutNewFile())
// More OOO samples.
addSamples(200, 230, false)
addSamples(240, 255, false)
db.head.chunkDiskMapper.CutNewFile()
require.NoError(t, db.head.chunkDiskMapper.CutNewFile())
addSamples(260, 290, false)
verifySamples := func(expSamples []tsdbutil.Sample) {

View file

@ -0,0 +1,192 @@
package hashcache
import (
"sync"
"go.uber.org/atomic"
"github.com/prometheus/prometheus/storage"
)
const (
	// numGenerations is the number of cache generations kept; on GC the oldest
	// generation is dropped and a fresh one becomes current.
	numGenerations = 4

	// approxBytesPerEntry is the estimated memory footprint (in bytes) of 1 cache
	// entry, measured with TestSeriesHashCache_MeasureApproximateSizePerEntry().
	approxBytesPerEntry = 28
)

// SeriesHashCache is a bounded cache mapping the per-block series ID with
// its labels hash.
type SeriesHashCache struct {
	// maxEntriesPerGeneration bounds how many entries the current generation may
	// hold before a GC rotation is triggered.
	maxEntriesPerGeneration uint64

	// generationsMx guards generations; index 0 is the current generation,
	// higher indexes are progressively older ones.
	generationsMx sync.RWMutex
	generations   [numGenerations]cacheGeneration
}
// NewSeriesHashCache returns a SeriesHashCache whose total size is bounded to
// approximately maxBytes, split evenly across its generations.
func NewSeriesHashCache(maxBytes uint64) *SeriesHashCache {
	perGeneration := maxBytes / approxBytesPerEntry / numGenerations
	if perGeneration == 0 {
		// Guarantee at least one entry per generation.
		perGeneration = 1
	}

	cache := &SeriesHashCache{maxEntriesPerGeneration: perGeneration}

	// Initialise every generation with an empty block map and a zero counter.
	for idx := range cache.generations {
		cache.generations[idx] = cacheGeneration{
			blocks: &sync.Map{},
			length: atomic.NewUint64(0),
		}
	}

	return cache
}
// GetBlockCache returns a reference to the series hash cache for the provided blockID.
// The returned cache reference should be retained only for a short period (ie. the duration
// of the execution of 1 single query).
func (c *SeriesHashCache) GetBlockCache(blockID string) *BlockSeriesHashCache {
	blockCache := &BlockSeriesHashCache{}
	c.generationsMx.RLock()
	defer c.generationsMx.RUnlock()
	// Trigger a garbage collection if the current generation reached the max size.
	if c.generations[0].length.Load() >= c.maxEntriesPerGeneration {
		// gc() takes the write lock, so the read lock held here is released
		// and re-acquired around it; the deferred RUnlock above pairs with
		// the RLock re-taken below.
		c.generationsMx.RUnlock()
		c.gc()
		c.generationsMx.RLock()
	}
	for idx := 0; idx < numGenerations; idx++ {
		gen := c.generations[idx]
		if value, ok := gen.blocks.Load(blockID); ok {
			blockCache.generations[idx] = value.(*blockCacheGeneration)
			continue
		}
		// Create a new per-block cache only for the current generation.
		// If the cache for the older generation doesn't exist, then its
		// value will be null and skipped when reading.
		if idx == 0 {
			value, _ := gen.blocks.LoadOrStore(blockID, newBlockCacheGeneration(gen.length))
			blockCache.generations[idx] = value.(*blockCacheGeneration)
		}
	}
	return blockCache
}
// GetBlockCacheProvider returns a cache provider bounded to the provided blockID.
func (c *SeriesHashCache) GetBlockCacheProvider(blockID string) *BlockSeriesHashCacheProvider {
	provider := NewBlockSeriesHashCacheProvider(c, blockID)
	return provider
}
// gc rotates the cache generations: every generation shifts one slot towards
// "old" (dropping the oldest) and a fresh, empty current generation (index 0)
// is installed. Called when the current generation reaches its max size.
func (c *SeriesHashCache) gc() {
	c.generationsMx.Lock()
	defer c.generationsMx.Unlock()
	// Make sure no other goroutines already GCed the current generation.
	if c.generations[0].length.Load() < c.maxEntriesPerGeneration {
		return
	}
	// Shift the current generation to old.
	for idx := numGenerations - 2; idx >= 0; idx-- {
		c.generations[idx+1] = c.generations[idx]
	}
	// Initialise a new empty current generation.
	c.generations[0] = cacheGeneration{
		blocks: &sync.Map{},
		length: atomic.NewUint64(0),
	}
}
// cacheGeneration holds a multi-blocks cache generation.
type cacheGeneration struct {
	// blocks maps the block ID with blockCacheGeneration.
	blocks *sync.Map

	// Keeps track of the number of items added to the cache. This counter
	// is passed to each blockCacheGeneration belonging to this generation.
	length *atomic.Uint64
}

// blockCacheGeneration holds a per-block cache generation.
type blockCacheGeneration struct {
	// hashes maps per-block series ID with its hash.
	// hashesMx guards hashes against concurrent access.
	hashesMx sync.RWMutex
	hashes   map[storage.SeriesRef]uint64

	// Keeps track of the number of items added to the cache. This counter is
	// shared with all blockCacheGeneration in the "parent" cacheGeneration.
	length *atomic.Uint64
}

// newBlockCacheGeneration returns an empty per-block cache generation whose
// entry counter is the shared counter of the parent cacheGeneration.
func newBlockCacheGeneration(length *atomic.Uint64) *blockCacheGeneration {
	return &blockCacheGeneration{
		hashes: make(map[storage.SeriesRef]uint64),
		length: length,
	}
}
// BlockSeriesHashCache is a view over all generations of a single block's
// series hash cache, as returned by SeriesHashCache.GetBlockCache().
type BlockSeriesHashCache struct {
	// generations[0] is the current generation; older entries may be nil when
	// the block had no cache in that generation.
	generations [numGenerations]*blockCacheGeneration
}
// Fetch returns the cached hash of the given seriesID together with a boolean
// telling whether the series was found in any generation of the cache.
func (c *BlockSeriesHashCache) Fetch(seriesID storage.SeriesRef) (uint64, bool) {
	// Scan generations from the most recent (index 0) to the oldest.
	for _, gen := range c.generations {
		// A nil entry means the block had no cache for that generation.
		if gen == nil {
			continue
		}

		gen.hashesMx.RLock()
		hash, found := gen.hashes[seriesID]
		gen.hashesMx.RUnlock()

		if found {
			return hash, true
		}
	}

	return 0, false
}
// Store saves the hash of the given seriesID into the cache's most recent
// generation (index 0) and bumps the generation's shared entry counter.
func (c *BlockSeriesHashCache) Store(seriesID storage.SeriesRef, hash uint64) {
	current := c.generations[0]

	current.hashesMx.Lock()
	current.hashes[seriesID] = hash
	current.hashesMx.Unlock()

	current.length.Add(1)
}
// BlockSeriesHashCacheProvider binds a SeriesHashCache to a single block ID so
// callers can obtain the per-block cache without knowing the block ID.
type BlockSeriesHashCacheProvider struct {
	cache   *SeriesHashCache
	blockID string
}

// NewBlockSeriesHashCacheProvider makes a new BlockSeriesHashCacheProvider.
func NewBlockSeriesHashCacheProvider(cache *SeriesHashCache, blockID string) *BlockSeriesHashCacheProvider {
	return &BlockSeriesHashCacheProvider{
		cache:   cache,
		blockID: blockID,
	}
}

// SeriesHashCache returns a reference to the cache bounded to block provided
// to NewBlockSeriesHashCacheProvider().
func (p *BlockSeriesHashCacheProvider) SeriesHashCache() *BlockSeriesHashCache {
	return p.cache.GetBlockCache(p.blockID)
}

View file

@ -0,0 +1,137 @@
package hashcache
import (
"crypto/rand"
"fmt"
"runtime"
"strconv"
"sync"
"testing"
"github.com/oklog/ulid"
"github.com/stretchr/testify/require"
"github.com/prometheus/prometheus/storage"
)
// TestSeriesHashCache verifies basic Store/Fetch behaviour across multiple
// blocks and that entries survive re-fetching the block caches, while GC is
// exercised by the tiny cache size.
func TestSeriesHashCache(t *testing.T) {
	// Set the max cache size to store at most 1 entry per generation,
	// so that we test the GC logic too.
	c := NewSeriesHashCache(numGenerations * approxBytesPerEntry)
	block1 := c.GetBlockCache("1")
	assertFetch(t, block1, 1, 0, false)
	block1.Store(1, 100)
	assertFetch(t, block1, 1, 100, true)
	block2 := c.GetBlockCache("2")
	assertFetch(t, block2, 1, 0, false)
	block2.Store(1, 1000)
	assertFetch(t, block2, 1, 1000, true)
	block3 := c.GetBlockCache("3")
	assertFetch(t, block1, 1, 100, true)
	assertFetch(t, block2, 1, 1000, true)
	assertFetch(t, block3, 1, 0, false)
	// Get again the block caches.
	block1 = c.GetBlockCache("1")
	block2 = c.GetBlockCache("2")
	block3 = c.GetBlockCache("3")
	assertFetch(t, block1, 1, 100, true)
	assertFetch(t, block2, 1, 1000, true)
	assertFetch(t, block3, 1, 0, false)
}
// TestSeriesHashCache_MeasureApproximateSizePerEntry measures the heap cost of
// one cache entry and fails when the approxBytesPerEntry constant drifts from
// the measured value.
func TestSeriesHashCache_MeasureApproximateSizePerEntry(t *testing.T) {
	// This test measures the approximate size (in bytes) per cache entry.
	// We only take in account the memory used by the map, which is the largest amount.
	const numEntries = 100000
	c := NewSeriesHashCache(1024 * 1024 * 1024)
	b := c.GetBlockCache(ulid.MustNew(0, rand.Reader).String())
	before := runtime.MemStats{}
	runtime.ReadMemStats(&before)
	// Preallocate the map in order to not account for re-allocations
	// since we want to measure the heap utilization and not allocations.
	b.generations[0].hashes = make(map[storage.SeriesRef]uint64, numEntries)
	for i := uint64(0); i < numEntries; i++ {
		b.Store(storage.SeriesRef(i), i)
	}
	after := runtime.MemStats{}
	runtime.ReadMemStats(&after)
	t.Logf("approximate size per entry: %d bytes", (after.TotalAlloc-before.TotalAlloc)/numEntries)
	require.Equal(t, uint64(approxBytesPerEntry), (after.TotalAlloc-before.TotalAlloc)/numEntries, "approxBytesPerEntry constant is out date")
}
// TestSeriesHashCache_Concurrency stress tests concurrent Store()/Fetch()
// calls (including GC rotations, forced by the tiny cache size) from many
// goroutines.
func TestSeriesHashCache_Concurrency(t *testing.T) {
	const (
		concurrency   = 100
		numIterations = 10000
		numBlocks     = 10
	)

	// Set the max cache size to store at most 10 entries per generation,
	// so that we stress test the GC too.
	c := NewSeriesHashCache(10 * numGenerations * approxBytesPerEntry)

	// require must not be used from the spawned goroutines: it calls
	// t.FailNow(), which is only safe on the goroutine running the test.
	// Each worker records its first failure here and the test goroutine
	// asserts after all workers are done.
	errs := make(chan error, concurrency)

	wg := sync.WaitGroup{}
	wg.Add(concurrency)

	for i := 0; i < concurrency; i++ {
		go func() {
			defer wg.Done()

			for n := 0; n < numIterations; n++ {
				blockID := strconv.Itoa(n % numBlocks)

				blockCache := c.GetBlockCache(blockID)
				blockCache.Store(storage.SeriesRef(n), uint64(n))

				actual, ok := blockCache.Fetch(storage.SeriesRef(n))
				if !ok || actual != uint64(n) {
					errs <- fmt.Errorf("fetch series %d from block %s: got value %d, found %t, want value %d, found true", n, blockID, actual, ok, n)
					return
				}
			}
		}()
	}

	wg.Wait()
	close(errs)

	for err := range errs {
		require.NoError(t, err)
	}
}
// BenchmarkSeriesHashCache_StoreAndFetch measures mixed Store/Fetch throughput
// for an increasing number of block caches sharing a single SeriesHashCache.
func BenchmarkSeriesHashCache_StoreAndFetch(b *testing.B) {
	for _, numBlocks := range []int{1, 10, 100, 1000, 10000} {
		b.Run(fmt.Sprintf("blocks=%d", numBlocks), func(b *testing.B) {
			c := NewSeriesHashCache(1024 * 1024)

			// In this benchmark we assume the usage pattern is calling Fetch() and Store() will be
			// orders of magnitude more frequent than GetBlockCache(), so we call GetBlockCache() just
			// once per block.
			caches := make([]*BlockSeriesHashCache, 0, numBlocks)
			for blockIdx := 0; blockIdx < numBlocks; blockIdx++ {
				caches = append(caches, c.GetBlockCache(strconv.Itoa(blockIdx)))
			}

			// In this benchmark we assume the ratio between Store() and Fetch() is 1:10.
			storeOps := (b.N / 10) + 1

			for iter := 0; iter < b.N; iter++ {
				target := caches[iter%numBlocks]
				if iter < storeOps {
					target.Store(storage.SeriesRef(iter), uint64(iter))
				} else {
					target.Fetch(storage.SeriesRef(iter % storeOps))
				}
			}
		})
	}
}
// assertFetch asserts that fetching seriesID from the given block cache
// returns the expected value and found flag.
func assertFetch(t *testing.T, c *BlockSeriesHashCache, seriesID storage.SeriesRef, expectedValue uint64, expectedOk bool) {
	// Mark as a helper so failures are reported at the caller's line.
	t.Helper()

	actualValue, actualOk := c.Fetch(seriesID)
	require.Equal(t, expectedValue, actualValue)
	require.Equal(t, expectedOk, actualOk)
}

View file

@ -64,6 +64,20 @@ var (
defaultWALReplayConcurrency = runtime.GOMAXPROCS(0)
)
// chunkDiskMapper is a temporary interface while we transition from
// 0 size queue to queue based chunk disk mapper.
// It captures the subset of *chunks.ChunkDiskMapper methods used by the Head
// (the Head.chunkDiskMapper field is typed as this interface), so alternative
// implementations can be substituted.
type chunkDiskMapper interface {
	// NOTE(review): the per-method semantics below are not visible here;
	// see chunks.ChunkDiskMapper for the authoritative contracts.
	CutNewFile() (returnErr error)
	IterateAllChunks(f func(seriesRef chunks.HeadSeriesRef, chunkRef chunks.ChunkDiskMapperRef, mint, maxt int64, numSamples uint16, encoding chunkenc.Encoding, isOOO bool) error) (err error)
	Truncate(fileNo uint32) error
	DeleteCorrupted(originalErr error) error
	Size() (int64, error)
	Close() error
	Chunk(ref chunks.ChunkDiskMapperRef) (chunkenc.Chunk, error)
	// WriteChunk is asynchronous: completion is signalled through the callback.
	WriteChunk(seriesRef chunks.HeadSeriesRef, mint, maxt int64, chk chunkenc.Chunk, isOOO bool, callback func(err error)) (chkRef chunks.ChunkDiskMapperRef)
	IsQueueEmpty() bool
}
// Head handles reads and writes of time series data within a time window.
type Head struct {
chunkRange atomic.Int64
@ -101,6 +115,7 @@ type Head struct {
// TODO(codesome): Extend MemPostings to return only OOOPostings, Set OOOStatus, ... Like an additional map of ooo postings.
postings *index.MemPostings // Postings lists for terms.
pfmc *PostingsForMatchersCache
tombstones *tombstones.MemTombstones
@ -111,7 +126,7 @@ type Head struct {
lastPostingsStatsCall time.Duration // Last posting stats call (PostingsCardinalityStats()) time for caching.
// chunkDiskMapper is used to write and read Head chunks to/from disk.
chunkDiskMapper *chunks.ChunkDiskMapper
chunkDiskMapper chunkDiskMapper
chunkSnapshotMtx sync.Mutex
@ -150,6 +165,7 @@ type HeadOptions struct {
ChunkDirRoot string
ChunkPool chunkenc.Pool
ChunkWriteBufferSize int
ChunkEndTimeVariance float64
ChunkWriteQueueSize int
SamplesPerChunk int
@ -164,6 +180,10 @@ type HeadOptions struct {
IsolationDisabled bool
PostingsForMatchersCacheTTL time.Duration
PostingsForMatchersCacheSize int
PostingsForMatchersCacheForce bool
// Maximum number of CPUs that can simultaneously processes WAL replay.
// The default value is GOMAXPROCS.
// If it is set to a negative value or zero, the default value is used.
@ -179,16 +199,20 @@ const (
func DefaultHeadOptions() *HeadOptions {
ho := &HeadOptions{
ChunkRange: DefaultBlockDuration,
ChunkDirRoot: "",
ChunkPool: chunkenc.NewPool(),
ChunkWriteBufferSize: chunks.DefaultWriteBufferSize,
ChunkWriteQueueSize: chunks.DefaultWriteQueueSize,
SamplesPerChunk: DefaultSamplesPerChunk,
StripeSize: DefaultStripeSize,
SeriesCallback: &noopSeriesLifecycleCallback{},
IsolationDisabled: defaultIsolationDisabled,
WALReplayConcurrency: defaultWALReplayConcurrency,
ChunkRange: DefaultBlockDuration,
ChunkDirRoot: "",
ChunkPool: chunkenc.NewPool(),
ChunkWriteBufferSize: chunks.DefaultWriteBufferSize,
ChunkEndTimeVariance: 0,
ChunkWriteQueueSize: chunks.DefaultWriteQueueSize,
SamplesPerChunk: DefaultSamplesPerChunk,
StripeSize: DefaultStripeSize,
SeriesCallback: &noopSeriesLifecycleCallback{},
IsolationDisabled: defaultIsolationDisabled,
PostingsForMatchersCacheTTL: defaultPostingsForMatchersCacheTTL,
PostingsForMatchersCacheSize: defaultPostingsForMatchersCacheSize,
PostingsForMatchersCacheForce: false,
WALReplayConcurrency: defaultWALReplayConcurrency,
}
ho.OutOfOrderCapMax.Store(DefaultOutOfOrderCapMax)
return ho
@ -254,6 +278,8 @@ func NewHead(r prometheus.Registerer, l log.Logger, wal, wbl *wlog.WL, opts *Hea
},
stats: stats,
reg: r,
pfmc: NewPostingsForMatchersCache(opts.PostingsForMatchersCacheTTL, opts.PostingsForMatchersCacheSize, opts.PostingsForMatchersCacheForce),
}
if err := h.resetInMemoryState(); err != nil {
return nil, err
@ -497,6 +523,7 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
m.checkpointCreationTotal,
m.mmapChunkCorruptionTotal,
m.snapshotReplayErrorTotal,
m.oooHistogram,
// Metrics bound to functions and not needed in tests
// can be created and registered on the spot.
prometheus.NewGaugeFunc(prometheus.GaugeOpts{
@ -1427,7 +1454,7 @@ func (h *Head) Delete(mint, maxt int64, ms ...*labels.Matcher) error {
ir := h.indexRange(mint, maxt)
p, err := PostingsForMatchers(ir, ms...)
p, err := ir.PostingsForMatchers(false, ms...)
if err != nil {
return errors.Wrap(err, "select series")
}
@ -1614,7 +1641,7 @@ func (h *Head) getOrCreate(hash uint64, lset labels.Labels) (*memSeries, bool, e
func (h *Head) getOrCreateWithID(id chunks.HeadSeriesRef, hash uint64, lset labels.Labels) (*memSeries, bool, error) {
s, created, err := h.series.getOrSet(hash, lset, func() *memSeries {
return newMemSeries(lset, id, h.opts.IsolationDisabled)
return newMemSeries(lset, id, labels.StableHash(lset), h.opts.ChunkEndTimeVariance, h.opts.IsolationDisabled)
})
if err != nil {
return nil, false, err
@ -1905,6 +1932,9 @@ type memSeries struct {
lset labels.Labels
meta *metadata.Metadata
// Series labels hash to use for sharding purposes.
shardHash uint64
// Immutable chunks on disk that have not yet gone into a block, in order of ascending time stamps.
// When compaction runs, chunks get moved into a block and all pointers are shifted like so:
//
@ -1922,6 +1952,10 @@ type memSeries struct {
mmMaxTime int64 // Max time of any mmapped chunk, only used during WAL replay.
// chunkEndTimeVariance is how much variance (between 0 and 1) should be applied to the chunk end time,
// to spread chunks writing across time. Doesn't apply to the last chunk of the chunk range. 0 to disable variance.
chunkEndTimeVariance float64
nextAt int64 // Timestamp at which to cut the next chunk.
// We keep the last value here (in addition to appending it to the chunk) so we can check for duplicates.
@ -1950,11 +1984,13 @@ type memSeriesOOOFields struct {
firstOOOChunkID chunks.HeadChunkID // HeadOOOChunkID for oooMmappedChunks[0].
}
func newMemSeries(lset labels.Labels, id chunks.HeadSeriesRef, isolationDisabled bool) *memSeries {
func newMemSeries(lset labels.Labels, id chunks.HeadSeriesRef, shardHash uint64, chunkEndTimeVariance float64, isolationDisabled bool) *memSeries {
s := &memSeries{
lset: lset,
ref: id,
nextAt: math.MinInt64,
lset: lset,
ref: id,
nextAt: math.MinInt64,
chunkEndTimeVariance: chunkEndTimeVariance,
shardHash: shardHash,
}
if !isolationDisabled {
s.txs = newTxRing(4)

View file

@ -187,6 +187,13 @@ func (h *Head) AppendableMinValidTime() (int64, bool) {
return h.appendableMinValidTime(), true
}
// min returns the smaller of the two given int64 values.
func min(a, b int64) int64 {
	if b < a {
		return b
	}
	return a
}
func max(a, b int64) int64 {
if a > b {
return a
@ -1099,7 +1106,7 @@ func (a *headAppender) Commit() (err error) {
}
// insert is like append, except it inserts. Used for OOO samples.
func (s *memSeries) insert(t int64, v float64, chunkDiskMapper *chunks.ChunkDiskMapper, oooCapMax int64) (inserted, chunkCreated bool, mmapRef chunks.ChunkDiskMapperRef) {
func (s *memSeries) insert(t int64, v float64, chunkDiskMapper chunkDiskMapper, oooCapMax int64) (inserted, chunkCreated bool, mmapRef chunks.ChunkDiskMapperRef) {
if s.ooo == nil {
s.ooo = &memSeriesOOOFields{}
}
@ -1124,7 +1131,7 @@ func (s *memSeries) insert(t int64, v float64, chunkDiskMapper *chunks.ChunkDisk
// chunkOpts are chunk-level options that are passed when appending to a memSeries.
type chunkOpts struct {
chunkDiskMapper *chunks.ChunkDiskMapper
chunkDiskMapper chunkDiskMapper
chunkRange int64
samplesPerChunk int
}
@ -1310,7 +1317,10 @@ func (s *memSeries) appendPreprocessor(t int64, e chunkenc.Encoding, o chunkOpts
// the remaining chunks in the current chunk range.
// At latest it must happen at the timestamp set when the chunk was cut.
if numSamples == o.samplesPerChunk/4 {
s.nextAt = computeChunkEndTime(c.minTime, c.maxTime, s.nextAt)
maxNextAt := s.nextAt
s.nextAt = computeChunkEndTime(c.minTime, c.maxTime, maxNextAt)
s.nextAt = addJitterToChunkEndTime(s.shardHash, c.minTime, s.nextAt, maxNextAt, s.chunkEndTimeVariance)
}
// If numSamples > samplesPerChunk*2 then our previous prediction was invalid,
// most likely because samples rate has changed and now they are arriving more frequently.
@ -1338,8 +1348,32 @@ func computeChunkEndTime(start, cur, max int64) int64 {
return start + (max-start)/n
}
// addJitterToChunkEndTime return chunk's nextAt applying a jitter based on the provided expected variance.
// The variance is applied to the estimated chunk duration (nextAt - chunkMinTime); the returned updated chunk
// end time is guaranteed to be between "chunkDuration - (chunkDuration*(variance/2))" to
// "chunkDuration + chunkDuration*(variance/2)", and never greater than maxNextAt.
func addJitterToChunkEndTime(seriesHash uint64, chunkMinTime, nextAt, maxNextAt int64, variance float64) int64 {
	if variance <= 0 {
		return nextAt
	}

	// Do not apply the jitter if the chunk is expected to be the last one of the chunk range.
	if nextAt >= maxNextAt {
		return nextAt
	}

	// Compute the variance to apply to the chunk end time. The variance is based on the series hash so that
	// different TSDBs ingesting the same exact samples (e.g. in a distributed system like Mimir) will have
	// the same chunks for a given period.
	chunkDuration := nextAt - chunkMinTime
	chunkDurationMaxVariance := int64(float64(chunkDuration) * variance)

	// Guard against a zero (or negative) computed variance: for very short
	// chunk durations the float product truncates to 0 and the modulo below
	// would panic with an integer division by zero. In that case there is no
	// room for jitter, so return nextAt unchanged.
	if chunkDurationMaxVariance <= 0 {
		return nextAt
	}

	chunkDurationVariance := int64(seriesHash % uint64(chunkDurationMaxVariance))

	// Clamp to maxNextAt so the jittered end time never exceeds the chunk range.
	jittered := nextAt + chunkDurationVariance - (chunkDurationMaxVariance / 2)
	if jittered > maxNextAt {
		return maxNextAt
	}
	return jittered
}
func (s *memSeries) cutNewHeadChunk(
mint int64, e chunkenc.Encoding, chunkDiskMapper *chunks.ChunkDiskMapper, chunkRange int64,
mint int64, e chunkenc.Encoding, chunkDiskMapper chunkDiskMapper, chunkRange int64,
) *memChunk {
s.mmapCurrentHeadChunk(chunkDiskMapper)
@ -1372,7 +1406,7 @@ func (s *memSeries) cutNewHeadChunk(
// cutNewOOOHeadChunk cuts a new OOO chunk and m-maps the old chunk.
// The caller must ensure that s.ooo is not nil.
func (s *memSeries) cutNewOOOHeadChunk(mint int64, chunkDiskMapper *chunks.ChunkDiskMapper) (*oooHeadChunk, chunks.ChunkDiskMapperRef) {
func (s *memSeries) cutNewOOOHeadChunk(mint int64, chunkDiskMapper chunkDiskMapper) (*oooHeadChunk, chunks.ChunkDiskMapperRef) {
ref := s.mmapCurrentOOOHeadChunk(chunkDiskMapper)
s.ooo.oooHeadChunk = &oooHeadChunk{
@ -1384,7 +1418,7 @@ func (s *memSeries) cutNewOOOHeadChunk(mint int64, chunkDiskMapper *chunks.Chunk
return s.ooo.oooHeadChunk, ref
}
func (s *memSeries) mmapCurrentOOOHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper) chunks.ChunkDiskMapperRef {
func (s *memSeries) mmapCurrentOOOHeadChunk(chunkDiskMapper chunkDiskMapper) chunks.ChunkDiskMapperRef {
if s.ooo == nil || s.ooo.oooHeadChunk == nil {
// There is no head chunk, so nothing to m-map here.
return 0
@ -1401,7 +1435,7 @@ func (s *memSeries) mmapCurrentOOOHeadChunk(chunkDiskMapper *chunks.ChunkDiskMap
return chunkRef
}
func (s *memSeries) mmapCurrentHeadChunk(chunkDiskMapper *chunks.ChunkDiskMapper) {
func (s *memSeries) mmapCurrentHeadChunk(chunkDiskMapper chunkDiskMapper) {
if s.headChunk == nil || s.headChunk.chunk.NumSamples() == 0 {
// There is no head chunk, so nothing to m-map here.
return

66
tsdb/head_append_test.go Normal file
View file

@ -0,0 +1,66 @@
package tsdb
import (
"testing"
"github.com/stretchr/testify/require"
)
// TestAddJitterToChunkEndTime_ShouldHonorMaxVarianceAndMaxNextAt checks that,
// across many series hashes, the jittered end time never drops below
// nextAt - maxVariance/2 and never exceeds maxNextAt.
func TestAddJitterToChunkEndTime_ShouldHonorMaxVarianceAndMaxNextAt(t *testing.T) {
	const (
		chunkMinTime = int64(10)
		nextAt       = int64(95)
		maxNextAt    = int64(100)
		variance     = 0.2
	)

	// Compute the expected max variance.
	maxAllowedVariance := int64(float64(nextAt-chunkMinTime) * variance)

	for hash := uint64(0); hash < 1000; hash++ {
		jittered := addJitterToChunkEndTime(hash, chunkMinTime, nextAt, maxNextAt, variance)
		require.GreaterOrEqual(t, jittered, nextAt-(maxAllowedVariance/2))
		require.LessOrEqual(t, jittered, maxNextAt)
	}
}
// TestAddJitterToChunkEndTime_Distribution checks that the jitter applied over
// a contiguous range of series hashes is uniformly distributed within the
// allowed variance window.
func TestAddJitterToChunkEndTime_Distribution(t *testing.T) {
	const (
		chunkMinTime = int64(0)
		nextAt       = int64(50)
		maxNextAt    = int64(100)
		variance     = 0.2
		numSeries    = uint64(1000)
	)

	// Compute the expected max variance.
	expectedMaxVariance := int64(float64(nextAt-chunkMinTime) * variance)

	// Keep track of the distribution of the applied variance.
	// (Renamed from `variance` in the loop to avoid shadowing the constant.)
	appliedDistribution := map[int64]int64{}

	for seriesHash := uint64(0); seriesHash < numSeries; seriesHash++ {
		jittered := addJitterToChunkEndTime(seriesHash, chunkMinTime, nextAt, maxNextAt, variance)
		require.GreaterOrEqual(t, jittered, nextAt-(expectedMaxVariance/2))
		require.LessOrEqual(t, jittered, nextAt+(expectedMaxVariance/2))
		require.LessOrEqual(t, jittered, maxNextAt)

		appliedVariance := nextAt - jittered
		appliedDistribution[appliedVariance]++
	}

	// Ensure a uniform distribution.
	for appliedVariance, count := range appliedDistribution {
		require.Equalf(t, int64(numSeries)/expectedMaxVariance, count, "variance = %d", appliedVariance)
	}
}
func TestAddJitterToChunkEndTime_ShouldNotApplyJitterToTheLastChunkOfTheRange(t *testing.T) {
// Since the jitter could also be 0, we try it for multiple series.
for seriesHash := uint64(0); seriesHash < 10; seriesHash++ {
require.Equal(t, int64(200), addJitterToChunkEndTime(seriesHash, 150, 200, 200, 0.2))
}
}
func TestAddJitterToChunkEndTime_ShouldNotApplyJitterIfDisabled(t *testing.T) {
// Since the jitter could also be 0, we try it for multiple series.
for seriesHash := uint64(0); seriesHash < 10; seriesHash++ {
require.Equal(t, int64(130), addJitterToChunkEndTime(seriesHash, 100, 130, 200, 0))
}
}

View file

@ -120,6 +120,10 @@ func (h *headIndexReader) Postings(name string, values ...string) (index.Posting
}
}
// PostingsForMatchers returns the postings matching the given matchers,
// delegating to the head's PostingsForMatchersCache (pfmc) so repeated
// lookups can be served from cache.
func (h *headIndexReader) PostingsForMatchers(concurrent bool, ms ...*labels.Matcher) (index.Postings, error) {
	return h.head.pfmc.PostingsForMatchers(h, concurrent, ms...)
}
func (h *headIndexReader) SortedPostings(p index.Postings) index.Postings {
series := make([]*memSeries, 0, 128)
@ -148,6 +152,27 @@ func (h *headIndexReader) SortedPostings(p index.Postings) index.Postings {
return index.NewListPostings(ep)
}
// ShardedPostings filters the given postings, keeping only the series whose
// pre-computed shard hash (memSeries.shardHash) places them in the shard
// identified by shardIndex out of shardCount shards.
func (h *headIndexReader) ShardedPostings(p index.Postings, shardIndex, shardCount uint64) index.Postings {
	filtered := make([]storage.SeriesRef, 0, 128)

	for p.Next() {
		series := h.head.series.getByID(chunks.HeadSeriesRef(p.At()))
		if series == nil {
			// The series may have been garbage collected concurrently.
			level.Debug(h.head.logger).Log("msg", "Looked up series not found")
			continue
		}

		// Check if the series belong to the shard.
		if series.shardHash%shardCount == shardIndex {
			filtered = append(filtered, storage.SeriesRef(series.ref))
		}
	}

	return index.NewListPostings(filtered)
}
// Series returns the series for the given reference.
func (h *headIndexReader) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error {
s := h.head.series.getByID(chunks.HeadSeriesRef(ref))
@ -158,6 +183,10 @@ func (h *headIndexReader) Series(ref storage.SeriesRef, builder *labels.ScratchB
}
builder.Assign(s.lset)
if chks == nil {
return nil
}
s.Lock()
defer s.Unlock()
@ -341,7 +370,7 @@ func (h *headChunkReader) chunk(meta chunks.Meta, copyLastChunk bool) (chunkenc.
// chunk returns the chunk for the HeadChunkID from memory or by m-mapping it from the disk.
// If headChunk is false, it means that the returned *memChunk
// (and not the chunkenc.Chunk inside it) can be garbage collected after its usage.
func (s *memSeries) chunk(id chunks.HeadChunkID, chunkDiskMapper *chunks.ChunkDiskMapper, memChunkPool *sync.Pool) (chunk *memChunk, headChunk bool, err error) {
func (s *memSeries) chunk(id chunks.HeadChunkID, cdm chunkDiskMapper, memChunkPool *sync.Pool) (chunk *memChunk, headChunk bool, err error) {
// ix represents the index of chunk in the s.mmappedChunks slice. The chunk id's are
// incremented by 1 when new chunk is created, hence (id - firstChunkID) gives the slice index.
// The max index for the s.mmappedChunks slice can be len(s.mmappedChunks)-1, hence if the ix
@ -357,7 +386,7 @@ func (s *memSeries) chunk(id chunks.HeadChunkID, chunkDiskMapper *chunks.ChunkDi
}
return s.headChunk, true, nil
}
chk, err := chunkDiskMapper.Chunk(s.mmappedChunks[ix].ref)
chk, err := cdm.Chunk(s.mmappedChunks[ix].ref)
if err != nil {
if _, ok := err.(*chunks.CorruptionErr); ok {
panic(err)
@ -377,7 +406,7 @@ func (s *memSeries) chunk(id chunks.HeadChunkID, chunkDiskMapper *chunks.ChunkDi
// chunks in the OOOHead.
// This function is not thread safe unless the caller holds a lock.
// The caller must ensure that s.ooo is not nil.
func (s *memSeries) oooMergedChunk(meta chunks.Meta, cdm *chunks.ChunkDiskMapper, mint, maxt int64) (chunk *mergedOOOChunks, err error) {
func (s *memSeries) oooMergedChunk(meta chunks.Meta, cdm chunkDiskMapper, mint, maxt int64) (chunk *mergedOOOChunks, err error) {
_, cid := chunks.HeadChunkRef(meta.Ref).Unpack()
// ix represents the index of chunk in the s.mmappedChunks slice. The chunk meta's are

View file

@ -290,7 +290,8 @@ func BenchmarkLoadWAL(b *testing.B) {
}
for k := 0; k < c.batches*c.seriesPerBatch; k++ {
// Create one mmapped chunk per series, with one sample at the given time.
s := newMemSeries(labels.Labels{}, chunks.HeadSeriesRef(k)*101, defaultIsolationDisabled)
lbls := labels.Labels{}
s := newMemSeries(lbls, chunks.HeadSeriesRef(k)*101, labels.StableHash(lbls), 0, defaultIsolationDisabled)
s.append(c.mmappedChunkT, 42, 0, cOpts)
s.mmapCurrentHeadChunk(chunkDiskMapper)
}
@ -384,9 +385,9 @@ func TestHead_HighConcurrencyReadAndWrite(t *testing.T) {
workerReadyWg.Add(writeConcurrency + readConcurrency)
// Start the write workers.
for wid := 0; wid < writeConcurrency; wid++ {
for workerID := 0; workerID < writeConcurrency; workerID++ {
// Create copy of workerID to be used by worker routine.
workerID := wid
workerID := workerID
g.Go(func() error {
// The label sets which this worker will write.
@ -428,9 +429,9 @@ func TestHead_HighConcurrencyReadAndWrite(t *testing.T) {
readerTsCh := make(chan uint64)
// Start the read workers.
for wid := 0; wid < readConcurrency; wid++ {
for workerID := 0; workerID < readConcurrency; workerID++ {
// Create copy of threadID to be used by worker routine.
workerID := wid
workerID := workerID
g.Go(func() error {
querySeriesRef := (seriesCnt / readConcurrency) * workerID
@ -457,7 +458,7 @@ func TestHead_HighConcurrencyReadAndWrite(t *testing.T) {
}
if len(samples) != 1 {
return false, fmt.Errorf("expected 1 series, got %d", len(samples))
return false, fmt.Errorf("expected 1 sample, got %d", len(samples))
}
series := lbls.String()
@ -816,7 +817,8 @@ func TestMemSeries_truncateChunks(t *testing.T) {
},
}
s := newMemSeries(labels.FromStrings("a", "b"), 1, defaultIsolationDisabled)
lbls := labels.FromStrings("a", "b")
s := newMemSeries(lbls, 1, labels.StableHash(lbls), 0, defaultIsolationDisabled)
for i := 0; i < 4000; i += 5 {
ok, _ := s.append(int64(i), float64(i), 0, cOpts)
@ -1351,7 +1353,8 @@ func TestMemSeries_append(t *testing.T) {
samplesPerChunk: DefaultSamplesPerChunk,
}
s := newMemSeries(labels.Labels{}, 1, defaultIsolationDisabled)
lbls := labels.Labels{}
s := newMemSeries(lbls, 1, labels.StableHash(lbls), 0, defaultIsolationDisabled)
// Add first two samples at the very end of a chunk range and the next two
// on and after it.
@ -1409,7 +1412,8 @@ func TestMemSeries_appendHistogram(t *testing.T) {
samplesPerChunk: DefaultSamplesPerChunk,
}
s := newMemSeries(labels.Labels{}, 1, defaultIsolationDisabled)
lbls := labels.Labels{}
s := newMemSeries(lbls, 1, labels.StableHash(lbls), 0, defaultIsolationDisabled)
histograms := tsdbutil.GenerateTestHistograms(4)
histogramWithOneMoreBucket := histograms[3].Copy()
@ -1469,7 +1473,8 @@ func TestMemSeries_append_atVariableRate(t *testing.T) {
samplesPerChunk: samplesPerChunk,
}
s := newMemSeries(labels.Labels{}, 1, defaultIsolationDisabled)
lbls := labels.Labels{}
s := newMemSeries(lbls, 1, labels.StableHash(lbls), 0, defaultIsolationDisabled)
// At this slow rate, we will fill the chunk in two block durations.
slowRate := (DefaultBlockDuration * 2) / samplesPerChunk
@ -1840,7 +1845,7 @@ func TestHeadReadWriterRepair(t *testing.T) {
ok, chunkCreated = s.append(int64(i*chunkRange)+chunkRange-1, float64(i*chunkRange), 0, cOpts)
require.True(t, ok, "series append failed")
require.False(t, chunkCreated, "chunk was created")
h.chunkDiskMapper.CutNewFile()
require.NoError(t, h.chunkDiskMapper.CutNewFile())
}
require.NoError(t, h.Close())
@ -2523,6 +2528,67 @@ func TestHeadLabelNamesWithMatchers(t *testing.T) {
}
}
// TestHeadShardedPostings verifies that ShardedPostings() on the head index
// reader partitions postings into shards which, merged together, equal the
// unsharded postings, and that every series lands in the shard given by
// labels.StableHash(lset) % shardCount.
func TestHeadShardedPostings(t *testing.T) {
	head, _ := newTestHead(t, 1000, wlog.CompressionNone, false)
	defer func() {
		require.NoError(t, head.Close())
	}()

	// Append some series.
	app := head.Appender(context.Background())
	for i := 0; i < 100; i++ {
		_, err := app.Append(0, labels.FromStrings("unique", fmt.Sprintf("value%d", i), "const", "1"), 100, 0)
		require.NoError(t, err)
	}
	require.NoError(t, app.Commit())

	ir := head.indexRange(0, 200)

	// List all postings for a given label value. This is what we expect to get
	// in output from all shards.
	p, err := ir.Postings("const", "1")
	require.NoError(t, err)

	var expected []storage.SeriesRef
	for p.Next() {
		expected = append(expected, p.At())
	}
	require.NoError(t, p.Err())
	require.Greater(t, len(expected), 0)

	// Query the same postings for each shard.
	const shardCount = uint64(4)
	actualShards := make(map[uint64][]storage.SeriesRef)
	actualPostings := make([]storage.SeriesRef, 0, len(expected))

	for shardIndex := uint64(0); shardIndex < shardCount; shardIndex++ {
		p, err = ir.Postings("const", "1")
		require.NoError(t, err)

		p = ir.ShardedPostings(p, shardIndex, shardCount)
		for p.Next() {
			ref := p.At()

			actualShards[shardIndex] = append(actualShards[shardIndex], ref)
			actualPostings = append(actualPostings, ref)
		}
		require.NoError(t, p.Err())
	}

	// We expect the postings merged out of shards is the exact same of the non sharded ones.
	require.ElementsMatch(t, expected, actualPostings)

	// We expect the series in each shard are the expected ones.
	for shardIndex, ids := range actualShards {
		for _, id := range ids {
			var lbls labels.ScratchBuilder
			require.NoError(t, ir.Series(id, &lbls, nil))
			require.Equal(t, shardIndex, labels.StableHash(lbls.Labels())%shardCount)
		}
	}
}
func TestErrReuseAppender(t *testing.T) {
head, _ := newTestHead(t, 1000, wlog.CompressionNone, false)
defer func() {
@ -2659,7 +2725,8 @@ func TestIteratorSeekIntoBuffer(t *testing.T) {
samplesPerChunk: DefaultSamplesPerChunk,
}
s := newMemSeries(labels.Labels{}, 1, defaultIsolationDisabled)
lbls := labels.Labels{}
s := newMemSeries(lbls, 1, labels.StableHash(lbls), 0, defaultIsolationDisabled)
for i := 0; i < 7; i++ {
ok, _ := s.append(int64(i), float64(i), 0, cOpts)

View file

@ -502,6 +502,8 @@ func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc, oooMmc []*m
}
// Any samples replayed till now would already be compacted. Resetting the head chunk.
// We do not reset oooHeadChunk because that is being replayed from a different WAL
// and has not been replayed here.
mSeries.nextAt = 0
mSeries.headChunk = nil
mSeries.app = nil

View file

@ -37,6 +37,7 @@ import (
"github.com/prometheus/prometheus/tsdb/encoding"
tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
"github.com/prometheus/prometheus/tsdb/fileutil"
"github.com/prometheus/prometheus/tsdb/hashcache"
)
const (
@ -1056,6 +1057,10 @@ type StringIter interface {
Err() error
}
type ReaderCacheProvider interface {
SeriesHashCache() *hashcache.BlockSeriesHashCache
}
type Reader struct {
b ByteSlice
toc *TOC
@ -1076,6 +1081,9 @@ type Reader struct {
dec *Decoder
version int
// Provides a cache mapping series labels hash by series ID.
cacheProvider ReaderCacheProvider
}
type postingOffset struct {
@ -1106,16 +1114,26 @@ func (b realByteSlice) Sub(start, end int) ByteSlice {
// NewReader returns a new index reader on the given byte slice. It automatically
// handles different format versions.
func NewReader(b ByteSlice) (*Reader, error) {
return newReader(b, io.NopCloser(nil))
return newReader(b, io.NopCloser(nil), nil)
}
// NewReaderWithCache is like NewReader but allows to pass a cache provider.
func NewReaderWithCache(b ByteSlice, cacheProvider ReaderCacheProvider) (*Reader, error) {
return newReader(b, io.NopCloser(nil), cacheProvider)
}
// NewFileReader returns a new index reader against the given index file.
func NewFileReader(path string) (*Reader, error) {
return NewFileReaderWithOptions(path, nil)
}
// NewFileReaderWithOptions is like NewFileReader but allows to pass a cache provider and sharding function.
func NewFileReaderWithOptions(path string, cacheProvider ReaderCacheProvider) (*Reader, error) {
f, err := fileutil.OpenMmapFile(path)
if err != nil {
return nil, err
}
r, err := newReader(realByteSlice(f.Bytes()), f)
r, err := newReader(realByteSlice(f.Bytes()), f, cacheProvider)
if err != nil {
return nil, tsdb_errors.NewMulti(
err,
@ -1126,11 +1144,12 @@ func NewFileReader(path string) (*Reader, error) {
return r, nil
}
func newReader(b ByteSlice, c io.Closer) (*Reader, error) {
func newReader(b ByteSlice, c io.Closer, cacheProvider ReaderCacheProvider) (*Reader, error) {
r := &Reader{
b: b,
c: c,
postings: map[string][]postingOffset{},
b: b,
c: c,
postings: map[string][]postingOffset{},
cacheProvider: cacheProvider,
}
// Verify header.
@ -1712,6 +1731,57 @@ func (r *Reader) SortedPostings(p Postings) Postings {
return p
}
// ShardedPostings returns a postings list filtered by the provided shardIndex out of shardCount.
// The shard of each series is computed as labels.StableHash(lset) % shardCount, with hashes
// optionally served from (and stored into) the configured series hash cache.
func (r *Reader) ShardedPostings(p Postings, shardIndex, shardCount uint64) Postings {
	var (
		out     = make([]storage.SeriesRef, 0, 128)
		bufLbls = labels.ScratchBuilder{}
	)

	// Request the cache each time because the cache implementation requires
	// that the cache reference is retained for a short period.
	var seriesHashCache *hashcache.BlockSeriesHashCache
	if r.cacheProvider != nil {
		seriesHashCache = r.cacheProvider.SeriesHashCache()
	}

	for p.Next() {
		id := p.At()

		var (
			hash uint64
			ok   bool
		)

		// Check if the hash is cached.
		if seriesHashCache != nil {
			hash, ok = seriesHashCache.Fetch(id)
		}

		if !ok {
			// Get the series labels (no chunks).
			err := r.Series(id, &bufLbls, nil)
			if err != nil {
				// Wrap the underlying error instead of discarding it, so the
				// caller can tell why reading the series failed rather than
				// seeing a generic "not found".
				return ErrPostings(errors.Wrapf(err, "series %d not found", id))
			}

			hash = labels.StableHash(bufLbls.Labels())

			if seriesHashCache != nil {
				seriesHashCache.Store(id, hash)
			}
		}

		// Check if the series belong to the shard.
		if hash%shardCount != shardIndex {
			continue
		}

		out = append(out, id)
	}

	return NewListPostings(out)
}
// Size returns the size of an index file.
func (r *Reader) Size() int64 {
return int64(r.b.Len())
@ -1834,7 +1904,9 @@ func (dec *Decoder) LabelValueFor(b []byte, label string) (string, error) {
// Previous contents of builder can be overwritten - make sure you copy before retaining.
func (dec *Decoder) Series(b []byte, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error {
builder.Reset()
*chks = (*chks)[:0]
if chks != nil {
*chks = (*chks)[:0]
}
d := encoding.Decbuf{B: b}
@ -1860,6 +1932,11 @@ func (dec *Decoder) Series(b []byte, builder *labels.ScratchBuilder, chks *[]chu
builder.Add(ln, lv)
}
// Skip reading chunks metadata if chks is nil.
if chks == nil {
return d.Err()
}
// Read the chunks meta data.
k = d.Uvarint()

View file

@ -25,18 +25,18 @@ import (
"github.com/pkg/errors"
"github.com/stretchr/testify/require"
"go.uber.org/goleak"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/tsdb/chunkenc"
"github.com/prometheus/prometheus/tsdb/chunks"
"github.com/prometheus/prometheus/tsdb/encoding"
"github.com/prometheus/prometheus/tsdb/hashcache"
"github.com/prometheus/prometheus/util/testutil"
)
func TestMain(m *testing.M) {
goleak.VerifyTestMain(m)
testutil.TolerantVerifyLeak(m)
}
type series struct {
@ -240,6 +240,63 @@ func TestIndexRW_Postings(t *testing.T) {
"b": {"1", "2", "3", "4"},
}, labelIndices)
// Test ShardedPostings() with and without series hash cache.
for _, cacheEnabled := range []bool{false, true} {
t.Run(fmt.Sprintf("ShardedPostings() cache enabled: %v", cacheEnabled), func(t *testing.T) {
var cache ReaderCacheProvider
if cacheEnabled {
cache = hashcache.NewSeriesHashCache(1024 * 1024 * 1024).GetBlockCacheProvider("test")
}
ir, err := NewFileReaderWithOptions(fn, cache)
require.NoError(t, err)
// List all postings for a given label value. This is what we expect to get
// in output from all shards.
p, err = ir.Postings("a", "1")
require.NoError(t, err)
var expected []storage.SeriesRef
for p.Next() {
expected = append(expected, p.At())
}
require.NoError(t, p.Err())
require.Greater(t, len(expected), 0)
// Query the same postings for each shard.
const shardCount = uint64(4)
actualShards := make(map[uint64][]storage.SeriesRef)
actualPostings := make([]storage.SeriesRef, 0, len(expected))
for shardIndex := uint64(0); shardIndex < shardCount; shardIndex++ {
p, err = ir.Postings("a", "1")
require.NoError(t, err)
p = ir.ShardedPostings(p, shardIndex, shardCount)
for p.Next() {
ref := p.At()
actualShards[shardIndex] = append(actualShards[shardIndex], ref)
actualPostings = append(actualPostings, ref)
}
require.NoError(t, p.Err())
}
// We expect the postings merged out of shards is the exact same of the non sharded ones.
require.ElementsMatch(t, expected, actualPostings)
// We expect the series in each shard are the expected ones.
for shardIndex, ids := range actualShards {
for _, id := range ids {
var lbls labels.ScratchBuilder
require.NoError(t, ir.Series(id, &lbls, nil))
require.Equal(t, shardIndex, labels.StableHash(lbls.Labels())%shardCount)
}
}
})
}
require.NoError(t, ir.Close())
}
@ -562,6 +619,60 @@ func TestSymbols(t *testing.T) {
require.NoError(t, iter.Err())
}
func BenchmarkReader_ShardedPostings(b *testing.B) {
const (
numSeries = 10000
numShards = 16
)
dir, err := os.MkdirTemp("", "benchmark_reader_sharded_postings")
require.NoError(b, err)
defer func() {
require.NoError(b, os.RemoveAll(dir))
}()
// Generate an index.
fn := filepath.Join(dir, indexFilename)
iw, err := NewWriter(context.Background(), fn)
require.NoError(b, err)
for i := 1; i <= numSeries; i++ {
require.NoError(b, iw.AddSymbol(fmt.Sprintf("%10d", i)))
}
require.NoError(b, iw.AddSymbol("const"))
require.NoError(b, iw.AddSymbol("unique"))
for i := 1; i <= numSeries; i++ {
require.NoError(b, iw.AddSeries(storage.SeriesRef(i),
labels.FromStrings("const", fmt.Sprintf("%10d", 1), "unique", fmt.Sprintf("%10d", i))))
}
require.NoError(b, iw.Close())
for _, cacheEnabled := range []bool{true, false} {
b.Run(fmt.Sprintf("cached enabled: %v", cacheEnabled), func(b *testing.B) {
var cache ReaderCacheProvider
if cacheEnabled {
cache = hashcache.NewSeriesHashCache(1024 * 1024 * 1024).GetBlockCacheProvider("test")
}
// Create a reader to read back all postings from the index.
ir, err := NewFileReaderWithOptions(fn, cache)
require.NoError(b, err)
b.ResetTimer()
for n := 0; n < b.N; n++ {
allPostings, err := ir.Postings("const", fmt.Sprintf("%10d", 1))
require.NoError(b, err)
ir.ShardedPostings(allPostings, uint64(n%numShards), numShards)
}
})
}
}
func TestDecoder_Postings_WrongInput(t *testing.T) {
_, _, err := (&Decoder{}).Postings([]byte("the cake is a lie"))
require.Error(t, err)

View file

@ -839,6 +839,28 @@ func (it *bigEndianPostings) Err() error {
return nil
}
// PostingsCloner wraps an existing Postings iterator and allows creating any
// number of independent iterators over the same underlying postings via
// Clone. The source postings are expanded once, up front (see NewPostingsCloner).
type PostingsCloner struct {
	ids []storage.SeriesRef // fully-expanded series refs from the source Postings
	err error               // error encountered while expanding the source, if any
}
// NewPostingsCloner eagerly expands the provided Postings and returns a
// PostingsCloner from which independent copies can be obtained via Clone.
// The provided iterator must be fresh (Next() never called on it) and must
// not be used again once handed to the cloner.
func NewPostingsCloner(p Postings) *PostingsCloner {
	cloner := &PostingsCloner{}
	cloner.ids, cloner.err = ExpandPostings(p)
	return cloner
}
// Clone returns a fresh, independent Postings iterator over the same data.
// If expanding the source postings failed, every clone reports that error.
func (pc *PostingsCloner) Clone() Postings {
	if pc.err == nil {
		return newListPostings(pc.ids...)
	}
	return ErrPostings(pc.err)
}
// FindIntersectingPostings checks the intersection of p and candidates[i] for each i in candidates,
// if intersection is non empty, then i is added to the indexes returned.
// Returned indexes are not sorted.

View file

@ -975,6 +975,129 @@ func TestMemPostings_Delete(t *testing.T) {
require.Equal(t, 0, len(expanded), "expected empty postings, got %v", expanded)
}
// TestPostingsCloner verifies that clones obtained from a PostingsCloner are
// fully independent iterators: Seek/Next/At on one clone must never affect the
// position or results of any other clone, and failed iterators stay failed.
func TestPostingsCloner(t *testing.T) {
	// All cases operate on clones of the postings list {1, 2, 4, 8}.
	for _, tc := range []struct {
		name  string
		check func(testing.TB, *PostingsCloner)
	}{
		{
			name: "seek beyond highest value of postings, then other clone seeks higher",
			check: func(t testing.TB, pc *PostingsCloner) {
				p1 := pc.Clone()
				require.False(t, p1.Seek(9))
				require.Equal(t, storage.SeriesRef(0), p1.At())

				p2 := pc.Clone()
				require.False(t, p2.Seek(10))
				require.Equal(t, storage.SeriesRef(0), p2.At())
			},
		},
		{
			name: "seek beyond highest value of postings, then other clone seeks lower",
			check: func(t testing.TB, pc *PostingsCloner) {
				p1 := pc.Clone()
				require.False(t, p1.Seek(9))
				require.Equal(t, storage.SeriesRef(0), p1.At())

				p2 := pc.Clone()
				require.True(t, p2.Seek(2))
				require.Equal(t, storage.SeriesRef(2), p2.At())
			},
		},
		{
			name: "seek to posting with value 3 or higher",
			check: func(t testing.TB, pc *PostingsCloner) {
				p := pc.Clone()
				// Seek is >=, so seeking 3 lands on 4.
				require.True(t, p.Seek(3))
				require.Equal(t, storage.SeriesRef(4), p.At())
				require.True(t, p.Seek(4))
				require.Equal(t, storage.SeriesRef(4), p.At())
			},
		},
		{
			name: "seek alternatively on different postings",
			check: func(t testing.TB, pc *PostingsCloner) {
				p1 := pc.Clone()
				require.True(t, p1.Seek(1))
				require.Equal(t, storage.SeriesRef(1), p1.At())

				p2 := pc.Clone()
				require.True(t, p2.Seek(2))
				require.Equal(t, storage.SeriesRef(2), p2.At())

				p3 := pc.Clone()
				require.True(t, p3.Seek(4))
				require.Equal(t, storage.SeriesRef(4), p3.At())

				p4 := pc.Clone()
				require.True(t, p4.Seek(5))
				require.Equal(t, storage.SeriesRef(8), p4.At())

				// p1 keeps its own position despite the other clones' seeks.
				require.True(t, p1.Seek(3))
				require.Equal(t, storage.SeriesRef(4), p1.At())
				require.True(t, p1.Seek(4))
				require.Equal(t, storage.SeriesRef(4), p1.At())
			},
		},
		{
			name: "iterate through the postings",
			check: func(t testing.TB, pc *PostingsCloner) {
				p1 := pc.Clone()
				p2 := pc.Clone()

				// both one step
				require.True(t, p1.Next())
				require.Equal(t, storage.SeriesRef(1), p1.At())
				require.True(t, p2.Next())
				require.Equal(t, storage.SeriesRef(1), p2.At())

				// p1 runs to the end while p2 stays at position 1.
				require.True(t, p1.Next())
				require.Equal(t, storage.SeriesRef(2), p1.At())
				require.True(t, p1.Next())
				require.Equal(t, storage.SeriesRef(4), p1.At())
				require.True(t, p1.Next())
				require.Equal(t, storage.SeriesRef(8), p1.At())
				require.False(t, p1.Next())

				// p2 resumes from its own position, unaffected by p1.
				require.True(t, p2.Next())
				require.Equal(t, storage.SeriesRef(2), p2.At())
				require.True(t, p2.Next())
				require.Equal(t, storage.SeriesRef(4), p2.At())
			},
		},
		{
			name: "at before call of next shouldn't panic",
			check: func(t testing.TB, pc *PostingsCloner) {
				p := pc.Clone()
				require.Equal(t, storage.SeriesRef(0), p.At())
			},
		},
		{
			name: "ensure a failed seek doesn't allow more next calls",
			check: func(t testing.TB, pc *PostingsCloner) {
				p := pc.Clone()
				require.False(t, p.Seek(9))
				require.Equal(t, storage.SeriesRef(0), p.At())
				require.False(t, p.Next())
				require.Equal(t, storage.SeriesRef(0), p.At())
			},
		},
	} {
		t.Run(tc.name, func(t *testing.T) {
			pc := NewPostingsCloner(newListPostings(1, 2, 4, 8))
			tc.check(t, pc)
		})
	}

	// Cloning a failed Postings must yield clones that propagate the error.
	t.Run("cloning an err postings", func(t *testing.T) {
		expectedErr := fmt.Errorf("foobar")
		pc := NewPostingsCloner(ErrPostings(expectedErr))
		p := pc.Clone()
		require.False(t, p.Next())
		require.Equal(t, expectedErr, p.Err())
	})
}
func TestFindIntersectingPostings(t *testing.T) {
t.Run("multiple intersections", func(t *testing.T) {
p := NewListPostings([]storage.SeriesRef{10, 15, 20, 25, 30, 35, 40, 45, 50})

View file

@ -154,6 +154,12 @@ func (oh *OOOHeadIndexReader) series(ref storage.SeriesRef, builder *labels.Scra
return nil
}
// PostingsForMatchers needs to be overridden so that the right IndexReader
// implementation gets passed down to the PostingsForMatchers call: it routes
// through the head's PostingsForMatchersCache, handing it this OOO-aware
// reader as the index source. The concurrent hint tells the cache whether the
// result may be shared/cached.
func (oh *OOOHeadIndexReader) PostingsForMatchers(concurrent bool, ms ...*labels.Matcher) (index.Postings, error) {
	return oh.head.pfmc.PostingsForMatchers(oh, concurrent, ms...)
}
// LabelValues needs to be overridden from the headIndexReader implementation due
// to the check that happens at the beginning where we make sure that the query
// interval overlaps with the head's minOOOTime and maxOOOTime.
@ -409,6 +415,10 @@ func (ir *OOOCompactionHeadIndexReader) SortedPostings(p index.Postings) index.P
return p
}
// ShardedPostings delegates to the wrapped out-of-order head index reader.
func (ir *OOOCompactionHeadIndexReader) ShardedPostings(p index.Postings, shardIndex, shardCount uint64) index.Postings {
	return ir.ch.oooIR.ShardedPostings(p, shardIndex, shardCount)
}
// Series delegates to the wrapped out-of-order head index reader, passing the
// compaction head's lastMmapRef (presumably bounding which m-mapped chunks are
// visible to the compaction — confirm against the series() implementation).
func (ir *OOOCompactionHeadIndexReader) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error {
	return ir.ch.oooIR.series(ref, builder, chks, ir.ch.lastMmapRef)
}

View file

@ -0,0 +1,205 @@
package tsdb
import (
"container/list"
"strings"
"sync"
"time"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/tsdb/index"
)
const (
	// defaultPostingsForMatchersCacheTTL is how long a cached PostingsForMatchers
	// promise stays valid before it becomes eligible for eviction.
	defaultPostingsForMatchersCacheTTL = 10 * time.Second
	// defaultPostingsForMatchersCacheSize is the maximum number of cached
	// promises before the oldest ones are evicted.
	defaultPostingsForMatchersCacheSize = 100
)
// IndexPostingsReader is a subset of IndexReader methods: the minimum required
// to evaluate PostingsForMatchers.
type IndexPostingsReader interface {
	// LabelValues returns possible label values which may not be sorted.
	LabelValues(name string, matchers ...*labels.Matcher) ([]string, error)

	// Postings returns the postings list iterator for the label pairs.
	// The Postings here contain the offsets to the series inside the index.
	// Found IDs are not strictly required to point to a valid Series, e.g.
	// during background garbage collections. Input values must be sorted.
	Postings(name string, values ...string) (index.Postings, error)
}
// NewPostingsForMatchersCache creates a new PostingsForMatchersCache.
// If `ttl` is 0, then it only deduplicates in-flight requests.
// If `force` is true, then all requests go through the cache, regardless of
// the `concurrent` param provided to PostingsForMatchers.
func NewPostingsForMatchersCache(ttl time.Duration, cacheSize int, force bool) *PostingsForMatchersCache {
	return &PostingsForMatchersCache{
		calls:  &sync.Map{},
		cached: list.New(),

		ttl:       ttl,
		cacheSize: cacheSize,
		force:     force,

		timeNow:             time.Now,
		postingsForMatchers: PostingsForMatchers,
	}
}
// PostingsForMatchersCache caches PostingsForMatchers call results when the
// concurrent hint is passed in or force is true.
type PostingsForMatchersCache struct {
	calls *sync.Map // matchersKey -> promise func() (index.Postings, error); also deduplicates in-flight calls

	cachedMtx sync.RWMutex // guards cached
	cached    *list.List   // FIFO of *postingsForMatchersCachedCall, oldest entry at the front

	ttl       time.Duration // how long a cached promise is valid; <= 0 disables caching (dedup only)
	cacheSize int           // max number of cached promises before head eviction kicks in

	force bool // if true, cache results regardless of the concurrent hint

	// timeNow is the time.Now that can be replaced for testing purposes
	timeNow func() time.Time
	// postingsForMatchers can be replaced for testing purposes
	postingsForMatchers func(ix IndexPostingsReader, ms ...*labels.Matcher) (index.Postings, error)
}
// PostingsForMatchers either goes through the promise-based cache (sharing the
// result of identical in-flight calls) when the call is concurrent or caching
// is forced, or evaluates the matchers directly otherwise.
func (c *PostingsForMatchersCache) PostingsForMatchers(ix IndexPostingsReader, concurrent bool, ms ...*labels.Matcher) (index.Postings, error) {
	if concurrent || c.force {
		c.expire()
		promise := c.postingsForMatchersPromise(ix, ms)
		return promise()
	}
	return c.postingsForMatchers(ix, ms...)
}
// postingsForMatchersPromise returns a promise (a function that blocks until
// the result is ready) for the PostingsForMatchers result of the given
// matchers. If an identical call is already in flight (or cached), its promise
// is returned and no new computation is started.
func (c *PostingsForMatchersCache) postingsForMatchersPromise(ix IndexPostingsReader, ms []*labels.Matcher) func() (index.Postings, error) {
	var (
		wg       sync.WaitGroup
		cloner   *index.PostingsCloner
		outerErr error
	)
	wg.Add(1)

	promise := func() (index.Postings, error) {
		// Block until cloner/outerErr have been filled in by the winner.
		wg.Wait()
		if outerErr != nil {
			return nil, outerErr
		}
		// Each caller gets an independent iterator over the shared result.
		return cloner.Clone(), nil
	}

	key := matchersKey(ms)
	oldPromise, loaded := c.calls.LoadOrStore(key, promise)
	if loaded {
		// Another goroutine already stored a promise for this key: return it
		// and discard ours (our WaitGroup is never waited on in that case).
		return oldPromise.(func() (index.Postings, error))
	}

	// We won the LoadOrStore race: compute the result, then release waiters.
	defer wg.Done()

	if postings, err := c.postingsForMatchers(ix, ms...); err != nil {
		outerErr = err
	} else {
		cloner = index.NewPostingsCloner(postings)
	}

	c.created(key, c.timeNow())
	return promise
}
// postingsForMatchersCachedCall is an entry in the eviction list: the cache
// key plus the time the corresponding promise was created.
type postingsForMatchersCachedCall struct {
	key string
	ts  time.Time
}
// expire evicts expired (and overflowing) entries from the head of the cache.
// It first peeks under a read lock to keep the common no-eviction path cheap,
// then re-checks under the write lock (double-checked locking: another
// goroutine may have evicted in between, which is why the loop re-tests).
func (c *PostingsForMatchersCache) expire() {
	if c.ttl <= 0 {
		return
	}

	// Fast path: nothing to evict.
	c.cachedMtx.RLock()
	if !c.shouldEvictHead() {
		c.cachedMtx.RUnlock()
		return
	}
	c.cachedMtx.RUnlock()

	c.cachedMtx.Lock()
	defer c.cachedMtx.Unlock()

	for c.shouldEvictHead() {
		c.evictHead()
	}
}
// shouldEvictHead reports whether the cache head must be evicted, either
// because the cache holds too many entries or because the oldest entry has
// outlived the TTL. Callers must hold at least a read lock on cachedMtx.
func (c *PostingsForMatchersCache) shouldEvictHead() bool {
	if c.cached.Len() > c.cacheSize {
		return true
	}
	head := c.cached.Front()
	if head == nil {
		// Empty cache: nothing to evict.
		return false
	}
	age := c.timeNow().Sub(head.Value.(*postingsForMatchersCachedCall).ts)
	return age >= c.ttl
}
// evictHead removes the oldest entry from both the eviction list and the
// promises map. Callers must hold the write lock on cachedMtx and must have
// already checked that the list is non-empty.
func (c *PostingsForMatchersCache) evictHead() {
	head := c.cached.Front()
	c.calls.Delete(head.Value.(*postingsForMatchersCachedCall).key)
	c.cached.Remove(head)
}
// created has to be called when returning from the PostingsForMatchers call
// that creates the promise; ts should be the time of that call. With caching
// disabled (ttl <= 0) the promise is deleted right away, so the calls map acts
// purely as an in-flight deduplicator.
func (c *PostingsForMatchersCache) created(key string, ts time.Time) {
	if c.ttl <= 0 {
		c.calls.Delete(key)
		return
	}

	entry := &postingsForMatchersCachedCall{key: key, ts: ts}

	c.cachedMtx.Lock()
	c.cached.PushBack(entry)
	c.cachedMtx.Unlock()
}
// matchersKey provides a unique string key for the given matchers slice.
// NOTE: different orders of matchers will produce different keys, but it's
// unlikely that we'll receive the same matchers in different orders at the
// same time.
func matchersKey(ms []*labels.Matcher) string {
	const (
		typeLen = 2
		sepLen  = 1
	)

	// Pre-size the builder with an upper bound on the key length.
	expectedSize := 0
	for _, m := range ms {
		expectedSize += len(m.Name) + len(m.Value) + typeLen + sepLen
	}

	var b strings.Builder
	b.Grow(expectedSize)
	for _, m := range ms {
		b.WriteString(m.Name)
		b.WriteString(m.Type.String())
		b.WriteString(m.Value)
		b.WriteByte(0) // separator between matchers
	}
	return b.String()
}
// indexReaderWithPostingsForMatchers adapts an index.Reader to be an
// IndexReader by adding the PostingsForMatchers method.
type indexReaderWithPostingsForMatchers struct {
	*index.Reader
	pfmc *PostingsForMatchersCache
}

// PostingsForMatchers routes the call through the PostingsForMatchersCache,
// providing this reader as the index source for cache misses.
func (ir indexReaderWithPostingsForMatchers) PostingsForMatchers(concurrent bool, ms ...*labels.Matcher) (index.Postings, error) {
	return ir.pfmc.PostingsForMatchers(ir, concurrent, ms...)
}

// Compile-time check that the adapter implements IndexReader.
var _ IndexReader = indexReaderWithPostingsForMatchers{}

View file

@ -0,0 +1,313 @@
package tsdb
import (
"fmt"
"strings"
"sync"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/tsdb/index"
)
// TestPostingsForMatchersCache exercises the promise-based PostingsForMatchers
// cache: deduplication of concurrent calls, error propagation, the concurrent
// and force flags, TTL-based expiry and size-based eviction.
func TestPostingsForMatchersCache(t *testing.T) {
	const testCacheSize = 5
	// newPostingsForMatchersCache tests the NewPostingsForMatchersCache constructor,
	// but overrides the postingsForMatchers func and the clock.
	newPostingsForMatchersCache := func(ttl time.Duration, pfm func(ix IndexPostingsReader, ms ...*labels.Matcher) (index.Postings, error), timeMock *timeNowMock, force bool) *PostingsForMatchersCache {
		c := NewPostingsForMatchersCache(ttl, testCacheSize, force)
		if c.postingsForMatchers == nil {
			t.Fatalf("NewPostingsForMatchersCache() didn't assign postingsForMatchers func")
		}
		c.postingsForMatchers = pfm
		c.timeNow = timeMock.timeNow
		return c
	}

	t.Run("happy case one call", func(t *testing.T) {
		for _, concurrent := range []bool{true, false} {
			t.Run(fmt.Sprintf("concurrent=%t", concurrent), func(t *testing.T) {
				expectedMatchers := []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")}
				expectedPostingsErr := fmt.Errorf("failed successfully")

				c := newPostingsForMatchersCache(defaultPostingsForMatchersCacheTTL, func(ix IndexPostingsReader, ms ...*labels.Matcher) (index.Postings, error) {
					require.IsType(t, indexForPostingsMock{}, ix, "Incorrect IndexPostingsReader was provided to PostingsForMatchers, expected the mock, was given %v (%T)", ix, ix)
					require.Equal(t, expectedMatchers, ms, "Wrong label matchers provided, expected %v, got %v", expectedMatchers, ms)
					return index.ErrPostings(expectedPostingsErr), nil
				}, &timeNowMock{}, false)

				p, err := c.PostingsForMatchers(indexForPostingsMock{}, concurrent, expectedMatchers...)
				require.NoError(t, err)
				require.NotNil(t, p)
				require.Equal(t, p.Err(), expectedPostingsErr, "Expected ErrPostings with err %q, got %T with err %q", expectedPostingsErr, p, p.Err())
			})
		}
	})

	t.Run("err returned", func(t *testing.T) {
		expectedMatchers := []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")}
		expectedErr := fmt.Errorf("failed successfully")

		c := newPostingsForMatchersCache(defaultPostingsForMatchersCacheTTL, func(ix IndexPostingsReader, ms ...*labels.Matcher) (index.Postings, error) {
			return nil, expectedErr
		}, &timeNowMock{}, false)

		_, err := c.PostingsForMatchers(indexForPostingsMock{}, true, expectedMatchers...)
		require.Equal(t, expectedErr, err)
	})

	t.Run("happy case multiple concurrent calls: two same one different", func(t *testing.T) {
		for _, cacheEnabled := range []bool{true, false} {
			t.Run(fmt.Sprintf("cacheEnabled=%t", cacheEnabled), func(t *testing.T) {
				for _, forced := range []bool{true, false} {
					concurrent := !forced
					t.Run(fmt.Sprintf("forced=%t", forced), func(t *testing.T) {
						calls := [][]*labels.Matcher{
							{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")},  // 1
							{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")},  // 1 same
							{labels.MustNewMatcher(labels.MatchRegexp, "foo", "bar")}, // 2: different match type
							{labels.MustNewMatcher(labels.MatchEqual, "diff", "bar")}, // 3: different name
							{labels.MustNewMatcher(labels.MatchEqual, "foo", "diff")}, // 4: different value
							{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar"), labels.MustNewMatcher(labels.MatchEqual, "boo", "bam")}, // 5
							{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar"), labels.MustNewMatcher(labels.MatchEqual, "boo", "bam")}, // 5 same
						}

						// we'll identify results by each call's error, and the error will be
						// the string representation of the call's matchers
						matchersString := func(ms []*labels.Matcher) string {
							s := strings.Builder{}
							for i, m := range ms {
								if i > 0 {
									s.WriteByte(',')
								}
								s.WriteString(m.String())
							}
							return s.String()
						}

						expectedPostingsForMatchersCalls := 5
						// we'll block all the calls until we receive the exact amount. if we receive more, WaitGroup will panic
						called := make(chan struct{}, expectedPostingsForMatchersCalls)
						release := make(chan struct{})
						var ttl time.Duration
						if cacheEnabled {
							ttl = defaultPostingsForMatchersCacheTTL
						}
						c := newPostingsForMatchersCache(ttl, func(ix IndexPostingsReader, ms ...*labels.Matcher) (index.Postings, error) {
							select {
							case called <- struct{}{}:
							default:
							}
							<-release
							// Use an explicit "%s" verb: passing a non-constant format
							// string to Errorf trips the go vet printf check and would
							// misinterpret any '%' in a matcher value.
							return nil, fmt.Errorf("%s", matchersString(ms))
						}, &timeNowMock{}, forced)

						results := make([]string, len(calls))
						resultsWg := sync.WaitGroup{}
						resultsWg.Add(len(calls))

						// perform all calls
						for i := 0; i < len(calls); i++ {
							go func(i int) {
								_, err := c.PostingsForMatchers(indexForPostingsMock{}, concurrent, calls[i]...)
								results[i] = err.Error()
								resultsWg.Done()
							}(i)
						}

						// wait until all calls arrive to the mocked function
						for i := 0; i < expectedPostingsForMatchersCalls; i++ {
							<-called
						}

						// let them all return
						close(release)

						// wait for the results
						resultsWg.Wait()

						// check that we got correct results
						for i, c := range calls {
							require.Equal(t, matchersString(c), results[i], "Call %d should have returned error %q, but got %q instead", i, matchersString(c), results[i])
						}
					})
				}
			})
		}
	})

	t.Run("with concurrent==false, result is not cached", func(t *testing.T) {
		expectedMatchers := []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")}

		var call int
		c := newPostingsForMatchersCache(defaultPostingsForMatchersCacheTTL, func(ix IndexPostingsReader, ms ...*labels.Matcher) (index.Postings, error) {
			call++
			return index.ErrPostings(fmt.Errorf("result from call %d", call)), nil
		}, &timeNowMock{}, false)

		// first call, fills the cache
		p, err := c.PostingsForMatchers(indexForPostingsMock{}, false, expectedMatchers...)
		require.NoError(t, err)
		require.EqualError(t, p.Err(), "result from call 1")

		// second call within the ttl (we didn't advance the time), should call again because concurrent==false
		p, err = c.PostingsForMatchers(indexForPostingsMock{}, false, expectedMatchers...)
		require.NoError(t, err)
		require.EqualError(t, p.Err(), "result from call 2")
	})

	t.Run("with cache disabled, result is not cached", func(t *testing.T) {
		expectedMatchers := []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")}

		var call int
		c := newPostingsForMatchersCache(0, func(ix IndexPostingsReader, ms ...*labels.Matcher) (index.Postings, error) {
			call++
			return index.ErrPostings(fmt.Errorf("result from call %d", call)), nil
		}, &timeNowMock{}, false)

		// first call, fills the cache
		p, err := c.PostingsForMatchers(indexForPostingsMock{}, true, expectedMatchers...)
		require.NoError(t, err)
		require.EqualError(t, p.Err(), "result from call 1")

		// second call within the ttl (we didn't advance the time), should call again because the cache is disabled
		p, err = c.PostingsForMatchers(indexForPostingsMock{}, true, expectedMatchers...)
		require.NoError(t, err)
		require.EqualError(t, p.Err(), "result from call 2")
	})

	t.Run("cached value is returned, then it expires", func(t *testing.T) {
		timeNow := &timeNowMock{}
		expectedMatchers := []*labels.Matcher{
			labels.MustNewMatcher(labels.MatchEqual, "foo", "bar"),
		}

		var call int
		c := newPostingsForMatchersCache(defaultPostingsForMatchersCacheTTL, func(ix IndexPostingsReader, ms ...*labels.Matcher) (index.Postings, error) {
			call++
			return index.ErrPostings(fmt.Errorf("result from call %d", call)), nil
		}, timeNow, false)

		// first call, fills the cache
		p, err := c.PostingsForMatchers(indexForPostingsMock{}, true, expectedMatchers...)
		require.NoError(t, err)
		require.EqualError(t, p.Err(), "result from call 1")

		timeNow.advance(defaultPostingsForMatchersCacheTTL / 2)

		// second call within the ttl, should use the cache
		p, err = c.PostingsForMatchers(indexForPostingsMock{}, true, expectedMatchers...)
		require.NoError(t, err)
		require.EqualError(t, p.Err(), "result from call 1")

		timeNow.advance(defaultPostingsForMatchersCacheTTL / 2)

		// third call is after ttl (exactly), should call again
		p, err = c.PostingsForMatchers(indexForPostingsMock{}, true, expectedMatchers...)
		require.NoError(t, err)
		require.EqualError(t, p.Err(), "result from call 2")
	})

	t.Run("cached value is evicted because cache exceeds max size", func(t *testing.T) {
		timeNow := &timeNowMock{}
		calls := make([][]*labels.Matcher, testCacheSize)
		for i := range calls {
			calls[i] = []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "matchers", fmt.Sprintf("%d", i))}
		}

		callsPerMatchers := map[string]int{}
		c := newPostingsForMatchersCache(defaultPostingsForMatchersCacheTTL, func(ix IndexPostingsReader, ms ...*labels.Matcher) (index.Postings, error) {
			k := matchersKey(ms)
			callsPerMatchers[k]++
			return index.ErrPostings(fmt.Errorf("result from call %d", callsPerMatchers[k])), nil
		}, timeNow, false)

		// each one of the first testCacheSize calls is cached properly
		for _, matchers := range calls {
			// first call
			p, err := c.PostingsForMatchers(indexForPostingsMock{}, true, matchers...)
			require.NoError(t, err)
			require.EqualError(t, p.Err(), "result from call 1")

			// cached value
			p, err = c.PostingsForMatchers(indexForPostingsMock{}, true, matchers...)
			require.NoError(t, err)
			require.EqualError(t, p.Err(), "result from call 1")
		}

		// one extra call is made, which is cached properly, but evicts the first cached value
		someExtraMatchers := []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")}

		// first call
		p, err := c.PostingsForMatchers(indexForPostingsMock{}, true, someExtraMatchers...)
		require.NoError(t, err)
		require.EqualError(t, p.Err(), "result from call 1")

		// cached value
		p, err = c.PostingsForMatchers(indexForPostingsMock{}, true, someExtraMatchers...)
		require.NoError(t, err)
		require.EqualError(t, p.Err(), "result from call 1")

		// make first call again, it's calculated again
		p, err = c.PostingsForMatchers(indexForPostingsMock{}, true, calls[0]...)
		require.NoError(t, err)
		require.EqualError(t, p.Err(), "result from call 2")
	})
}
// indexForPostingsMock is a sentinel IndexPostingsReader implementation used by
// the cache tests: the tests only check that it is passed through to the
// mocked postingsForMatchers func, so its methods are never expected to run.
type indexForPostingsMock struct{}

func (idx indexForPostingsMock) LabelValues(string, ...*labels.Matcher) ([]string, error) {
	panic("implement me")
}

func (idx indexForPostingsMock) Postings(string, ...string) (index.Postings, error) {
	panic("implement me")
}
// timeNowMock offers a mockable time.Now() implementation.
// The empty value is ready to be used, and it should not be copied (use a reference).
type timeNowMock struct {
	sync.Mutex
	now time.Time // current mocked time; lazily seeded to a fixed instant on first use
}
// timeNow can be used as a mocked replacement for time.Now().
func (m *timeNowMock) timeNow() time.Time {
	m.Lock()
	defer m.Unlock()
	if m.now.IsZero() {
		// Lazily seed the clock with an arbitrary fixed starting instant.
		m.now = time.Date(2020, 1, 2, 3, 4, 5, 0, time.UTC)
	}
	return m.now
}
// advance moves the mocked time.Now() value forward by d, seeding the clock
// with the fixed starting instant first if it hasn't been used yet.
func (m *timeNowMock) advance(d time.Duration) {
	m.Lock()
	defer m.Unlock()
	base := m.now
	if base.IsZero() {
		base = time.Date(2020, 1, 2, 3, 4, 5, 0, time.UTC)
	}
	m.now = base.Add(d)
}
// BenchmarkMatchersKey measures the cost of building a cache key from a
// matchers slice, cycling through a variety of generated matcher sets.
func BenchmarkMatchersKey(b *testing.B) {
	const (
		totalMatchers = 10
		matcherSets   = 100
	)

	sets := make([][]*labels.Matcher, matcherSets)
	for setIdx := range sets {
		matchers := make([]*labels.Matcher, 0, totalMatchers)
		for j := 0; j < totalMatchers; j++ {
			name := fmt.Sprintf("%d_%d", setIdx*13, j*65537)
			value := fmt.Sprintf("%x_%x", setIdx*127, j*2_147_483_647)
			matchers = append(matchers, labels.MustNewMatcher(labels.MatchType(j%4), name, value))
		}
		sets[setIdx] = matchers
	}

	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		_ = matchersKey(sets[n%matcherSets])
	}
}

View file

@ -16,8 +16,6 @@ package tsdb
import (
"fmt"
"math"
"strings"
"unicode/utf8"
"github.com/oklog/ulid"
"github.com/pkg/errors"
@ -32,20 +30,6 @@ import (
"github.com/prometheus/prometheus/tsdb/tombstones"
)
// regexMetaCharacterBytes is a 128-bit set (packed into 16 bytes) marking the
// ASCII bytes that are regex metacharacters and therefore need escaping.
// Bit layout: byte b is stored at index b%16, bit b/16.
var regexMetaCharacterBytes [16]byte

// isRegexMetaCharacter reports whether byte b is a regex metacharacter that
// needs to be escaped. Non-ASCII bytes (>= utf8.RuneSelf) are never
// metacharacters.
func isRegexMetaCharacter(b byte) bool {
	if b >= utf8.RuneSelf {
		return false
	}
	return regexMetaCharacterBytes[b%16]&(1<<(b/16)) != 0
}

func init() {
	for _, c := range []byte(`.+*?()|[]{}^$`) {
		regexMetaCharacterBytes[c%16] |= 1 << (c / 16)
	}
}
type blockBaseQuerier struct {
blockID ulid.ULID
index IndexReader
@ -128,11 +112,14 @@ func (q *blockQuerier) Select(sortSeries bool, hints *storage.SelectHints, ms ..
mint := q.mint
maxt := q.maxt
disableTrimming := false
p, err := PostingsForMatchers(q.index, ms...)
sharded := hints != nil && hints.ShardCount > 0
p, err := q.index.PostingsForMatchers(sharded, ms...)
if err != nil {
return storage.ErrSeriesSet(err)
}
if sharded {
p = q.index.ShardedPostings(p, hints.ShardIndex, hints.ShardCount)
}
if sortSeries {
p = q.index.SortedPostings(p)
}
@ -173,68 +160,23 @@ func (q *blockChunkQuerier) Select(sortSeries bool, hints *storage.SelectHints,
maxt = hints.End
disableTrimming = hints.DisableTrimming
}
p, err := PostingsForMatchers(q.index, ms...)
sharded := hints != nil && hints.ShardCount > 0
p, err := q.index.PostingsForMatchers(sharded, ms...)
if err != nil {
return storage.ErrChunkSeriesSet(err)
}
if sharded {
p = q.index.ShardedPostings(p, hints.ShardIndex, hints.ShardCount)
}
if sortSeries {
p = q.index.SortedPostings(p)
}
return NewBlockChunkSeriesSet(q.blockID, q.index, q.chunks, q.tombstones, p, mint, maxt, disableTrimming)
}
// findSetMatches extracts the list of alternative literal values from a
// Prometheus-anchored regular expression of the shape ^(?:a|b|c)$ (optionally
// with one extra wrapping group). It returns nil if the pattern is anything
// other than a plain alternation of literals (escaped metacharacters allowed).
func findSetMatches(pattern string) []string {
	// Return empty matches if the wrapper from Prometheus is missing.
	if len(pattern) < 6 || pattern[:4] != "^(?:" || pattern[len(pattern)-2:] != ")$" {
		return nil
	}
	escaped := false
	sets := []*strings.Builder{{}}
	init := 4
	end := len(pattern) - 2
	// If the regex is wrapped in a group we can remove the first and last parentheses
	if pattern[init] == '(' && pattern[end-1] == ')' {
		init++
		end--
	}
	for i := init; i < end; i++ {
		if escaped {
			switch {
			case isRegexMetaCharacter(pattern[i]):
				// Escaped metacharacter: keep the literal character.
				sets[len(sets)-1].WriteByte(pattern[i])
			case pattern[i] == '\\':
				// Escaped backslash: keep a literal backslash.
				sets[len(sets)-1].WriteByte('\\')
			default:
				// Any other escape sequence means this is not a plain set.
				return nil
			}
			escaped = false
		} else {
			switch {
			case isRegexMetaCharacter(pattern[i]):
				if pattern[i] == '|' {
					// Alternation: start collecting the next value.
					sets = append(sets, &strings.Builder{})
				} else {
					// An unescaped metacharacter other than '|' disqualifies the pattern.
					return nil
				}
			case pattern[i] == '\\':
				escaped = true
			default:
				sets[len(sets)-1].WriteByte(pattern[i])
			}
		}
	}
	// Collect the non-empty alternatives (empty ones, e.g. from "a||b", are dropped).
	matches := make([]string, 0, len(sets))
	for _, s := range sets {
		if s.Len() > 0 {
			matches = append(matches, s.String())
		}
	}
	return matches
}
// PostingsForMatchers assembles a single postings iterator against the index reader
// based on the given matchers. The resulting postings are not ordered by series.
func PostingsForMatchers(ix IndexReader, ms ...*labels.Matcher) (index.Postings, error) {
func PostingsForMatchers(ix IndexPostingsReader, ms ...*labels.Matcher) (index.Postings, error) {
var its, notIts []index.Postings
// See which label must be non-empty.
// Optimization for case like {l=~".", l!="1"}.
@ -331,7 +273,7 @@ func PostingsForMatchers(ix IndexReader, ms ...*labels.Matcher) (index.Postings,
return it, nil
}
func postingsForMatcher(ix IndexReader, m *labels.Matcher) (index.Postings, error) {
func postingsForMatcher(ix IndexPostingsReader, m *labels.Matcher) (index.Postings, error) {
// This method will not return postings for missing labels.
// Fast-path for equal matching.
@ -341,7 +283,7 @@ func postingsForMatcher(ix IndexReader, m *labels.Matcher) (index.Postings, erro
// Fast-path for set matching.
if m.Type == labels.MatchRegexp {
setMatches := findSetMatches(m.GetRegexString())
setMatches := m.SetMatches()
if len(setMatches) > 0 {
return ix.Postings(m.Name, setMatches...)
}
@ -367,12 +309,12 @@ func postingsForMatcher(ix IndexReader, m *labels.Matcher) (index.Postings, erro
}
// inversePostingsForMatcher returns the postings for the series with the label name set but not matching the matcher.
func inversePostingsForMatcher(ix IndexReader, m *labels.Matcher) (index.Postings, error) {
func inversePostingsForMatcher(ix IndexPostingsReader, m *labels.Matcher) (index.Postings, error) {
// Fast-path for MatchNotRegexp matching.
// Inverse of a MatchNotRegexp is MatchRegexp (double negation).
// Fast-path for set matching.
if m.Type == labels.MatchNotRegexp {
setMatches := findSetMatches(m.GetRegexString())
setMatches := m.SetMatches()
if len(setMatches) > 0 {
return ix.Postings(m.Name, setMatches...)
}
@ -404,6 +346,8 @@ func inversePostingsForMatcher(ix IndexReader, m *labels.Matcher) (index.Posting
return ix.Postings(m.Name, res...)
}
// maxExpandedPostingsFactor controls when labelValuesWithMatchers switches to
// computing values from fully-expanded series rather than per-value postings.
const maxExpandedPostingsFactor = 100 // Division factor for maximum number of matched series.
func labelValuesWithMatchers(r IndexReader, name string, matchers ...*labels.Matcher) ([]string, error) {
p, err := PostingsForMatchers(r, matchers...)
if err != nil {
@ -434,6 +378,34 @@ func labelValuesWithMatchers(r IndexReader, name string, matchers ...*labels.Mat
allValues = filteredValues
}
// Let's see if expanded postings for matchers have smaller cardinality than label values.
// Since computing label values from series is expensive, we apply a limit on number of expanded
// postings (and series).
maxExpandedPostings := len(allValues) / maxExpandedPostingsFactor
if maxExpandedPostings > 0 {
// Add space for one extra posting when checking if we expanded all postings.
expanded := make([]storage.SeriesRef, 0, maxExpandedPostings+1)
// Call p.Next() even if len(expanded) == maxExpandedPostings. This tells us if there are more postings or not.
for len(expanded) <= maxExpandedPostings && p.Next() {
expanded = append(expanded, p.At())
}
if len(expanded) <= maxExpandedPostings {
// When we're here, p.Next() must have returned false, so we need to check for errors.
if err := p.Err(); err != nil {
return nil, errors.Wrap(err, "expanding postings for matchers")
}
// We have expanded all the postings -- all returned label values will be from these series only.
// (We supply allValues as a buffer for storing results. It should be big enough already, since it holds all possible label values.)
return labelValuesFromSeries(r, name, expanded, allValues)
}
// If we haven't reached end of postings, we prepend our expanded postings to "p", and continue.
p = newPrependPostings(expanded, p)
}
valuesPostings := make([]index.Postings, len(allValues))
for i, value := range allValues {
valuesPostings[i], err = r.Postings(name, value)
@ -454,8 +426,85 @@ func labelValuesWithMatchers(r IndexReader, name string, matchers ...*labels.Mat
return values, nil
}
// labelValuesFromSeries returns all unique label values for the given label
// name across the supplied series. Values are not sorted. buf is optional
// space for holding the result; it is ignored if it isn't big enough, and may
// be nil.
func labelValuesFromSeries(r IndexReader, labelName string, refs []storage.SeriesRef, buf []string) ([]string, error) {
	var builder labels.ScratchBuilder
	seen := map[string]struct{}{}
	for _, ref := range refs {
		if err := r.Series(ref, &builder, nil); err != nil {
			return nil, errors.Wrapf(err, "label values for label %s", labelName)
		}
		if v := builder.Labels().Get(labelName); v != "" {
			seen[v] = struct{}{}
		}
	}

	// Reuse buf when it's big enough, otherwise allocate a fresh slice.
	result := buf
	if cap(result) >= len(seen) {
		result = result[:0]
	} else {
		result = make([]string, 0, len(seen))
	}
	for v := range seen {
		result = append(result, v)
	}
	return result, nil
}
// newPrependPostings wraps postings b so that the series refs in a are
// returned first, followed by everything remaining in b.
func newPrependPostings(a []storage.SeriesRef, b index.Postings) index.Postings {
	p := &prependPostings{prefix: a, rest: b}
	p.ix = -1 // positioned before the first prefix element
	return p
}
// prependPostings returns series references from "prefix" before using "rest" postings.
type prependPostings struct {
	ix     int                 // position in prefix; -1 before first Next(), >= len(prefix) once exhausted
	prefix []storage.SeriesRef // refs to serve before consuming rest
	rest   index.Postings      // remaining postings, consumed after prefix runs out
}
// Next advances the iterator: first through the prefix, then through rest.
func (p *prependPostings) Next() bool {
	p.ix++
	if p.ix >= len(p.prefix) {
		return p.rest.Next()
	}
	return true
}
// Seek advances to the first element >= v, scanning forward through whatever
// remains of the prefix before delegating to rest. It never moves backwards.
func (p *prependPostings) Seek(v storage.SeriesRef) bool {
	for ; p.ix < len(p.prefix); p.ix++ {
		if p.ix < 0 {
			// Not started yet: move onto the first prefix element.
			continue
		}
		if p.prefix[p.ix] >= v {
			return true
		}
	}
	return p.rest.Seek(v)
}
// At returns the current element: from the prefix while positioned inside it,
// otherwise from rest.
func (p *prependPostings) At() storage.SeriesRef {
	if p.ix < 0 || p.ix >= len(p.prefix) {
		return p.rest.At()
	}
	return p.prefix[p.ix]
}
// Err reports rest's error, except while still iterating the in-memory prefix
// (which cannot fail).
func (p *prependPostings) Err() error {
	inPrefix := p.ix >= 0 && p.ix < len(p.prefix)
	if inPrefix {
		return nil
	}
	return p.rest.Err()
}
func labelNamesWithMatchers(r IndexReader, matchers ...*labels.Matcher) ([]string, error) {
p, err := PostingsForMatchers(r, matchers...)
p, err := r.PostingsForMatchers(false, matchers...)
if err != nil {
return nil, err
}
@ -707,6 +756,10 @@ func (s *chunkSeriesEntry) Iterator(it chunks.Iterator) chunks.Iterator {
return pi
}
// ChunkCount returns the number of chunks held by this series entry.
// The error return is always nil here; it exists to satisfy the interface.
func (s *chunkSeriesEntry) ChunkCount() (int, error) {
	return len(s.chks), nil
}
// populateWithDelSeriesIterator allows to iterate over samples for the single series.
type populateWithDelSeriesIterator struct {
populateWithDelGenericSeriesIterator

View file

@ -19,10 +19,12 @@ import (
"strconv"
"testing"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/tsdb/index"
"github.com/stretchr/testify/require"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/tsdb/hashcache"
"github.com/prometheus/prometheus/tsdb/index"
)
// Make entries ~50B in size, to emulate real-world high cardinality.
@ -48,7 +50,7 @@ func BenchmarkQuerier(b *testing.B) {
for n := 0; n < 10; n++ {
for i := 0; i < 100000; i++ {
addSeries(labels.FromStrings("i", strconv.Itoa(i)+postingsBenchSuffix, "n", strconv.Itoa(n)+postingsBenchSuffix, "j", "foo"))
addSeries(labels.FromStrings("i", strconv.Itoa(i)+postingsBenchSuffix, "n", strconv.Itoa(n)+postingsBenchSuffix, "j", "foo", "i_times_n", strconv.Itoa(i*n)))
// Have some series that won't be matched, to properly test inverted matches.
addSeries(labels.FromStrings("i", strconv.Itoa(i)+postingsBenchSuffix, "n", strconv.Itoa(n)+postingsBenchSuffix, "j", "bar"))
addSeries(labels.FromStrings("i", strconv.Itoa(i)+postingsBenchSuffix, "n", "0_"+strconv.Itoa(n)+postingsBenchSuffix, "j", "bar"))
@ -182,6 +184,9 @@ func benchmarkLabelValuesWithMatchers(b *testing.B, ir IndexReader) {
n1 := labels.MustNewMatcher(labels.MatchEqual, "n", "1"+postingsBenchSuffix)
nX := labels.MustNewMatcher(labels.MatchNotEqual, "n", "X"+postingsBenchSuffix)
nPlus := labels.MustNewMatcher(labels.MatchRegexp, "i", "^.+$")
primesTimes := labels.MustNewMatcher(labels.MatchEqual, "i_times_n", "533701") // = 76243*7, ie. multiplication of primes. It will match single i*n combination.
nonPrimesTimes := labels.MustNewMatcher(labels.MatchEqual, "i_times_n", "20") // 1*20, 2*10, 4*5, 5*4
times12 := labels.MustNewMatcher(labels.MatchRegexp, "i_times_n", "12.*")
cases := []struct {
name string
@ -197,6 +202,9 @@ func benchmarkLabelValuesWithMatchers(b *testing.B, ir IndexReader) {
{`i with n="1",j=~"XXX|YYY"`, "i", []*labels.Matcher{n1, jXXXYYY}},
{`i with n="X",j!="foo"`, "i", []*labels.Matcher{nX, jNotFoo}},
{`i with n="1",i=~"^.*$",j!="foo"`, "i", []*labels.Matcher{n1, iStar, jNotFoo}},
{`i with i_times_n=533701`, "i", []*labels.Matcher{primesTimes}},
{`i with i_times_n=20`, "i", []*labels.Matcher{nonPrimesTimes}},
{`i with i_times_n=~"12.*""`, "i", []*labels.Matcher{times12}},
// n has 10 values.
{`n with j!="foo"`, "n", []*labels.Matcher{jNotFoo}},
{`n with i="1"`, "n", []*labels.Matcher{i1}},
@ -249,16 +257,28 @@ func BenchmarkQuerierSelect(b *testing.B) {
}
require.NoError(b, app.Commit())
bench := func(b *testing.B, br BlockReader, sorted bool) {
bench := func(b *testing.B, br BlockReader, sorted, sharding bool) {
matcher := labels.MustNewMatcher(labels.MatchEqual, "foo", "bar")
for s := 1; s <= numSeries; s *= 10 {
b.Run(fmt.Sprintf("%dof%d", s, numSeries), func(b *testing.B) {
q, err := NewBlockQuerier(br, 0, int64(s-1))
mint := int64(0)
maxt := int64(s - 1)
q, err := NewBlockQuerier(br, mint, maxt)
require.NoError(b, err)
b.ResetTimer()
for i := 0; i < b.N; i++ {
ss := q.Select(sorted, nil, matcher)
var hints *storage.SelectHints
if sharding {
hints = &storage.SelectHints{
Start: mint,
End: maxt,
ShardIndex: uint64(i % 16),
ShardCount: 16,
}
}
ss := q.Select(sorted, hints, matcher)
for ss.Next() { // nolint:revive
}
require.NoError(b, ss.Err())
@ -269,22 +289,38 @@ func BenchmarkQuerierSelect(b *testing.B) {
}
b.Run("Head", func(b *testing.B) {
bench(b, h, false)
b.Run("without sharding", func(b *testing.B) {
bench(b, h, false, false)
})
b.Run("with sharding", func(b *testing.B) {
bench(b, h, false, true)
})
})
b.Run("SortedHead", func(b *testing.B) {
bench(b, h, true)
b.Run("without sharding", func(b *testing.B) {
bench(b, h, true, false)
})
b.Run("with sharding", func(b *testing.B) {
bench(b, h, true, true)
})
})
tmpdir := b.TempDir()
seriesHashCache := hashcache.NewSeriesHashCache(1024 * 1024 * 1024)
blockdir := createBlockFromHead(b, tmpdir, h)
block, err := OpenBlock(nil, blockdir, nil)
block, err := OpenBlockWithOptions(nil, blockdir, nil, seriesHashCache.GetBlockCacheProvider("test"), defaultPostingsForMatchersCacheTTL, defaultPostingsForMatchersCacheSize, false)
require.NoError(b, err)
defer func() {
require.NoError(b, block.Close())
}()
b.Run("Block", func(b *testing.B) {
bench(b, block, false)
b.Run("without sharding", func(b *testing.B) {
bench(b, block, false, false)
})
b.Run("with sharding", func(b *testing.B) {
bench(b, block, false, true)
})
})
}

View file

@ -383,6 +383,46 @@ func TestBlockQuerier(t *testing.T) {
),
}),
},
{
// This tests query sharding. The label sets involved both hash into this test's result set. The test
// following this is companion to this test (same test but with a different ShardIndex) and should find that
// the label sets involved do not hash to that test's result set.
mint: math.MinInt64,
maxt: math.MaxInt64,
hints: &storage.SelectHints{Start: math.MinInt64, End: math.MaxInt64, ShardIndex: 0, ShardCount: 2},
ms: []*labels.Matcher{labels.MustNewMatcher(labels.MatchRegexp, "a", ".*")},
exp: newMockSeriesSet([]storage.Series{
storage.NewListSeries(labels.FromStrings("a", "a"),
[]tsdbutil.Sample{sample{1, 2, nil, nil}, sample{2, 3, nil, nil}, sample{3, 4, nil, nil}, sample{5, 2, nil, nil}, sample{6, 3, nil, nil}, sample{7, 4, nil, nil}},
),
storage.NewListSeries(labels.FromStrings("a", "a", "b", "b"),
[]tsdbutil.Sample{sample{1, 1, nil, nil}, sample{2, 2, nil, nil}, sample{3, 3, nil, nil}, sample{5, 3, nil, nil}, sample{6, 6, nil, nil}},
),
storage.NewListSeries(labels.FromStrings("b", "b"),
[]tsdbutil.Sample{sample{1, 3, nil, nil}, sample{2, 2, nil, nil}, sample{3, 6, nil, nil}, sample{5, 1, nil, nil}, sample{6, 7, nil, nil}, sample{7, 2, nil, nil}},
),
}),
expChks: newMockChunkSeriesSet([]storage.ChunkSeries{
storage.NewListChunkSeriesFromSamples(labels.FromStrings("a", "a"),
[]tsdbutil.Sample{sample{1, 2, nil, nil}, sample{2, 3, nil, nil}, sample{3, 4, nil, nil}}, []tsdbutil.Sample{sample{5, 2, nil, nil}, sample{6, 3, nil, nil}, sample{7, 4, nil, nil}},
),
storage.NewListChunkSeriesFromSamples(labels.FromStrings("a", "a", "b", "b"),
[]tsdbutil.Sample{sample{1, 1, nil, nil}, sample{2, 2, nil, nil}, sample{3, 3, nil, nil}}, []tsdbutil.Sample{sample{5, 3, nil, nil}, sample{6, 6, nil, nil}},
),
storage.NewListChunkSeriesFromSamples(labels.FromStrings("b", "b"),
[]tsdbutil.Sample{sample{1, 3, nil, nil}, sample{2, 2, nil, nil}, sample{3, 6, nil, nil}}, []tsdbutil.Sample{sample{5, 1, nil, nil}, sample{6, 7, nil, nil}, sample{7, 2, nil, nil}},
),
}),
},
{
// This is a companion to the test above.
mint: math.MinInt64,
maxt: math.MaxInt64,
hints: &storage.SelectHints{Start: math.MinInt64, End: math.MaxInt64, ShardIndex: 1, ShardCount: 2},
ms: []*labels.Matcher{labels.MustNewMatcher(labels.MatchRegexp, "a", ".*")},
exp: newMockSeriesSet([]storage.Series{}),
expChks: newMockChunkSeriesSet([]storage.ChunkSeries{}),
},
} {
t.Run("", func(t *testing.T) {
ir, cr, _, _ := createIdxChkReaders(t, testData)
@ -1520,6 +1560,48 @@ func (m mockIndex) SortedPostings(p index.Postings) index.Postings {
return index.NewListPostings(ep)
}
// PostingsForMatchers implements IndexReader by linearly scanning every
// series in the mock index and collecting refs whose labels satisfy all
// matchers. The refs are returned in ascending order. The concurrent flag is
// ignored by this mock.
func (m mockIndex) PostingsForMatchers(concurrent bool, ms ...*labels.Matcher) (index.Postings, error) {
	var refs []storage.SeriesRef
	for ref, series := range m.series {
		if !matches(ms, series.l) {
			continue
		}
		refs = append(refs, ref)
	}
	sort.Slice(refs, func(i, j int) bool { return refs[i] < refs[j] })
	return index.NewListPostings(refs), nil
}
// matches reports whether the label set lbls satisfies every matcher in ms.
// A label name absent from lbls is matched against the empty string (the
// zero value of the map lookup).
func matches(ms []*labels.Matcher, lbls labels.Labels) bool {
	values := lbls.Map()
	for _, matcher := range ms {
		if !matcher.Matches(values[matcher.Name]) {
			return false
		}
	}
	return true
}
// ShardedPostings filters p down to the series belonging to the given shard,
// i.e. those whose labels hash to shardIndex modulo shardCount. Refs not
// present in the mock index are silently dropped.
func (m mockIndex) ShardedPostings(p index.Postings, shardIndex, shardCount uint64) index.Postings {
	out := make([]storage.SeriesRef, 0, 128)
	for p.Next() {
		ref := p.At()
		series, ok := m.series[ref]
		if !ok {
			continue
		}
		// Keep only series hashing into the requested shard.
		if series.l.Hash()%shardCount == shardIndex {
			out = append(out, ref)
		}
	}
	return index.NewListPostings(out)
}
func (m mockIndex) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error {
s, ok := m.series[ref]
if !ok {
@ -1831,69 +1913,6 @@ func BenchmarkSetMatcher(b *testing.B) {
}
}
// Refer to https://github.com/prometheus/prometheus/issues/2651.
func TestFindSetMatches(t *testing.T) {
cases := []struct {
pattern string
exp []string
}{
// Single value, coming from a `bar=~"foo"` selector.
{
pattern: "^(?:foo)$",
exp: []string{
"foo",
},
},
// Simple sets.
{
pattern: "^(?:foo|bar|baz)$",
exp: []string{
"foo",
"bar",
"baz",
},
},
// Simple sets containing escaped characters.
{
pattern: "^(?:fo\\.o|bar\\?|\\^baz)$",
exp: []string{
"fo.o",
"bar?",
"^baz",
},
},
// Simple sets containing special characters without escaping.
{
pattern: "^(?:fo.o|bar?|^baz)$",
exp: nil,
},
// Missing wrapper.
{
pattern: "foo|bar|baz",
exp: nil,
},
}
for _, c := range cases {
matches := findSetMatches(c.pattern)
if len(c.exp) == 0 {
if len(matches) != 0 {
t.Errorf("Evaluating %s, unexpected result %v", c.pattern, matches)
}
} else {
if len(matches) != len(c.exp) {
t.Errorf("Evaluating %s, length of result not equal to exp", c.pattern)
} else {
for i := 0; i < len(c.exp); i++ {
if c.exp[i] != matches[i] {
t.Errorf("Evaluating %s, unexpected result %s", c.pattern, matches[i])
}
}
}
}
}
}
func TestPostingsForMatchers(t *testing.T) {
chunkDir := t.TempDir()
opts := DefaultHeadOptions()
@ -2390,10 +2409,18 @@ func (m mockMatcherIndex) Postings(name string, values ...string) (index.Posting
return index.EmptyPostings(), nil
}
// PostingsForMatchers implements IndexReader; this mock always returns empty postings.
func (m mockMatcherIndex) PostingsForMatchers(bool, ...*labels.Matcher) (index.Postings, error) {
	return index.EmptyPostings(), nil
}

// SortedPostings implements IndexReader; this mock always returns empty postings.
func (m mockMatcherIndex) SortedPostings(p index.Postings) index.Postings {
	return index.EmptyPostings()
}

// ShardedPostings implements IndexReader; this mock performs no sharding and
// returns the input postings unchanged.
func (m mockMatcherIndex) ShardedPostings(ps index.Postings, shardIndex, shardCount uint64) index.Postings {
	return ps
}

// Series implements IndexReader; this mock reports success without populating
// the builder or the chunk metas.
func (m mockMatcherIndex) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error {
	return nil
}
@ -2424,7 +2451,7 @@ func TestPostingsForMatcher(t *testing.T) {
{
// Test case for double quoted regex matcher
matcher: labels.MustNewMatcher(labels.MatchRegexp, "test", "^(?:a|b)$"),
hasError: true,
hasError: false,
},
}
@ -2706,3 +2733,115 @@ func TestQueryWithDeletedHistograms(t *testing.T) {
})
}
}
// TestPrependPostings verifies the prependPostings iterator: empty input,
// plain Next/At iteration across the prefix/rest boundary, Seek semantics
// (including seeking to the current value), and error propagation from the
// wrapped postings.
func TestPrependPostings(t *testing.T) {
	t.Run("empty", func(t *testing.T) {
		p := newPrependPostings(nil, index.NewListPostings(nil))
		require.False(t, p.Next())
	})

	t.Run("next+At", func(t *testing.T) {
		p := newPrependPostings([]storage.SeriesRef{10, 20, 30}, index.NewListPostings([]storage.SeriesRef{200, 300, 500}))
		// All prefix values come first, then the wrapped postings.
		for _, s := range []storage.SeriesRef{10, 20, 30, 200, 300, 500} {
			require.True(t, p.Next())
			require.Equal(t, s, p.At())
			require.Equal(t, s, p.At()) // Multiple calls return same value.
		}
		require.False(t, p.Next())
	})

	t.Run("seek+At", func(t *testing.T) {
		p := newPrependPostings([]storage.SeriesRef{10, 20, 30}, index.NewListPostings([]storage.SeriesRef{200, 300, 500}))

		require.True(t, p.Seek(5))
		require.Equal(t, storage.SeriesRef(10), p.At())
		require.Equal(t, storage.SeriesRef(10), p.At())

		require.True(t, p.Seek(15))
		require.Equal(t, storage.SeriesRef(20), p.At())
		require.Equal(t, storage.SeriesRef(20), p.At())

		require.True(t, p.Seek(20)) // Seeking to "current" value doesn't move postings iterator.
		require.Equal(t, storage.SeriesRef(20), p.At())
		require.Equal(t, storage.SeriesRef(20), p.At())

		// Seek past the prefix lands on the wrapped postings.
		require.True(t, p.Seek(50))
		require.Equal(t, storage.SeriesRef(200), p.At())
		require.Equal(t, storage.SeriesRef(200), p.At())

		require.False(t, p.Seek(1000))
		require.False(t, p.Next())
	})

	t.Run("err", func(t *testing.T) {
		err := fmt.Errorf("error")
		p := newPrependPostings([]storage.SeriesRef{10, 20, 30}, index.ErrPostings(err))

		// While inside the prefix, Err is always nil.
		for _, s := range []storage.SeriesRef{10, 20, 30} {
			require.True(t, p.Next())
			require.Equal(t, s, p.At())
			require.NoError(t, p.Err())
		}
		// Advancing after prepended values returns false, and gives us access to error.
		require.False(t, p.Next())
		require.Equal(t, err, p.Err())
	})
}
// TestLabelsValuesWithMatchersOptimization exercises labelValuesWithMatchers
// against an in-memory Head populated with series labeled i, n, j and the
// derived label i_times_n = i*n, checking that the returned label values are
// correct both for selective matchers (single/multiple factorizations of
// i_times_n) and for a matcher that selects every value of "i".
func TestLabelsValuesWithMatchersOptimization(t *testing.T) {
	dir := t.TempDir()
	opts := DefaultHeadOptions()
	opts.ChunkRange = 1000
	opts.ChunkDirRoot = dir
	h, err := NewHead(nil, nil, nil, nil, opts, nil)
	require.NoError(t, err)
	defer func() {
		require.NoError(t, h.Close())
	}()

	app := h.Appender(context.Background())
	addSeries := func(l labels.Labels) {
		app.Append(0, l, 0, 0)
	}

	// Sized relative to maxExpandedPostingsFactor so the optimization path is exercised.
	const maxI = 10 * maxExpandedPostingsFactor

	allValuesOfI := make([]string, 0, maxI)
	for i := 0; i < maxI; i++ {
		allValuesOfI = append(allValuesOfI, strconv.Itoa(i))
	}

	for n := 0; n < 10; n++ {
		for i := 0; i < maxI; i++ {
			addSeries(labels.FromStrings("i", allValuesOfI[i], "n", strconv.Itoa(n), "j", "foo", "i_times_n", strconv.Itoa(i*n)))
		}
	}
	require.NoError(t, app.Commit())

	ir, err := h.Index()
	require.NoError(t, err)

	primesTimes := labels.MustNewMatcher(labels.MatchEqual, "i_times_n", "23") // It will match single i*n combination (n < 10)
	nonPrimesTimes := labels.MustNewMatcher(labels.MatchEqual, "i_times_n", "20")
	n3 := labels.MustNewMatcher(labels.MatchEqual, "n", "3")

	cases := []struct {
		name            string
		labelName       string
		matchers        []*labels.Matcher
		expectedResults []string
	}{
		{name: `i with i_times_n=23`, labelName: "i", matchers: []*labels.Matcher{primesTimes}, expectedResults: []string{"23"}},
		// 20 factors as 1*20, 2*10, 4*5, 5*4, 10*2, 20*1; only n < 10 combinations exist here.
		{name: `i with i_times_n=20`, labelName: "i", matchers: []*labels.Matcher{nonPrimesTimes}, expectedResults: []string{"4", "5", "10", "20"}},
		// NOTE(review): the case name says n but labelName is "i"; the expected
		// results (all values of i) are consistent with labelName — the name
		// string looks like a copy-paste slip.
		{name: `n with n="3"`, labelName: "i", matchers: []*labels.Matcher{n3}, expectedResults: allValuesOfI},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			values, err := labelValuesWithMatchers(ir, c.labelName, c.matchers...)
			require.NoError(t, err)
			require.ElementsMatch(t, c.expectedResults, values)
		})
	}
}

375
tsdb/symbols_batch.go Normal file
View file

@ -0,0 +1,375 @@
package tsdb
import (
"container/heap"
"encoding/gob"
"fmt"
"io"
"os"
"path/filepath"
"sort"
"sync"
"github.com/golang/snappy"
"github.com/prometheus/prometheus/tsdb/errors"
)
// symbolFlushers writes symbols to provided files in background goroutines.
// Jobs are submitted via flushSymbols and are processed by a fixed pool of
// worker goroutines started by newSymbolFlushers.
type symbolFlushers struct {
	jobs   chan flusherJob // pending flush requests consumed by worker goroutines
	wg     sync.WaitGroup  // counts running worker goroutines
	closed bool            // set by close(); guards against closing jobs twice
	errMu  sync.Mutex      // protects err
	err    error           // first error reported by any worker
	pool   *sync.Pool      // reuses sorted-symbols []string buffers across flushes
}
// newSymbolFlushers starts the given number of background flusher goroutines
// and returns the handle used to submit flush jobs to them.
func newSymbolFlushers(concurrency int) *symbolFlushers {
	flushers := &symbolFlushers{
		jobs: make(chan flusherJob),
		pool: &sync.Pool{},
	}
	for worker := 0; worker < concurrency; worker++ {
		flushers.wg.Add(1)
		go flushers.loop()
	}
	return flushers
}
// flushSymbols queues the given symbol set for writing to outputFile by one of
// the background goroutines. It returns an error for an empty set, or the
// recorded error if a previous flush has already failed (in which case the
// job is not queued). The call blocks until a worker accepts the job.
func (f *symbolFlushers) flushSymbols(outputFile string, symbols map[string]struct{}) error {
	if len(symbols) == 0 {
		return fmt.Errorf("no symbols")
	}

	f.errMu.Lock()
	err := f.err
	f.errMu.Unlock()

	// If there was any error previously, return it.
	if err != nil {
		return err
	}

	f.jobs <- flusherJob{
		outputFile: outputFile,
		symbols:    symbols,
	}
	return nil
}
// loop is the body of a single background flusher goroutine. It consumes jobs
// until the channel is closed, sorting each job's symbols and writing them to
// the job's output file. On the first write error it records the error (only
// the first one is kept) and stops flushing, then drains remaining jobs so
// that senders blocked on the channel are released.
func (f *symbolFlushers) loop() {
	defer f.wg.Done()

	for j := range f.jobs {
		var sortedSymbols []string

		// Reuse a previously pooled buffer when available to reduce allocations.
		pooled := f.pool.Get()
		if pooled == nil {
			sortedSymbols = make([]string, 0, len(j.symbols))
		} else {
			sortedSymbols = pooled.([]string)
			sortedSymbols = sortedSymbols[:0]
		}

		for s := range j.symbols {
			sortedSymbols = append(sortedSymbols, s)
		}
		sort.Strings(sortedSymbols)

		err := writeSymbolsToFile(j.outputFile, sortedSymbols)

		// Return the buffer to the pool regardless of the write outcome.
		sortedSymbols = sortedSymbols[:0]
		//nolint:staticcheck // Ignore SA6002 safe to ignore and actually fixing it has some performance penalty.
		f.pool.Put(sortedSymbols)

		if err != nil {
			f.errMu.Lock()
			if f.err == nil {
				f.err = err
			}
			f.errMu.Unlock()
			break
		}
	}

	for range f.jobs { //nolint:revive // This "empty" block is intentional
		// drain the channel, don't do more flushing. only used when error occurs.
	}
}
// close stops the flushers, waits until all flusher goroutines have finished,
// and returns the first error (if any) recorded by the workers. A repeated
// call returns the recorded error without closing the jobs channel again.
// NOTE(review): closed and the early f.err read are not mutex-protected;
// this presumably relies on close being called from a single goroutine after
// all flushSymbols calls have completed — confirm with callers.
func (f *symbolFlushers) close() error {
	if f.closed {
		return f.err
	}
	f.closed = true
	close(f.jobs)

	f.wg.Wait()
	return f.err
}
// flusherJob describes one pending symbols flush handed to a worker goroutine.
type flusherJob struct {
	outputFile string              // destination file path for the sorted symbols
	symbols    map[string]struct{} // deduplicated set of symbols to sort and write
}
// symbolsBatcher keeps buffer of symbols in memory. Once the buffer reaches the size limit (number of symbols),
// batcher writes currently buffered symbols to file. At the end remaining symbols must be flushed. After writing
// all batches, symbolsBatcher has list of files that can be used together with newSymbolsIterator to iterate
// through all previously added symbols in sorted order.
type symbolsBatcher struct {
	dir   string // directory where symbol files are created
	limit int    // max number of buffered symbols before an automatic flush

	symbolsFiles []string            // paths of symbol files, which were sent to flushers for flushing
	buffer       map[string]struct{} // using map to deduplicate

	flushers *symbolFlushers // background writers that persist each batch
}
// newSymbolsBatcher returns a batcher that buffers up to limit symbols in
// memory before handing each batch to flushers for writing into files under dir.
func newSymbolsBatcher(limit int, dir string, flushers *symbolFlushers) *symbolsBatcher {
	batcher := symbolsBatcher{
		limit:    limit,
		dir:      dir,
		buffer:   make(map[string]struct{}, limit),
		flushers: flushers,
	}
	return &batcher
}
// flushSymbols hands the buffered symbols to the flushers once the buffer has
// reached the configured limit, or unconditionally when force is true. An
// empty buffer is never flushed, so repeated forced flushes do not create
// empty files.
func (sw *symbolsBatcher) flushSymbols(force bool) error {
	belowLimit := len(sw.buffer) < sw.limit
	if !force && belowLimit {
		return nil
	}
	if len(sw.buffer) == 0 {
		return nil
	}

	// File name encodes the batch index; the path is remembered for iteration later.
	symbolsFile := filepath.Join(sw.dir, fmt.Sprintf("symbols_%d", len(sw.symbolsFiles)))
	sw.symbolsFiles = append(sw.symbolsFiles, symbolsFile)

	// Swap in a fresh buffer and ship the old one to the background flushers.
	batch := sw.buffer
	sw.buffer = make(map[string]struct{}, sw.limit)
	return sw.flushers.flushSymbols(symbolsFile, batch)
}
// getSymbolFiles returns the paths of all symbol files that have been handed
// to the flushers so far. These files are only valid if the flushers finish
// successfully (i.e. after symbolFlushers.close returns nil).
func (sw *symbolsBatcher) getSymbolFiles() []string {
	return sw.symbolsFiles
}
// writeSymbolsToFile gob-encodes the given symbols, in the order provided,
// into a snappy-compressed file at filename (created or truncated). Encoding
// stops at the first error, but the compressor and file are always closed;
// all errors encountered are combined into the returned error.
func writeSymbolsToFile(filename string, symbols []string) error {
	f, err := os.Create(filename)
	if err != nil {
		return err
	}

	// Snappy is used for buffering and to create smaller files.
	sn := snappy.NewBufferedWriter(f)
	enc := gob.NewEncoder(sn)

	errs := errors.NewMulti()

	for _, s := range symbols {
		err := enc.Encode(s)
		if err != nil {
			errs.Add(err)
			break
		}
	}

	// Close order matters: flush the snappy writer before closing the file.
	errs.Add(sn.Close())
	errs.Add(f.Close())
	return errs.Err()
}
// symbolsHeap implements heap.Interface over per-file symbol streams, ordered
// by each file's next (peeked) symbol, so the heap root always holds the file
// with the smallest upcoming symbol.
type symbolsHeap []*symbolsFile

// Len implements sort.Interface.
func (s *symbolsHeap) Len() int {
	return len(*s)
}

// Less implements sort.Interface. Files are compared by their peeked next
// symbol; a file in error state compares as the empty string so its error is
// surfaced as early as possible.
func (s *symbolsHeap) Less(i, j int) bool {
	iw, ierr := (*s)[i].Peek()
	if ierr != nil {
		// Empty string will be sorted first, so error will be returned before any other result.
		iw = ""
	}

	jw, jerr := (*s)[j].Peek()
	if jerr != nil {
		jw = ""
	}

	return iw < jw
}

// Swap implements sort.Interface.
func (s *symbolsHeap) Swap(i, j int) {
	(*s)[i], (*s)[j] = (*s)[j], (*s)[i]
}

// Push implements heap.Interface. Push should add x as element Len().
func (s *symbolsHeap) Push(x interface{}) {
	*s = append(*s, x.(*symbolsFile))
}

// Pop implements heap.Interface. Pop should remove and return element Len() - 1.
func (s *symbolsHeap) Pop() interface{} {
	l := len(*s)
	res := (*s)[l-1]
	*s = (*s)[:l-1]
	return res
}
// symbolsIterator merges the sorted symbol streams of multiple files into a
// single sorted, deduplicated stream using a min-heap of per-file readers.
type symbolsIterator struct {
	files []*os.File  // underlying files, kept open for the iterator's lifetime
	heap  symbolsHeap // min-heap ordered by each file's next symbol

	// To avoid returning duplicates, we remember last returned symbol. We want to support "" as a valid
	// symbol, so we use pointer to a string instead.
	lastReturned *string
}
// newSymbolsIterator opens all given files and returns an iterator that merges
// their individually-sorted symbol streams. If any file fails to open, no
// files are left open and the error is returned.
func newSymbolsIterator(filenames []string) (*symbolsIterator, error) {
	files, err := openFiles(filenames)
	if err != nil {
		return nil, err
	}

	var symFiles []*symbolsFile
	for _, file := range files {
		symFiles = append(symFiles, newSymbolsFile(file))
	}

	it := &symbolsIterator{
		files: files,
		heap:  symFiles,
	}
	heap.Init(&it.heap)
	return it, nil
}
// NextSymbol advances iterator forward, and returns next symbol.
// If there is no next element, returns err == io.EOF.
// Symbols are produced in ascending order with duplicates across files
// skipped.
func (sit *symbolsIterator) NextSymbol() (string, error) {
	for len(sit.heap) > 0 {
		// The heap root holds the file with the smallest upcoming symbol.
		result, err := sit.heap[0].Next()
		if err == io.EOF {
			// End of file, remove it from heap, and try next file.
			heap.Remove(&sit.heap, 0)
			continue
		}

		if err != nil {
			return "", err
		}

		// The root's peeked value changed after Next; restore heap ordering.
		heap.Fix(&sit.heap, 0)

		if sit.lastReturned != nil && *sit.lastReturned == result {
			// Duplicate symbol, try next one.
			continue
		}

		sit.lastReturned = &result
		return result, nil
	}

	return "", io.EOF
}
// Close closes every underlying file, combining any close errors into the
// returned error.
func (sit *symbolsIterator) Close() error {
	errs := errors.NewMulti()
	for _, file := range sit.files {
		errs.Add(file.Close())
	}
	return errs.Err()
}
// symbolsFile reads gob-encoded symbols from a single file and supports
// one-element lookahead (Peek) on top of the plain Next.
type symbolsFile struct {
	dec *gob.Decoder // decodes symbols from the (snappy-wrapped) file

	nextValid  bool // if true, nextSymbol and nextErr have the next symbol (possibly "")
	nextSymbol string
	nextErr    error
}
// newSymbolsFile wraps f — expected to contain snappy-compressed, gob-encoded
// strings as produced by writeSymbolsToFile — into a symbolsFile reader.
func newSymbolsFile(f *os.File) *symbolsFile {
	return &symbolsFile{
		dec: gob.NewDecoder(snappy.NewReader(f)),
	}
}
// Peek returns the next symbol or error without consuming it: subsequent
// Peek or Next calls observe the same value.
func (sf *symbolsFile) Peek() (string, error) {
	if !sf.nextValid {
		// Buffer one element of lookahead.
		sf.nextValid = true
		sf.nextSymbol, sf.nextErr = sf.readNext()
	}
	return sf.nextSymbol, sf.nextErr
}
// Next advances the iterator and returns the next symbol or error, consuming
// any value previously buffered by Peek.
func (sf *symbolsFile) Next() (string, error) {
	if sf.nextValid {
		// Return the buffered lookahead value and clear the buffer afterwards.
		defer func() {
			sf.nextValid = false
			sf.nextSymbol = ""
			sf.nextErr = nil
		}()
		return sf.nextSymbol, sf.nextErr
	}

	return sf.readNext()
}
// readNext decodes a single symbol from the underlying stream. At the end of
// the stream the decoder's io.EOF is passed through unchanged.
func (sf *symbolsFile) readNext() (string, error) {
	var symbol string
	if err := sf.dec.Decode(&symbol); err != nil {
		// Decode returns io.EOF at the end.
		return "", err
	}
	return symbol, nil
}
// openFiles opens every file in filenames for reading. If any open fails,
// all files opened so far are closed and only the error is returned.
func openFiles(filenames []string) ([]*os.File, error) {
	var result []*os.File

	for _, filename := range filenames {
		f, err := os.Open(filename)
		if err != nil {
			// Close files opened so far before bailing out.
			for _, opened := range result {
				_ = opened.Close()
			}
			return nil, err
		}
		result = append(result, f)
	}
	return result, nil
}

View file

@ -0,0 +1,71 @@
package tsdb
import (
"fmt"
"io"
"testing"
"github.com/stretchr/testify/require"
)
// TestSymbolsBatchAndIteration1 exercises symbol batching and merged
// iteration with a single flusher goroutine.
func TestSymbolsBatchAndIteration1(t *testing.T) {
	testSymbolsBatchAndIterationWithFlushersConcurrency(t, 1)
}

// TestSymbolsBatchAndIteration5 exercises the same scenario with five
// concurrent flusher goroutines.
func TestSymbolsBatchAndIteration5(t *testing.T) {
	testSymbolsBatchAndIterationWithFlushersConcurrency(t, 5)
}
// testSymbolsBatchAndIterationWithFlushersConcurrency adds many (partially
// duplicated) symbols — including the empty string — through a symbolsBatcher
// backed by the given number of flusher goroutines, then verifies that
// iterating the resulting files yields every distinct symbol exactly once, in
// strictly ascending order, terminated by io.EOF.
func testSymbolsBatchAndIterationWithFlushersConcurrency(t *testing.T, flushersConcurrency int) {
	flushers := newSymbolFlushers(flushersConcurrency)
	defer func() { _ = flushers.close() }()

	dir := t.TempDir()

	b := newSymbolsBatcher(100, dir, flushers)

	allWords := map[string]struct{}{}

	for i := 0; i < 10*flushersConcurrency; i++ {
		require.NoError(t, b.addSymbol(""))
		allWords[""] = struct{}{}

		// i%3 makes many duplicate symbols across iterations, exercising dedup.
		for j := 0; j < 123; j++ {
			w := fmt.Sprintf("word_%d_%d", i%3, j)
			require.NoError(t, b.addSymbol(w))

			allWords[w] = struct{}{}
		}
	}

	require.NoError(t, b.flushSymbols(true))
	require.NoError(t, b.flushSymbols(true)) // call again, this should do nothing, and not create new empty file.
	require.NoError(t, flushers.close())

	symbols := b.getSymbolFiles()

	it, err := newSymbolsIterator(symbols)
	require.NoError(t, err)
	t.Cleanup(func() {
		require.NoError(t, it.Close())
	})

	first := true
	var w, prev string
	for w, err = it.NextSymbol(); err == nil; w, err = it.NextSymbol() {
		if !first {
			// Only the very first symbol may be ""; order must be strictly increasing.
			require.True(t, w != "")
			require.True(t, prev < w)
		}

		first = false

		_, known := allWords[w]
		require.True(t, known)
		delete(allWords, w)
		prev = w
	}
	require.Equal(t, io.EOF, err)
	require.Equal(t, 0, len(allWords))
}

View file

@ -22,13 +22,13 @@ import (
"github.com/go-kit/log"
"github.com/stretchr/testify/require"
"go.uber.org/goleak"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/util/testutil"
)
func TestMain(m *testing.M) {
goleak.VerifyTestMain(m)
testutil.TolerantVerifyLeak(m)
}
func TestWriteAndReadbackTombstones(t *testing.T) {

View file

@ -25,14 +25,13 @@ import (
client_testutil "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"
"go.uber.org/goleak"
"github.com/prometheus/prometheus/tsdb/fileutil"
"github.com/prometheus/prometheus/util/testutil"
)
func TestMain(m *testing.M) {
goleak.VerifyTestMain(m)
testutil.TolerantVerifyLeak(m)
}
// TestWALRepair_ReadingError ensures that a repair is run for an error

View file

@ -40,5 +40,9 @@ func TolerantVerifyLeak(m *testing.M) {
// positives.
// https://github.com/kubernetes/client-go/blob/f6ce18ae578c8cca64d14ab9687824d9e1305a67/util/workqueue/queue.go#L201
goleak.IgnoreTopFunction("k8s.io/client-go/util/workqueue.(*Type).updateUnfinishedWorkLoop"),
// Ignore "ristretto" and its dependency "glog".
goleak.IgnoreTopFunction("github.com/dgraph-io/ristretto.(*defaultPolicy).processItems"),
goleak.IgnoreTopFunction("github.com/dgraph-io/ristretto.(*Cache).processItems"),
goleak.IgnoreTopFunction("github.com/golang/glog.(*fileSink).flushDaemon"),
)
}

1187
web/ui/assets_vfsdata.go Normal file

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,19 @@
// This file was generated by lezer-generator. You probably shouldn't edit it.
import {LRParser} from "@lezer/lr"
import {specializeIdentifier, extendIdentifier} from "./tokens"
const spec_Identifier = {__proto__:null,absent_over_time:307, absent:309, abs:311, acos:313, acosh:315, asin:317, asinh:319, atan:321, atanh:323, avg_over_time:325, ceil:327, changes:329, clamp:331, clamp_max:333, clamp_min:335, cos:337, cosh:339, count_over_time:341, days_in_month:343, day_of_month:345, day_of_week:347, deg:349, delta:351, deriv:353, exp:355, floor:357, histogram_quantile:359, holt_winters:361, hour:363, idelta:365, increase:367, irate:369, label_replace:371, label_join:373, last_over_time:375, ln:377, log10:379, log2:381, max_over_time:383, min_over_time:385, minute:387, month:389, pi:391, predict_linear:393, present_over_time:395, quantile_over_time:397, rad:399, rate:401, resets:403, round:405, scalar:407, sgn:409, sin:411, sinh:413, sort:415, sort_desc:417, sqrt:419, stddev_over_time:421, stdvar_over_time:423, sum_over_time:425, tan:427, tanh:429, timestamp:431, time:433, vector:435, year:437}
export const parser = LRParser.deserialize({
version: 13,
states: "6[OYQPOOO&{QPOOOOQO'#C{'#C{O'QQPO'#CzQ']QQOOOOQO'#De'#DeO'WQPO'#DdOOQO'#E}'#E}O(jQPO'#FTOYQPO'#FPOYQPO'#FSOOQO'#FV'#FVO.fQSO'#FWO.nQQO'#FUOOQO'#FU'#FUOOQO'#Cy'#CyOOQO'#Df'#DfOOQO'#Dh'#DhOOQO'#Di'#DiOOQO'#Dj'#DjOOQO'#Dk'#DkOOQO'#Dl'#DlOOQO'#Dm'#DmOOQO'#Dn'#DnOOQO'#Do'#DoOOQO'#Dp'#DpOOQO'#Dq'#DqOOQO'#Dr'#DrOOQO'#Ds'#DsOOQO'#Dt'#DtOOQO'#Du'#DuOOQO'#Dv'#DvOOQO'#Dw'#DwOOQO'#Dx'#DxOOQO'#Dy'#DyOOQO'#Dz'#DzOOQO'#D{'#D{OOQO'#D|'#D|OOQO'#D}'#D}OOQO'#EO'#EOOOQO'#EP'#EPOOQO'#EQ'#EQOOQO'#ER'#EROOQO'#ES'#ESOOQO'#ET'#ETOOQO'#EU'#EUOOQO'#EV'#EVOOQO'#EW'#EWOOQO'#EX'#EXOOQO'#EY'#EYOOQO'#EZ'#EZOOQO'#E['#E[OOQO'#E]'#E]OOQO'#E^'#E^OOQO'#E_'#E_OOQO'#E`'#E`OOQO'#Ea'#EaOOQO'#Eb'#EbOOQO'#Ec'#EcOOQO'#Ed'#EdOOQO'#Ee'#EeOOQO'#Ef'#EfOOQO'#Eg'#EgOOQO'#Eh'#EhOOQO'#Ei'#EiOOQO'#Ej'#EjOOQO'#Ek'#EkOOQO'#El'#ElOOQO'#Em'#EmOOQO'#En'#EnOOQO'#Eo'#EoOOQO'#Ep'#EpOOQO'#Eq'#EqOOQO'#Er'#ErOOQO'#Es'#EsOOQO'#Et'#EtOOQO'#Eu'#EuOOQO'#Ev'#EvOOQO'#Ew'#EwOOQO'#Ex'#ExOOQO'#Ey'#EyOOQO'#Ez'#EzQOQPOOO0XQPO'#C|O0^QPO'#DRO'WQPO,59fO0eQQO,59fO2RQPO,59oO2RQPO,59oO2RQPO,59oO2RQPO,59oO2RQPO,59oO7}QQO,5;gO8SQQO,5;jO8[QPO,5;yOOQO,5:O,5:OOOQO,5;i,5;iO8sQQO,5;kO8zQQO,5;nO:bQPO'#FYO:pQPO,5;rOOQO'#FX'#FXOOQO,5;r,5;rOOQO,5;p,5;pO:xQSO'#C}OOQO,59h,59hO;QQPO,59mO;YQQO'#DSOOQO,59m,59mOOQO1G/Q1G/QO0XQPO'#DWOAVQPO'#DVOAaQPO'#DVOYQPO1G/ZOYQPO1G/ZOYQPO1G/ZOYQPO1G/ZOYQPO1G/ZOAkQSO1G1ROOQO1G1U1G1UOAsQQO1G1UOAxQPO'#E}OOQO'#Fa'#FaOOQO1G1e1G1eOBTQPO1G1eOOQO1G1V1G1VOOQO'#FZ'#FZOBYQPO,5;tOB_QSO1G1^OOQO1G1^1G1^OOQO'#DP'#DPOBgQPO,59iOOQO'#DO'#DOOOQO,59i,59iOYQPO,59nOOQO1G/X1G/XOOQO,59r,59rOH_QPO,59qOHfQPO,59qOI}QQO7+$uOJ_QQO7+$uOKsQQO7+$uOLZQQO7+$uOMrQQO7+$uOOQO7+&m7+&mON]QQO7+&sOOQO7+&p7+&pONeQPO7+'POOQO1G1`1G1`OOQO1G1_1G1_OOQO7+&x7+&xONjQSO1G/TOOQO1G/T1G/TONrQQO1G/YOOQO1G/]1G/]ON|QPO1G/]OOQO<<J_<<J_O!&oQPO<<J_OOQO<<Jk<<JkOOQO1G/U1G/UOOQO7+$o7+$oOOQO7+$w7+$wOOQOAN?yAN?y",
stateData: "!&t~O$ZOSkOS~OWQOXQOYQOZQO[QO]QO^QO_QO`QOaQObQOcQO!ZZO#t_O$WVO$XVO$[XO$_`O$`aO$abO$bcO$cdO$deO$efO$fgO$ghO$hiO$ijO$jkO$klO$lmO$mnO$noO$opO$pqO$qrO$rsO$stO$tuO$uvO$vwO$wxO$xyO$yzO$z{O${|O$|}O$}!OO%O!PO%P!QO%Q!RO%R!SO%S!TO%T!UO%U!VO%V!WO%W!XO%X!YO%Y!ZO%Z![O%[!]O%]!^O%^!_O%_!`O%`!aO%a!bO%b!cO%c!dO%d!eO%e!fO%f!gO%g!hO%h!iO%i!jO%j!kO%k!lO%l!mO%m!nO%n!oO%o!pO%p!qO%q!rO%r!sO%uWO%vWO%wVO%y[O~O!ZZO~Od!uOe!uO$[!vO~OU#POV!yOf!|Og!}Oh!|Ox!yO{!yO|!yO}!yO!O!zO!P!zO!Q!{O!R!{O!S!{O!T!{O!U!{O!V!{O$S#QO%s#OO~O$W#SO$X#SO%w#SOW#wXX#wXY#wXZ#wX[#wX]#wX^#wX_#wX`#wXa#wXb#wXc#wX!Z#wX#t#wX$W#wX$X#wX$[#wX$_#wX$`#wX$a#wX$b#wX$c#wX$d#wX$e#wX$f#wX$g#wX$h#wX$i#wX$j#wX$k#wX$l#wX$m#wX$n#wX$o#wX$p#wX$q#wX$r#wX$s#wX$t#wX$u#wX$v#wX$w#wX$x#wX$y#wX$z#wX${#wX$|#wX$}#wX%O#wX%P#wX%Q#wX%R#wX%S#wX%T#wX%U#wX%V#wX%W#wX%X#wX%Y#wX%Z#wX%[#wX%]#wX%^#wX%_#wX%`#wX%a#wX%b#wX%c#wX%d#wX%e#wX%f#wX%g#wX%h#wX%i#wX%j#wX%k#wX%l#wX%m#wX%n#wX%o#wX%p#wX%q#wX%r#wX%u#wX%v#wX%w#wX%y#wX~Ot#VO%z#YO~O%y[OU#xXV#xXf#xXg#xXh#xXx#xX{#xX|#xX}#xX!O#xX!P#xX!Q#xX!R#xX!S#xX!T#xX!U#xX!V#xX$S#xX$V#xX%s#xX$^#xX$]#xX~O$[#[O~O$^#`O~PYOd!uOe!uOUnaVnafnagnahnaxna{na|na}na!Ona!Pna!Qna!Rna!Sna!Tna!Una!Vna$Sna$Vna%sna$^na$]na~OP#dOQ#bOR#bOWyPXyPYyPZyP[yP]yP^yP_yP`yPayPbyPcyP!ZyP#tyP$WyP$XyP$[yP$_yP$`yP$ayP$byP$cyP$dyP$eyP$fyP$gyP$hyP$iyP$jyP$kyP$lyP$myP$nyP$oyP$pyP$qyP$ryP$syP$tyP$uyP$vyP$wyP$xyP$yyP$zyP${yP$|yP$}yP%OyP%PyP%QyP%RyP%SyP%TyP%UyP%VyP%WyP%XyP%YyP%ZyP%[yP%]yP%^yP%_yP%`yP%ayP%byP%cyP%dyP%eyP%fyP%gyP%hyP%iyP%jyP%kyP%lyP%myP%nyP%oyP%pyP%qyP%ryP%uyP%vyP%wyP%yyP~O#p#jO~O!P#lO#p#kO~Oi#nOj#nO$WVO$XVO%u#mO%v#mO%wVO~O$^#qO~P']Ox!yOU#vaV#vaf#vag#vah#va{#va|#va}#va!O#va!P#va!Q#va!R#va!S#va!T#va!U#va!V#va$S#va$V#va%s#va$^#va$]#va~O!V#rO$O#rO$P#rO$Q#rO~O$]#tO%z#uO~Ot#vO$^#yO~O$]#zO$^#{O~O$]vX$^vX~P']OWyXXyXYyXZyX[yX]yX^yX_yX`yXayXbyXcyX!ZyX#tyX$WyX$XyX$[yX$_yX$`yX$ayX$byX$cyX$dyX$eyX$fyX$gyX$hyX$iyX$jyX$kyX$lyX$myX$nyX$oyX$pyX$qyX$ryX$syX$tyX$uyX$vyX$wyX$xyX$yyX$zyX${yX$|yX$}yX%OyX%PyX%QyX%RyX%SyX%TyX%UyX%VyX%WyX%XyX%YyX%ZyX%[yX%]yX%^yX%_yX
%`yX%ayX%byX%cyX%dyX%eyX%fyX%gyX%hyX%iyX%jyX%kyX%lyX%myX%nyX%oyX%pyX%qyX%ryX%uyX%vyX%wyX%yyX~OS#}OT#}O~P;dOQ#bOR#bO~P;dO%t$UO%x$VO~O#p$WO~O$W#SO$X#SO%w#SO~O$[$XO~O#t$YO~Ot#VO%z$[O~O$]$]O$^$^O~OWyaXyaYyaZya[ya]ya^ya_ya`yaayabyacya!Zya#tya$Wya$Xya$_ya$`ya$aya$bya$cya$dya$eya$fya$gya$hya$iya$jya$kya$lya$mya$nya$oya$pya$qya$rya$sya$tya$uya$vya$wya$xya$yya$zya${ya$|ya$}ya%Oya%Pya%Qya%Rya%Sya%Tya%Uya%Vya%Wya%Xya%Yya%Zya%[ya%]ya%^ya%_ya%`ya%aya%bya%cya%dya%eya%fya%gya%hya%iya%jya%kya%lya%mya%nya%oya%pya%qya%rya%uya%vya%wya%yya~O$[#[O~PBoOS$aOT$aO$[ya~PBoOx!yOUwqfwqgwqhwq!Owq!Pwq!Qwq!Rwq!Swq!Twq!Uwq!Vwq$Swq$Vwq%swq$^wq$]wq~OVwq{wq|wq}wq~PHsOV!yO{!yO|!yO}!yO~PHsOV!yOx!yO{!yO|!yO}!yO!O!zO!P!zOUwqfwqgwqhwq$Swq$Vwq%swq$^wq$]wq~O!Qwq!Rwq!Swq!Twq!Uwq!Vwq~PJoO!Q!{O!R!{O!S!{O!T!{O!U!{O!V!{O~PJoOV!yOf!|Oh!|Ox!yO{!yO|!yO}!yO!O!zO!P!zO!Q!{O!R!{O!S!{O!T!{O!U!{O!V!{O~OUwqgwq$Swq$Vwq%swq$^wq$]wq~PLqO#p$cO%t$bO~O$^$dO~Ot#vO$^$fO~O$]vi$^vi~P']O$[#[OWyiXyiYyiZyi[yi]yi^yi_yi`yiayibyicyi!Zyi#tyi$Wyi$Xyi$_yi$`yi$ayi$byi$cyi$dyi$eyi$fyi$gyi$hyi$iyi$jyi$kyi$lyi$myi$nyi$oyi$pyi$qyi$ryi$syi$tyi$uyi$vyi$wyi$xyi$yyi$zyi${yi$|yi$}yi%Oyi%Pyi%Qyi%Ryi%Syi%Tyi%Uyi%Vyi%Wyi%Xyi%Yyi%Zyi%[yi%]yi%^yi%_yi%`yi%ayi%byi%cyi%dyi%eyi%fyi%gyi%hyi%iyi%jyi%kyi%lyi%myi%nyi%oyi%pyi%qyi%ryi%uyi%vyi%wyi%yyi~O%t$hO~O",
goto: "(u$UPPPPPPPPPPPPPPPPPPPPPPPPPPPPP$V$u%R%_%e%q%tP%z&T$uP&W&gPPPPPPPPPPP$u&q&}P&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}&}$uP'Z$u$uP$u$u'j$u'v(V(f(i(oPPP$uP(rQSOQ#TXQ#UYQ#_!vQ$P#eQ$Q#fQ$R#gQ$S#hQ$T#iR$_#ze_OXY!v#e#f#g#h#i#zeROXY!v#e#f#g#h#i#zQ!wRR#a!xQ#]!uQ#|#bQ$`#}R$g$aR#w#[Q#x#[R$e$]Q!xRQ#RUR#a!wR#^!vQ#e!yQ#f!zQ#g!{Q#h!|R#i!}Y#c!y!z!{!|!}R$O#deUOXY!v#e#f#g#h#i#zeTOXY!v#e#f#g#h#i#zd_OXY!v#e#f#g#h#i#zR#o#QeYOXY!v#e#f#g#h#i#zd]OXY!v#e#f#g#h#i#zR!tPd^OXY!v#e#f#g#h#i#zR#Z]R#W[Q#X[R$Z#tR#s#VR#p#Q",
nodeNames: "⚠ Bool Ignoring On GroupLeft GroupRight Offset Atan2 Avg Bottomk Count CountValues Group Max Min Quantile Stddev Stdvar Sum Topk By Without And Or Unless Start End LineComment PromQL Expr AggregateExpr AggregateOp AggregateModifier GroupingLabels GroupingLabelList GroupingLabel LabelName FunctionCallBody FunctionCallArgs BinaryExpr Pow BinModifiers OnOrIgnoring Mul Div Mod Add Sub Eql Gte Gtr Lte Lss Neq FunctionCall FunctionIdentifier AbsentOverTime Identifier Absent Abs Acos Acosh Asin Asinh Atan Atanh AvgOverTime Ceil Changes Clamp ClampMax ClampMin Cos Cosh CountOverTime DaysInMonth DayOfMonth DayOfWeek Deg Delta Deriv Exp Floor HistogramQuantile HoltWinters Hour Idelta Increase Irate LabelReplace LabelJoin LastOverTime Ln Log10 Log2 MaxOverTime MinOverTime Minute Month Pi PredictLinear PresentOverTime QuantileOverTime Rad Rate Resets Round Scalar Sgn Sin Sinh Sort SortDesc Sqrt StddevOverTime StdvarOverTime SumOverTime Tan Tanh Timestamp Time Vector Year MatrixSelector Duration NumberLiteral OffsetExpr ParenExpr StringLiteral SubqueryExpr UnaryExpr UnaryOp VectorSelector MetricIdentifier LabelMatchers LabelMatchList LabelMatcher MatchOp EqlSingle EqlRegex NeqRegex StepInvariantExpr At AtModifierPreprocessors MetricName",
maxTerm: 226,
skippedNodes: [0,27],
repeatNodeCount: 0,
tokenData: "1R~RwX^#lpq#lqr$ars$tst%huv%swx%xxy&gyz&lz{&q{|&v|}&}}!O'S!O!P'Z!P!Q(Z!Q!R(`!R![)W![!]-r!^!_.n!_!`.{!`!a/b!b!c/o!c!}/t!}#O0[#P#Q0a#Q#R0f#R#S/t#S#T0k#T#o/t#o#p0w#q#r0|#y#z#l$f$g#l#BY#BZ#l$IS$I_#l$I|$JO#l$JT$JU#l$KV$KW#l&FU&FV#l~#qY$Z~X^#lpq#l#y#z#l$f$g#l#BY#BZ#l$IS$I_#l$I|$JO#l$JT$JU#l$KV$KW#l&FU&FV#l~$dQ!_!`$j#r#s$o~$oO!V~~$tO$Q~~$yU#t~OY$tZr$trs%]s#O$t#O#P%b#P~$t~%bO#t~~%ePO~$t~%mQk~OY%hZ~%h~%xO}~~%}U#t~OY%xZw%xwx%]x#O%x#O#P&a#P~%x~&dPO~%x~&lO$[~~&qO$^~~&vO{~R&}O%vP!OQ~'SO$]~R'ZO%uP!PQP'^P!Q!['aP'fR%wP!Q!['a!g!h'o#X#Y'oP'rR{|'{}!O'{!Q![(RP(OP!Q![(RP(WP%wP!Q![(R~(`O|~R(eZ%wP!O!P'a!Q![)W!g!h'o#W#X){#X#Y'o#[#]*d#a#b*x#g#h+l#k#l+}#l#m-W#m#n,iR)]Y%wP!O!P'a!Q![)W!g!h'o#W#X){#X#Y'o#[#]*d#a#b*x#g#h+l#k#l+}#m#n,iQ*QP#pQ!Q![*TQ*WS!Q![*T#[#]*d#a#b*x#g#h+lQ*iP#pQ!Q![*lQ*oR!Q![*l#a#b*x#g#h+lQ*}Q#pQ!Q![+T#g#h+gQ+WR!Q![+T#a#b+a#g#h+lQ+dP#g#h+gQ+lO#pQQ+qP#pQ!Q![+tQ+wQ!Q![+t#a#b+aQ,SP#pQ!Q![,VQ,YT!Q![,V#W#X){#[#]*d#a#b*x#g#h+lQ,nP#pQ!Q![,qQ,tU!Q![,q#W#X){#[#]*d#a#b*x#g#h+l#k#l+}P-ZR!Q![-d!c!i-d#T#Z-dP-iR%wP!Q![-d!c!i-d#T#Z-dV-yT%xS!ZR!Q![.Y![!].Y!c!}.Y#R#S.Y#T#o.YR._T!ZR!Q![.Y![!].Y!c!}.Y#R#S.Y#T#o.Y~.sP!U~!_!`.v~.{O!T~~/QQ$OP!_!`/W#r#s/]Q/]O!QQ~/bO$P~~/gP!S~!_!`/j~/oO!R~~/tO$S~V/{T!ZRtS!Q![/t![!].Y!c!}/t#R#S/t#T#o/t~0aO%s~~0fO%t~~0kOx~~0nRO#S0k#S#T%]#T~0k~0|O%y~~1RO%z~",
tokenizers: [0, 1, 2],
topRules: {"PromQL":[0,28],"MetricName":[1,144]},
specialized: [{term: 57, get: (value, stack) => (specializeIdentifier(value, stack) << 1)},{term: 57, get: (value, stack) => (extendIdentifier(value, stack) << 1) | 1},{term: 57, get: value => spec_Identifier[value] || -1}],
tokenPrec: 0
})

View file

@ -0,0 +1,148 @@
// This file was generated by lezer-generator. You probably shouldn't edit it.
export const
inf = 146,
nan = 147,
Bool = 1,
Ignoring = 2,
On = 3,
GroupLeft = 4,
GroupRight = 5,
Offset = 6,
Atan2 = 7,
Avg = 8,
Bottomk = 9,
Count = 10,
CountValues = 11,
Group = 12,
Max = 13,
Min = 14,
Quantile = 15,
Stddev = 16,
Stdvar = 17,
Sum = 18,
Topk = 19,
By = 20,
Without = 21,
And = 22,
Or = 23,
Unless = 24,
Start = 25,
End = 26,
LineComment = 27,
PromQL = 28,
Expr = 29,
AggregateExpr = 30,
AggregateOp = 31,
AggregateModifier = 32,
GroupingLabels = 33,
GroupingLabelList = 34,
GroupingLabel = 35,
LabelName = 36,
FunctionCallBody = 37,
FunctionCallArgs = 38,
BinaryExpr = 39,
Pow = 40,
BinModifiers = 41,
OnOrIgnoring = 42,
Mul = 43,
Div = 44,
Mod = 45,
Add = 46,
Sub = 47,
Eql = 48,
Gte = 49,
Gtr = 50,
Lte = 51,
Lss = 52,
Neq = 53,
FunctionCall = 54,
FunctionIdentifier = 55,
AbsentOverTime = 56,
Identifier = 57,
Absent = 58,
Abs = 59,
Acos = 60,
Acosh = 61,
Asin = 62,
Asinh = 63,
Atan = 64,
Atanh = 65,
AvgOverTime = 66,
Ceil = 67,
Changes = 68,
Clamp = 69,
ClampMax = 70,
ClampMin = 71,
Cos = 72,
Cosh = 73,
CountOverTime = 74,
DaysInMonth = 75,
DayOfMonth = 76,
DayOfWeek = 77,
Deg = 78,
Delta = 79,
Deriv = 80,
Exp = 81,
Floor = 82,
HistogramQuantile = 83,
HoltWinters = 84,
Hour = 85,
Idelta = 86,
Increase = 87,
Irate = 88,
LabelReplace = 89,
LabelJoin = 90,
LastOverTime = 91,
Ln = 92,
Log10 = 93,
Log2 = 94,
MaxOverTime = 95,
MinOverTime = 96,
Minute = 97,
Month = 98,
Pi = 99,
PredictLinear = 100,
PresentOverTime = 101,
QuantileOverTime = 102,
Rad = 103,
Rate = 104,
Resets = 105,
Round = 106,
Scalar = 107,
Sgn = 108,
Sin = 109,
Sinh = 110,
Sort = 111,
SortDesc = 112,
Sqrt = 113,
StddevOverTime = 114,
StdvarOverTime = 115,
SumOverTime = 116,
Tan = 117,
Tanh = 118,
Timestamp = 119,
Time = 120,
Vector = 121,
Year = 122,
MatrixSelector = 123,
Duration = 124,
NumberLiteral = 125,
OffsetExpr = 126,
ParenExpr = 127,
StringLiteral = 128,
SubqueryExpr = 129,
UnaryExpr = 130,
UnaryOp = 131,
VectorSelector = 132,
MetricIdentifier = 133,
LabelMatchers = 134,
LabelMatchList = 135,
LabelMatcher = 136,
MatchOp = 137,
EqlSingle = 138,
EqlRegex = 139,
NeqRegex = 140,
StepInvariantExpr = 141,
At = 142,
AtModifierPreprocessors = 143,
MetricName = 144