Merge pull request #569 from grafana/arve/sync-upstream

Sync with latest Prometheus upstream
Arve Knudsen 2023-11-28 16:23:18 +01:00 committed by GitHub
commit e239c5eda5
141 changed files with 5244 additions and 4421 deletions


@ -1,4 +1,8 @@
<!--
Please give your PR a title in the form "area: short description". For example "tsdb: reduce disk usage by 95%"
If your PR is to fix an issue, put "Fixes #issue-number" in the description.
Don't forget!
- Please sign CNCF's Developer Certificate of Origin and sign-off your commits by adding the -s / --signoff flag to `git commit`. See https://github.com/apps/dco for more information.
@ -7,8 +11,6 @@
- Where possible use only exported APIs for tests to simplify the review and make it as close as possible to an actual library usage.
- No tests are needed for internal implementation changes.
- Performance improvements would need a benchmark test to prove it.
- All exposed objects should have a comment.


@ -6,6 +6,7 @@ run:
skip-dirs:
# Copied it from a different source
- storage/remote/otlptranslator/prometheusremotewrite
- storage/remote/otlptranslator/prometheus
output:
sort-results: true
@ -37,12 +38,10 @@ issues:
- path: tsdb/
linters:
- errorlint
- path: util/
- path: tsdb/
text: "import 'github.com/pkg/errors' is not allowed"
linters:
- errorlint
- path: web/
linters:
- errorlint
- depguard
- linters:
- godot
source: "^// ==="
@ -62,6 +61,8 @@ linters-settings:
desc: "Use corresponding 'os' or 'io' functions instead."
- pkg: "regexp"
desc: "Use github.com/grafana/regexp instead of regexp"
- pkg: "github.com/pkg/errors"
desc: "Use 'errors' or 'fmt' instead of github.com/pkg/errors"
errcheck:
exclude-functions:
# Don't flag lines such as "io.Copy(io.Discard, resp.Body)".


@ -63,6 +63,7 @@ import (
"github.com/prometheus/prometheus/notifier"
_ "github.com/prometheus/prometheus/plugins" // Register plugins.
"github.com/prometheus/prometheus/promql"
"github.com/prometheus/prometheus/promql/parser"
"github.com/prometheus/prometheus/rules"
"github.com/prometheus/prometheus/scrape"
"github.com/prometheus/prometheus/storage"
@ -199,6 +200,9 @@ func (c *flagConfig) setFeatureListOptions(logger log.Logger) error {
case "no-default-scrape-port":
c.scrape.NoDefaultPort = true
level.Info(logger).Log("msg", "No default port will be appended to scrape targets' addresses.")
case "promql-experimental-functions":
parser.EnableExperimentalFunctions = true
level.Info(logger).Log("msg", "Experimental PromQL functions enabled.")
case "native-histograms":
c.tsdb.EnableNativeHistograms = true
// Change relevant global variables. Hacky, but it's hard to pass a new option or default to unmarshallers.
@ -419,7 +423,7 @@ func main() {
a.Flag("scrape.discovery-reload-interval", "Interval used by scrape manager to throttle target groups updates.").
Hidden().Default("5s").SetValue(&cfg.scrape.DiscoveryReloadInterval)
a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: agent, exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-at-modifier, promql-negative-offset, promql-per-step-stats, remote-write-receiver (DEPRECATED), extra-scrape-metrics, new-service-discovery-manager, auto-gomaxprocs, no-default-scrape-port, native-histograms, otlp-write-receiver. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details.").
a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: agent, exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-at-modifier, promql-negative-offset, promql-per-step-stats, promql-experimental-functions, remote-write-receiver (DEPRECATED), extra-scrape-metrics, new-service-discovery-manager, auto-gomaxprocs, no-default-scrape-port, native-histograms, otlp-write-receiver. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details.").
Default("").StringsVar(&cfg.featureList)
promlogflag.AddFlags(a, &cfg.promlogConfig)


@ -16,6 +16,7 @@ package main
import (
"bufio"
"context"
"errors"
"fmt"
"io"
"os"
@ -643,10 +644,15 @@ func analyzeCompaction(ctx context.Context, block tsdb.BlockReader, indexr tsdb.
for _, chk := range chks {
// Load the actual data of the chunk.
chk, err := chunkr.Chunk(chk)
chk, iterable, err := chunkr.ChunkOrIterable(chk)
if err != nil {
return err
}
// Chunks within blocks should not need to be re-written, so an
// iterable is not expected to be returned from the chunk reader.
if iterable != nil {
return errors.New("ChunkOrIterable should not return an iterable when reading a block")
}
switch chk.Encoding() {
case chunkenc.EncXOR:
floatChunkSamplesCount = append(floatChunkSamplesCount, chk.NumSamples())


@ -158,6 +158,7 @@ var (
HonorLabels: false,
HonorTimestamps: true,
HTTPClientConfig: config.DefaultHTTPClientConfig,
EnableCompression: true,
}
// DefaultAlertmanagerConfig is the default alertmanager configuration.
@ -582,6 +583,8 @@ type ScrapeConfig struct {
MetricsPath string `yaml:"metrics_path,omitempty"`
// The URL scheme with which to fetch metrics from targets.
Scheme string `yaml:"scheme,omitempty"`
// Indicator whether to request compressed response from the target.
EnableCompression bool `yaml:"enable_compression"`
// An uncompressed response body larger than this many bytes will cause the
// scrape to fail. 0 means no limit.
BodySizeLimit units.Base2Bytes `yaml:"body_size_limit,omitempty"`


@ -186,6 +186,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -288,6 +289,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(50 * time.Second),
ScrapeTimeout: model.Duration(5 * time.Second),
EnableCompression: true,
BodySizeLimit: 10 * units.MiB,
SampleLimit: 1000,
TargetLimit: 35,
@ -384,6 +386,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -438,6 +441,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: model.Duration(10 * time.Second),
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -470,6 +474,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -508,6 +513,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -546,6 +552,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -573,6 +580,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -609,6 +617,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -642,6 +651,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -682,6 +692,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -712,6 +723,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -745,6 +757,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -771,6 +784,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -800,6 +814,7 @@ var expectedConf = &Config{
HonorTimestamps: false,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -829,6 +844,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -858,6 +874,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -884,6 +901,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -918,6 +936,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -951,6 +970,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -980,6 +1000,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -1009,6 +1030,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -1042,6 +1064,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -1078,6 +1101,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -1133,6 +1157,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -1159,6 +1184,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -1196,6 +1222,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -1239,6 +1266,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -1273,6 +1301,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -1301,6 +1330,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -1332,6 +1362,7 @@ var expectedConf = &Config{
HonorTimestamps: true,
ScrapeInterval: model.Duration(15 * time.Second),
ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout,
EnableCompression: true,
BodySizeLimit: globBodySizeLimit,
SampleLimit: globSampleLimit,
TargetLimit: globTargetLimit,
@ -2060,9 +2091,10 @@ func TestGetScrapeConfigs(t *testing.T) {
ScrapeTimeout: scrapeTimeout,
ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols,
MetricsPath: "/metrics",
Scheme: "http",
HTTPClientConfig: config.DefaultHTTPClientConfig,
MetricsPath: "/metrics",
Scheme: "http",
EnableCompression: true,
HTTPClientConfig: config.DefaultHTTPClientConfig,
ServiceDiscoveryConfigs: discovery.Configs{
discovery.StaticConfig{
{
@ -2118,6 +2150,8 @@ func TestGetScrapeConfigs(t *testing.T) {
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
EnableCompression: true,
HTTPClientConfig: config.HTTPClientConfig{
TLSConfig: config.TLSConfig{
CertFile: filepath.FromSlash("testdata/scrape_configs/valid_cert_file"),
@ -2158,6 +2192,8 @@ func TestGetScrapeConfigs(t *testing.T) {
MetricsPath: DefaultScrapeConfig.MetricsPath,
Scheme: DefaultScrapeConfig.Scheme,
EnableCompression: true,
ServiceDiscoveryConfigs: discovery.Configs{
&vultr.SDConfig{
HTTPClientConfig: config.HTTPClientConfig{
@ -2210,3 +2246,16 @@ func kubernetesSDHostURL() config.URL {
tURL, _ := url.Parse("https://localhost:1234")
return config.URL{URL: tURL}
}
func TestScrapeConfigDisableCompression(t *testing.T) {
want, err := LoadFile("testdata/scrape_config_disable_compression.good.yml", false, false, log.NewNopLogger())
require.NoError(t, err)
out, err := yaml.Marshal(want)
require.NoError(t, err)
got := &Config{}
require.NoError(t, yaml.UnmarshalStrict(out, got))
require.Equal(t, false, got.ScrapeConfigs[0].EnableCompression)
}


@ -0,0 +1,5 @@
scrape_configs:
- job_name: prometheus
static_configs:
- targets: ['localhost:8080']
enable_compression: false


@ -17,6 +17,7 @@ import (
"context"
"errors"
"fmt"
"math/rand"
"net"
"net/http"
"strings"
@ -30,10 +31,13 @@ import (
"github.com/Azure/azure-sdk-for-go/sdk/azidentity"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v4"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v2"
cache "github.com/Code-Hex/go-generics-cache"
"github.com/Code-Hex/go-generics-cache/policy/lru"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
config_util "github.com/prometheus/common/config"
"github.com/prometheus/common/model"
"github.com/prometheus/common/version"
@ -80,6 +84,11 @@ var (
Name: "prometheus_sd_azure_failures_total",
Help: "Number of Azure service discovery refresh failures.",
})
cacheHitCount = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "prometheus_sd_azure_cache_hit_total",
Help: "Number of cache hit during refresh.",
})
)
var environments = map[string]cloud.Configuration{
@ -105,6 +114,7 @@ func CloudConfigurationFromName(name string) (cloud.Configuration, error) {
func init() {
discovery.RegisterConfig(&SDConfig{})
prometheus.MustRegister(failuresCount)
prometheus.MustRegister(cacheHitCount)
}
// SDConfig is the configuration for Azure based service discovery.
@ -145,7 +155,6 @@ func (c *SDConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
if err != nil {
return err
}
if err = validateAuthParam(c.SubscriptionID, "subscription_id"); err != nil {
return err
}
@ -174,6 +183,7 @@ type Discovery struct {
logger log.Logger
cfg *SDConfig
port int
cache *cache.Cache[string, *armnetwork.Interface]
}
// NewDiscovery returns a new AzureDiscovery which periodically refreshes its targets.
@ -181,17 +191,21 @@ func NewDiscovery(cfg *SDConfig, logger log.Logger) *Discovery {
if logger == nil {
logger = log.NewNopLogger()
}
l := cache.New(cache.AsLRU[string, *armnetwork.Interface](lru.WithCapacity(5000)))
d := &Discovery{
cfg: cfg,
port: cfg.Port,
logger: logger,
cache: l,
}
d.Discovery = refresh.NewDiscovery(
logger,
"azure",
time.Duration(cfg.RefreshInterval),
d.refresh,
)
return d
}
@ -385,15 +399,22 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
// Get the IP address information via separate call to the network provider.
for _, nicID := range vm.NetworkInterfaces {
networkInterface, err := client.getNetworkInterfaceByID(ctx, nicID)
if err != nil {
if errors.Is(err, errorNotFound) {
level.Warn(d.logger).Log("msg", "Network interface does not exist", "name", nicID, "err", err)
} else {
ch <- target{labelSet: nil, err: err}
var networkInterface *armnetwork.Interface
if v, ok := d.getFromCache(nicID); ok {
networkInterface = v
cacheHitCount.Add(1)
} else {
networkInterface, err = client.getNetworkInterfaceByID(ctx, nicID)
if err != nil {
if errors.Is(err, errorNotFound) {
level.Warn(d.logger).Log("msg", "Network interface does not exist", "name", nicID, "err", err)
} else {
ch <- target{labelSet: nil, err: err}
}
// Get out of this routine because we cannot continue without a network interface.
return
}
// Get out of this routine because we cannot continue without a network interface.
return
d.addToCache(nicID, networkInterface)
}
if networkInterface.Properties == nil {
@ -628,3 +649,19 @@ func (client *azureClient) getNetworkInterfaceByID(ctx context.Context, networkI
return &resp.Interface, nil
}
// addToCache will add the network interface information for the specified nicID.
func (d *Discovery) addToCache(nicID string, netInt *armnetwork.Interface) {
random := rand.Int63n(int64(time.Duration(d.cfg.RefreshInterval * 3).Seconds()))
rs := time.Duration(random) * time.Second
exptime := time.Duration(d.cfg.RefreshInterval*10) + rs
d.cache.Set(nicID, netInt, cache.WithExpiration(exptime))
level.Debug(d.logger).Log("msg", "Adding nic", "nic", nicID, "time", exptime.Seconds())
}
// getFromCache will get the network Interface for the specified nicID
// If the cache is disabled nothing will happen.
func (d *Discovery) getFromCache(nicID string) (*armnetwork.Interface, bool) {
net, found := d.cache.Get(nicID)
return net, found
}
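
The expiration arithmetic in addToCache lends itself to a quick sketch (hedged: the 5-minute refresh interval below is an illustrative value, and cacheExpiration is not a function in this diff): entries live for ten refresh intervals plus up to three more of random jitter, so cached interfaces do not all expire in the same refresh cycle.

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// cacheExpiration mirrors the arithmetic in addToCache above: a base lifetime
// of ten refresh intervals, plus a random jitter of up to three more refresh
// intervals (in whole seconds). Illustrative sketch only, not repo code.
func cacheExpiration(refreshInterval time.Duration) time.Duration {
	jitter := rand.Int63n(int64((refreshInterval * 3).Seconds()))
	return refreshInterval*10 + time.Duration(jitter)*time.Second
}

func main() {
	// With a hypothetical 5m refresh interval, entries expire somewhere
	// between 50m and 65m after being cached.
	fmt.Println(cacheExpiration(5 * time.Minute))
}
```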


@ -14,10 +14,10 @@
package ionos
import (
"errors"
"time"
"github.com/go-kit/log"
"github.com/pkg/errors"
"github.com/prometheus/common/config"
"github.com/prometheus/common/model"


@ -51,6 +51,7 @@ const (
linodeLabelStatus = linodeLabel + "status"
linodeLabelTags = linodeLabel + "tags"
linodeLabelGroup = linodeLabel + "group"
linodeLabelGPUs = linodeLabel + "gpus"
linodeLabelHypervisor = linodeLabel + "hypervisor"
linodeLabelBackups = linodeLabel + "backups"
linodeLabelSpecsDiskBytes = linodeLabel + "specs_disk_bytes"
@ -302,6 +303,7 @@ func (d *Discovery) refreshData(ctx context.Context) ([]*targetgroup.Group, erro
linodeLabelType: model.LabelValue(instance.Type),
linodeLabelStatus: model.LabelValue(instance.Status),
linodeLabelGroup: model.LabelValue(instance.Group),
linodeLabelGPUs: model.LabelValue(fmt.Sprintf("%d", instance.Specs.GPUs)),
linodeLabelHypervisor: model.LabelValue(instance.Hypervisor),
linodeLabelBackups: model.LabelValue(backupsStatus),
linodeLabelSpecsDiskBytes: model.LabelValue(fmt.Sprintf("%d", int64(instance.Specs.Disk)<<20)),


@ -85,6 +85,7 @@ func TestLinodeSDRefresh(t *testing.T) {
"__meta_linode_status": model.LabelValue("running"),
"__meta_linode_tags": model.LabelValue(",monitoring,"),
"__meta_linode_group": model.LabelValue(""),
"__meta_linode_gpus": model.LabelValue("0"),
"__meta_linode_hypervisor": model.LabelValue("kvm"),
"__meta_linode_backups": model.LabelValue("disabled"),
"__meta_linode_specs_disk_bytes": model.LabelValue("85899345920"),
@ -109,6 +110,7 @@ func TestLinodeSDRefresh(t *testing.T) {
"__meta_linode_status": model.LabelValue("running"),
"__meta_linode_tags": model.LabelValue(",monitoring,"),
"__meta_linode_group": model.LabelValue(""),
"__meta_linode_gpus": model.LabelValue("0"),
"__meta_linode_hypervisor": model.LabelValue("kvm"),
"__meta_linode_backups": model.LabelValue("disabled"),
"__meta_linode_specs_disk_bytes": model.LabelValue("85899345920"),
@ -132,6 +134,7 @@ func TestLinodeSDRefresh(t *testing.T) {
"__meta_linode_status": model.LabelValue("running"),
"__meta_linode_tags": model.LabelValue(",monitoring,"),
"__meta_linode_group": model.LabelValue(""),
"__meta_linode_gpus": model.LabelValue("0"),
"__meta_linode_hypervisor": model.LabelValue("kvm"),
"__meta_linode_backups": model.LabelValue("disabled"),
"__meta_linode_specs_disk_bytes": model.LabelValue("53687091200"),
@ -155,6 +158,7 @@ func TestLinodeSDRefresh(t *testing.T) {
"__meta_linode_status": model.LabelValue("running"),
"__meta_linode_tags": model.LabelValue(",monitoring,"),
"__meta_linode_group": model.LabelValue(""),
"__meta_linode_gpus": model.LabelValue("0"),
"__meta_linode_hypervisor": model.LabelValue("kvm"),
"__meta_linode_backups": model.LabelValue("disabled"),
"__meta_linode_specs_disk_bytes": model.LabelValue("26843545600"),


@ -52,7 +52,7 @@ The Prometheus monitoring server
| <code class="text-nowrap">--query.timeout</code> | Maximum time a query may take before being aborted. Use with server mode only. | `2m` |
| <code class="text-nowrap">--query.max-concurrency</code> | Maximum number of queries executed concurrently. Use with server mode only. | `20` |
| <code class="text-nowrap">--query.max-samples</code> | Maximum number of samples a single query can load into memory. Note that queries will fail if they try to load more samples than this into memory, so this also limits the number of samples a query can return. Use with server mode only. | `50000000` |
| <code class="text-nowrap">--enable-feature</code> | Comma separated feature names to enable. Valid options: agent, exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-at-modifier, promql-negative-offset, promql-per-step-stats, remote-write-receiver (DEPRECATED), extra-scrape-metrics, new-service-discovery-manager, auto-gomaxprocs, no-default-scrape-port, native-histograms, otlp-write-receiver. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | |
| <code class="text-nowrap">--enable-feature</code> | Comma separated feature names to enable. Valid options: agent, exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-at-modifier, promql-negative-offset, promql-per-step-stats, promql-experimental-functions, remote-write-receiver (DEPRECATED), extra-scrape-metrics, new-service-discovery-manager, auto-gomaxprocs, no-default-scrape-port, native-histograms, otlp-write-receiver. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | |
| <code class="text-nowrap">--log.level</code> | Only log messages with the given severity or above. One of: [debug, info, warn, error] | `info` |
| <code class="text-nowrap">--log.format</code> | Output format of log messages. One of: [logfmt, json] | `logfmt` |


@ -37,7 +37,7 @@ Generic placeholders are defined as follows:
* `<float>`: a floating-point number
* `<host>`: a valid string consisting of a hostname or IP followed by an optional port number
* `<int>`: an integer value
* `<labelname>`: a string matching the regular expression `[a-zA-Z_][a-zA-Z0-9_]*`
* `<labelname>`: a string matching the regular expression `[a-zA-Z_][a-zA-Z0-9_]*`. Any other unsupported character in the source label should be converted to an underscore. For example, the label `app.kubernetes.io/name` should be written as `app_kubernetes_io_name` (see the sketch after this list).
* `<labelvalue>`: a string of unicode characters
* `<path>`: a valid URL path
* `<scheme>`: a string that can take the values `http` or `https`
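
For illustration, a minimal sketch of that conversion rule (hedged: sanitizeLabelName and the regular expression below are illustrative, not the exact helper Prometheus uses internally):

```go
package main

import (
	"fmt"
	"regexp"
)

// invalidLabelChars matches anything outside the allowed label-name alphabet.
var invalidLabelChars = regexp.MustCompile(`[^a-zA-Z0-9_]`)

// sanitizeLabelName replaces every unsupported character with an underscore,
// matching the documented rule above. Illustrative only.
func sanitizeLabelName(name string) string {
	return invalidLabelChars.ReplaceAllString(name, "_")
}

func main() {
	fmt.Println(sanitizeLabelName("app.kubernetes.io/name")) // app_kubernetes_io_name
}
```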
@ -237,6 +237,10 @@ job_name: <job_name>
params:
[ <string>: [<string>, ...] ]
# If enable_compression is set to "false", Prometheus will request uncompressed
# response from the scraped target.
[ enable_compression: <boolean> | default = true ]
# Sets the `Authorization` header on every scrape request with the
# configured username and password.
# password and password_file are mutually exclusive.
@ -844,12 +848,12 @@ Available meta labels:
* `__meta_docker_container_id`: the id of the container
* `__meta_docker_container_name`: the name of the container
* `__meta_docker_container_network_mode`: the network mode of the container
* `__meta_docker_container_label_<labelname>`: each label of the container
* `__meta_docker_container_label_<labelname>`: each label of the container, with any unsupported characters converted to an underscore
* `__meta_docker_network_id`: the ID of the network
* `__meta_docker_network_name`: the name of the network
* `__meta_docker_network_ingress`: whether the network is ingress
* `__meta_docker_network_internal`: whether the network is internal
* `__meta_docker_network_label_<labelname>`: each label of the network
* `__meta_docker_network_label_<labelname>`: each label of the network, with any unsupported characters converted to an underscore
* `__meta_docker_network_scope`: the scope of the network
* `__meta_docker_network_ip`: the IP of the container in this network
* `__meta_docker_port_private`: the port on the container
@ -960,7 +964,7 @@ Available meta labels:
* `__meta_dockerswarm_service_mode`: the mode of the service
* `__meta_dockerswarm_service_endpoint_port_name`: the name of the endpoint port, if available
* `__meta_dockerswarm_service_endpoint_port_publish_mode`: the publish mode of the endpoint port
* `__meta_dockerswarm_service_label_<labelname>`: each label of the service
* `__meta_dockerswarm_service_label_<labelname>`: each label of the service, with any unsupported characters converted to an underscore
* `__meta_dockerswarm_service_task_container_hostname`: the container hostname of the target, if available
* `__meta_dockerswarm_service_task_container_image`: the container image of the target
* `__meta_dockerswarm_service_updating_status`: the status of the service, if available
@ -968,7 +972,7 @@ Available meta labels:
* `__meta_dockerswarm_network_name`: the name of the network
* `__meta_dockerswarm_network_ingress`: whether the network is ingress
* `__meta_dockerswarm_network_internal`: whether the network is internal
* `__meta_dockerswarm_network_label_<labelname>`: each label of the network
* `__meta_dockerswarm_network_label_<labelname>`: each label of the network, with any unsupported characters converted to an underscore
* `__meta_dockerswarm_network_scope`: the scope of the network
#### `tasks`
@ -980,7 +984,7 @@ created using the `port` parameter defined in the SD configuration.
Available meta labels:
* `__meta_dockerswarm_container_label_<labelname>`: each label of the container
* `__meta_dockerswarm_container_label_<labelname>`: each label of the container, with any unsupported characters converted to an underscore
* `__meta_dockerswarm_task_id`: the id of the task
* `__meta_dockerswarm_task_container_id`: the container id of the task
* `__meta_dockerswarm_task_desired_state`: the desired state of the task
@ -990,19 +994,19 @@ Available meta labels:
* `__meta_dockerswarm_service_id`: the id of the service
* `__meta_dockerswarm_service_name`: the name of the service
* `__meta_dockerswarm_service_mode`: the mode of the service
* `__meta_dockerswarm_service_label_<labelname>`: each label of the service
* `__meta_dockerswarm_service_label_<labelname>`: each label of the service, with any unsupported characters converted to an underscore
* `__meta_dockerswarm_network_id`: the ID of the network
* `__meta_dockerswarm_network_name`: the name of the network
* `__meta_dockerswarm_network_ingress`: whether the network is ingress
* `__meta_dockerswarm_network_internal`: whether the network is internal
* `__meta_dockerswarm_network_label_<labelname>`: each label of the network
* `__meta_dockerswarm_network_label`: each label of the network
* `__meta_dockerswarm_network_label_<labelname>`: each label of the network, with any unsupported characters converted to an underscore
* `__meta_dockerswarm_network_label`: each label of the network, with any unsupported characters converted to an underscore
* `__meta_dockerswarm_network_scope`: the scope of the network
* `__meta_dockerswarm_node_id`: the ID of the node
* `__meta_dockerswarm_node_hostname`: the hostname of the node
* `__meta_dockerswarm_node_address`: the address of the node
* `__meta_dockerswarm_node_availability`: the availability of the node
* `__meta_dockerswarm_node_label_<labelname>`: each label of the node
* `__meta_dockerswarm_node_label_<labelname>`: each label of the node, with any unsupported characters converted to an underscore
* `__meta_dockerswarm_node_platform_architecture`: the architecture of the node
* `__meta_dockerswarm_node_platform_os`: the operating system of the node
* `__meta_dockerswarm_node_role`: the role of the node
@ -1022,7 +1026,7 @@ Available meta labels:
* `__meta_dockerswarm_node_engine_version`: the version of the node engine
* `__meta_dockerswarm_node_hostname`: the hostname of the node
* `__meta_dockerswarm_node_id`: the ID of the node
* `__meta_dockerswarm_node_label_<labelname>`: each label of the node
* `__meta_dockerswarm_node_label_<labelname>`: each label of the node, with any unsupported characters converted to an underscore
* `__meta_dockerswarm_node_manager_address`: the address of the manager component of the node
* `__meta_dockerswarm_node_manager_leader`: the leadership status of the manager component of the node (true or false)
* `__meta_dockerswarm_node_manager_reachability`: the reachability of the manager component of the node
@ -1611,7 +1615,7 @@ The following meta labels are available on targets during [relabeling](#relabel_
* `__meta_gce_instance_id`: the numeric id of the instance
* `__meta_gce_instance_name`: the name of the instance
* `__meta_gce_label_<labelname>`: each GCE label of the instance
* `__meta_gce_label_<labelname>`: each GCE label of the instance, with any unsupported characters converted to an underscore
* `__meta_gce_machine_type`: full or partial URL of the machine type of the instance
* `__meta_gce_metadata_<name>`: each metadata item of the instance
* `__meta_gce_network`: the network URL of the instance
@ -1695,8 +1699,8 @@ The labels below are only available for targets with `role` set to `hcloud`:
* `__meta_hetzner_hcloud_memory_size_gb`: the amount of memory of the server (in GB)
* `__meta_hetzner_hcloud_disk_size_gb`: the disk size of the server (in GB)
* `__meta_hetzner_hcloud_private_ipv4_<networkname>`: the private ipv4 address of the server within a given network
* `__meta_hetzner_hcloud_label_<labelname>`: each label of the server
* `__meta_hetzner_hcloud_labelpresent_<labelname>`: `true` for each label of the server
* `__meta_hetzner_hcloud_label_<labelname>`: each label of the server, with any unsupported characters converted to an underscore
* `__meta_hetzner_hcloud_labelpresent_<labelname>`: `true` for each label of the server, with any unsupported characters converted to an underscore
The labels below are only available for targets with `role` set to `robot`:
@ -1963,8 +1967,8 @@ Available meta labels:
* `__meta_kubernetes_node_name`: The name of the node object.
* `__meta_kubernetes_node_provider_id`: The cloud provider's name for the node object.
* `__meta_kubernetes_node_label_<labelname>`: Each label from the node object.
* `__meta_kubernetes_node_labelpresent_<labelname>`: `true` for each label from the node object.
* `__meta_kubernetes_node_label_<labelname>`: Each label from the node object, with any unsupported characters converted to an underscore.
* `__meta_kubernetes_node_labelpresent_<labelname>`: `true` for each label from the node object, with any unsupported characters converted to an underscore.
* `__meta_kubernetes_node_annotation_<annotationname>`: Each annotation from the node object.
* `__meta_kubernetes_node_annotationpresent_<annotationname>`: `true` for each annotation from the node object.
* `__meta_kubernetes_node_address_<address_type>`: The first address for each node address type, if it exists.
@ -1987,8 +1991,8 @@ Available meta labels:
* `__meta_kubernetes_service_cluster_ip`: The cluster IP address of the service. (Does not apply to services of type ExternalName)
* `__meta_kubernetes_service_loadbalancer_ip`: The IP address of the loadbalancer. (Applies to services of type LoadBalancer)
* `__meta_kubernetes_service_external_name`: The DNS name of the service. (Applies to services of type ExternalName)
* `__meta_kubernetes_service_label_<labelname>`: Each label from the service object.
* `__meta_kubernetes_service_labelpresent_<labelname>`: `true` for each label of the service object.
* `__meta_kubernetes_service_label_<labelname>`: Each label from the service object, with any unsupported characters converted to an underscore.
* `__meta_kubernetes_service_labelpresent_<labelname>`: `true` for each label of the service object, with any unsupported characters converted to an underscore.
* `__meta_kubernetes_service_name`: The name of the service object.
* `__meta_kubernetes_service_port_name`: Name of the service port for the target.
* `__meta_kubernetes_service_port_number`: Number of the service port for the target.
@ -2006,8 +2010,8 @@ Available meta labels:
* `__meta_kubernetes_namespace`: The namespace of the pod object.
* `__meta_kubernetes_pod_name`: The name of the pod object.
* `__meta_kubernetes_pod_ip`: The pod IP of the pod object.
* `__meta_kubernetes_pod_label_<labelname>`: Each label from the pod object.
* `__meta_kubernetes_pod_labelpresent_<labelname>`: `true` for each label from the pod object.
* `__meta_kubernetes_pod_label_<labelname>`: Each label from the pod object, with any unsupported characters converted to an underscore.
* `__meta_kubernetes_pod_labelpresent_<labelname>`: `true` for each label from the pod object, with any unsupported characters converted to an underscore.
* `__meta_kubernetes_pod_annotation_<annotationname>`: Each annotation from the pod object.
* `__meta_kubernetes_pod_annotationpresent_<annotationname>`: `true` for each annotation from the pod object.
* `__meta_kubernetes_pod_container_init`: `true` if the container is an [InitContainer](https://kubernetes.io/docs/concepts/workloads/pods/init-containers/)
@ -2036,8 +2040,8 @@ Available meta labels:
* `__meta_kubernetes_namespace`: The namespace of the endpoints object.
* `__meta_kubernetes_endpoints_name`: The names of the endpoints object.
* `__meta_kubernetes_endpoints_label_<labelname>`: Each label from the endpoints object.
* `__meta_kubernetes_endpoints_labelpresent_<labelname>`: `true` for each label from the endpoints object.
* `__meta_kubernetes_endpoints_label_<labelname>`: Each label from the endpoints object, with any unsupported characters converted to an underscore.
* `__meta_kubernetes_endpoints_labelpresent_<labelname>`: `true` for each label from the endpoints object, with any unsupported characters converted to an underscore.
* `__meta_kubernetes_endpoints_annotation_<annotationname>`: Each annotation from the endpoints object.
* `__meta_kubernetes_endpoints_annotationpresent_<annotationname>`: `true` for each annotation from the endpoints object.
* For all targets discovered directly from the endpoints list (those not additionally inferred
@ -2062,8 +2066,8 @@ Available meta labels:
* `__meta_kubernetes_namespace`: The namespace of the endpoints object.
* `__meta_kubernetes_endpointslice_name`: The name of endpointslice object.
* `__meta_kubernetes_endpointslice_label_<labelname>`: Each label from the endpointslice object.
* `__meta_kubernetes_endpointslice_labelpresent_<labelname>`: `true` for each label from the endpointslice object.
* `__meta_kubernetes_endpointslice_label_<labelname>`: Each label from the endpointslice object, with any unsupported characters converted to an underscore.
* `__meta_kubernetes_endpointslice_labelpresent_<labelname>`: `true` for each label from the endpointslice object, with any unsupported characters converted to an underscore.
* `__meta_kubernetes_endpointslice_annotation_<annotationname>`: Each annotation from the endpointslice object.
* `__meta_kubernetes_endpointslice_annotationpresent_<annotationname>`: `true` for each annotation from the endpointslice object.
* For all targets discovered directly from the endpointslice list (those not additionally inferred
@ -2092,8 +2096,8 @@ Available meta labels:
* `__meta_kubernetes_namespace`: The namespace of the ingress object.
* `__meta_kubernetes_ingress_name`: The name of the ingress object.
* `__meta_kubernetes_ingress_label_<labelname>`: Each label from the ingress object.
* `__meta_kubernetes_ingress_labelpresent_<labelname>`: `true` for each label from the ingress object.
* `__meta_kubernetes_ingress_label_<labelname>`: Each label from the ingress object, with any unsupported characters converted to an underscore.
* `__meta_kubernetes_ingress_labelpresent_<labelname>`: `true` for each label from the ingress object, with any unsupported characters converted to an underscore.
* `__meta_kubernetes_ingress_annotation_<annotationname>`: Each annotation from the ingress object.
* `__meta_kubernetes_ingress_annotationpresent_<annotationname>`: `true` for each annotation from the ingress object.
* `__meta_kubernetes_ingress_class_name`: Class name from ingress spec, if present.
@ -2483,9 +2487,9 @@ The following meta labels are available on targets during [relabeling](#relabel_
* `__meta_marathon_app`: the name of the app (with slashes replaced by dashes)
* `__meta_marathon_image`: the name of the Docker image used (if available)
* `__meta_marathon_task`: the ID of the Mesos task
* `__meta_marathon_app_label_<labelname>`: any Marathon labels attached to the app
* `__meta_marathon_port_definition_label_<labelname>`: the port definition labels
* `__meta_marathon_port_mapping_label_<labelname>`: the port mapping labels
* `__meta_marathon_app_label_<labelname>`: any Marathon labels attached to the app, with any unsupported characters converted to an underscore
* `__meta_marathon_port_definition_label_<labelname>`: the port definition labels, with any unsupported characters converted to an underscore
* `__meta_marathon_port_mapping_label_<labelname>`: the port mapping labels, with any unsupported characters converted to an underscore
* `__meta_marathon_port_index`: the port index number (e.g. `1` for `PORT1`)
See below for the configuration options for Marathon discovery:


@ -187,4 +187,11 @@ This should **only** be applied to metrics that currently produce such labels.
The OTLP receiver allows Prometheus to accept [OpenTelemetry](https://opentelemetry.io/) metrics writes.
Prometheus is best used as a Pull based system, and staleness, `up` metric, and other Pull enabled features
won't work when you push OTLP metrics.
## Experimental PromQL functions
`--enable-feature=promql-experimental-functions`
Enables PromQL functions that are considered experimental and whose name or
semantics could change.


@ -323,6 +323,24 @@ a histogram.
You can use `histogram_quantile(1, v instant-vector)` to get the estimated maximum value stored in
a histogram.
Buckets of classic histograms are cumulative. Therefore, the following should always be the case:
- The counts in the buckets are monotonically increasing (strictly non-decreasing).
- A lack of observations between the upper limits of two consecutive buckets results in equal counts
in those two buckets.
However, floating point precision issues (e.g. small discrepancies introduced by computing buckets
with `sum(rate(...))`) or invalid data might violate these assumptions. In that case,
`histogram_quantile` would be unable to return meaningful results. To mitigate the issue,
`histogram_quantile` assumes that tiny relative differences between consecutive buckets are happening
because of floating point precision errors and ignores them. (The threshold to ignore a difference
between two buckets is a trillionth (1e-12) of the sum of both buckets.) Furthermore, if there are
non-monotonic bucket counts even after this adjustment, they are increased to the value of the
previous buckets to enforce monotonicity. The latter is evidence for an actual issue with the input
data and is therefore flagged with an informational annotation reading `input to histogram_quantile
needed to be fixed for monotonicity`. If you encounter this annotation, you should find and remove
the source of the invalid data.
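
A minimal sketch of the behaviour described above (hedged: enforceMonotonicity and its exact control flow are illustrative, not the actual PromQL implementation): relative differences below 1e-12 of the sum of two consecutive buckets are smoothed away silently, while larger decreases are raised to the previous count and reported so the caller can emit the annotation.

```go
package main

import "fmt"

// enforceMonotonicity applies the two-step fix described above to a slice of
// cumulative classic-histogram bucket counts. It returns true if a decrease
// larger than the float-precision tolerance had to be corrected, i.e. the
// case that warrants the informational annotation. Illustrative sketch only.
func enforceMonotonicity(counts []float64) (needsAnnotation bool) {
	const relativeTolerance = 1e-12
	for i := 1; i < len(counts); i++ {
		prev, cur := counts[i-1], counts[i]
		if cur >= prev {
			continue
		}
		if prev-cur > relativeTolerance*(prev+cur) {
			// A genuine violation of cumulativity: flag it for the annotation.
			needsAnnotation = true
		}
		// Either way, enforce monotonicity by raising the count.
		counts[i] = prev
	}
	return needsAnnotation
}

func main() {
	buckets := []float64{10, 9.9999999999999, 12, 11}
	fmt.Println(enforceMonotonicity(buckets), buckets) // true [10 10 12 12]
}
```
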
## `histogram_stddev()` and `histogram_stdvar()`
_Both functions only act on native histograms, which are an experimental


@ -117,7 +117,7 @@ local template = grafana.template;
(
prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}
-
ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"} != 0)
ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"} != 0)
)
|||,
legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}',
@ -134,7 +134,7 @@ local template = grafana.template;
clamp_min(
rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])
-
ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])
ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])
, 0)
|||,
legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}',
@ -151,9 +151,9 @@ local template = grafana.template;
rate(
prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m])
-
ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]))
ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]))
-
(rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance"}[5m]))
(rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]))
|||,
legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
));
@ -166,7 +166,7 @@ local template = grafana.template;
min_span=6,
)
.addTarget(prometheus.target(
'prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}',
'prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
));
@ -177,7 +177,7 @@ local template = grafana.template;
span=4,
)
.addTarget(prometheus.target(
'prometheus_remote_storage_shards_max{cluster=~"$cluster", instance=~"$instance"}',
'prometheus_remote_storage_shards_max{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
));
@ -188,7 +188,7 @@ local template = grafana.template;
span=4,
)
.addTarget(prometheus.target(
'prometheus_remote_storage_shards_min{cluster=~"$cluster", instance=~"$instance"}',
'prometheus_remote_storage_shards_min{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
));
@ -199,7 +199,7 @@ local template = grafana.template;
span=4,
)
.addTarget(prometheus.target(
'prometheus_remote_storage_shards_desired{cluster=~"$cluster", instance=~"$instance"}',
'prometheus_remote_storage_shards_desired{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
));
@ -210,7 +210,7 @@ local template = grafana.template;
span=6,
)
.addTarget(prometheus.target(
'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance"}',
'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
));
@ -222,7 +222,7 @@ local template = grafana.template;
span=6,
)
.addTarget(prometheus.target(
'prometheus_remote_storage_pending_samples{cluster=~"$cluster", instance=~"$instance"} or prometheus_remote_storage_samples_pending{cluster=~"$cluster", instance=~"$instance"}',
'prometheus_remote_storage_pending_samples{cluster=~"$cluster", instance=~"$instance", url=~"$url"} or prometheus_remote_storage_samples_pending{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
));
@ -257,7 +257,7 @@ local template = grafana.template;
span=3,
)
.addTarget(prometheus.target(
'rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
'rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])',
legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
));
@ -268,7 +268,7 @@ local template = grafana.template;
span=3,
)
.addTarget(prometheus.target(
'rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
'rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])',
legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
));
@ -279,7 +279,7 @@ local template = grafana.template;
span=3,
)
.addTarget(prometheus.target(
'rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
'rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])',
legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
));
@ -290,7 +290,7 @@ local template = grafana.template;
span=3,
)
.addTarget(prometheus.target(
'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance"}[5m])',
'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])',
legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
));

go.mod

@ -36,9 +36,9 @@ require (
github.com/hetznercloud/hcloud-go/v2 v2.4.0
github.com/ionos-cloud/sdk-go/v6 v6.1.9
github.com/json-iterator/go v1.1.12
github.com/klauspost/compress v1.17.1
github.com/klauspost/compress v1.17.2
github.com/kolo/xmlrpc v0.0.0-20220921171641-a4b6fa1dd06b
github.com/linode/linodego v1.23.0
github.com/linode/linodego v1.24.0
github.com/miekg/dns v1.1.56
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f
@ -57,6 +57,7 @@ require (
github.com/shurcooL/httpfs v0.0.0-20230704072500-f1e31cf0ba5c
github.com/stretchr/testify v1.8.4
github.com/vultr/govultr/v2 v2.17.2
go.opentelemetry.io/collector/featuregate v0.77.0
go.opentelemetry.io/collector/pdata v1.0.0-rcv0017
go.opentelemetry.io/collector/semconv v0.88.0
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.45.0
@ -70,22 +71,22 @@ require (
go.uber.org/automaxprocs v1.5.3
go.uber.org/goleak v1.2.1
go.uber.org/multierr v1.11.0
golang.org/x/exp v0.0.0-20231006140011-7918f672742d
golang.org/x/net v0.17.0
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa
golang.org/x/net v0.18.0
golang.org/x/oauth2 v0.13.0
golang.org/x/sync v0.4.0
golang.org/x/sys v0.13.0
golang.org/x/sync v0.5.0
golang.org/x/sys v0.14.0
golang.org/x/time v0.3.0
golang.org/x/tools v0.14.0
golang.org/x/tools v0.15.0
google.golang.org/api v0.147.0
google.golang.org/genproto/googleapis/api v0.0.0-20231012201019-e917dd12ba7a
google.golang.org/grpc v1.59.0
google.golang.org/protobuf v1.31.0
gopkg.in/yaml.v2 v2.4.0
gopkg.in/yaml.v3 v3.0.1
k8s.io/api v0.28.2
k8s.io/apimachinery v0.28.2
k8s.io/client-go v0.28.2
k8s.io/api v0.28.3
k8s.io/apimachinery v0.28.3
k8s.io/client-go v0.28.3
k8s.io/klog v1.0.0
k8s.io/klog/v2 v2.100.1
)
@ -112,6 +113,7 @@ require (
)
require (
github.com/Code-Hex/go-generics-cache v1.3.1
github.com/Microsoft/go-winio v0.6.1 // indirect
github.com/armon/go-metrics v0.4.1 // indirect
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
@ -138,7 +140,7 @@ require (
github.com/go-openapi/spec v0.20.9 // indirect
github.com/go-openapi/swag v0.22.4 // indirect
github.com/go-openapi/validate v0.22.1 // indirect
github.com/go-resty/resty/v2 v2.7.0 // indirect
github.com/go-resty/resty/v2 v2.10.0 // indirect
github.com/golang/glog v1.1.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.3 // indirect
@ -181,10 +183,10 @@ require (
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/otel/metric v1.19.0 // indirect
go.opentelemetry.io/proto/otlp v1.0.0 // indirect
golang.org/x/crypto v0.14.0 // indirect
golang.org/x/mod v0.13.0 // indirect
golang.org/x/term v0.13.0 // indirect
golang.org/x/text v0.13.0 // indirect
golang.org/x/crypto v0.15.0 // indirect
golang.org/x/mod v0.14.0 // indirect
golang.org/x/term v0.14.0 // indirect
golang.org/x/text v0.14.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect

go.sum

@ -54,6 +54,8 @@ github.com/AzureAD/microsoft-authentication-library-for-go v1.1.1 h1:WpB/QDNLpMw
github.com/AzureAD/microsoft-authentication-library-for-go v1.1.1/go.mod h1:wP83P5OoQ5p6ip3ScPr0BAq0BvuPAvacpEuSzyouqAI=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/Code-Hex/go-generics-cache v1.3.1 h1:i8rLwyhoyhaerr7JpjtYjJZUcCbWOdiYO3fZXLiEC4g=
github.com/Code-Hex/go-generics-cache v1.3.1/go.mod h1:qxcC9kRVrct9rHeiYpFWSoW1vxyillCVzX13KZG8dl4=
github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ=
github.com/DmitriyVTitov/size v1.5.0 h1:/PzqxYrOyOUX1BXj6J9OuVRVGe+66VL4D9FlUaW515g=
github.com/DmitriyVTitov/size v1.5.0/go.mod h1:le6rNI4CoLQV1b9gzp1+3d7hMAD/uu2QcJ+aYbNgiU0=
@ -244,8 +246,8 @@ github.com/go-openapi/swag v0.22.4 h1:QLMzNJnMGPRNDCbySlcj1x01tzU8/9LTTL9hZZZogB
github.com/go-openapi/swag v0.22.4/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
github.com/go-openapi/validate v0.22.1 h1:G+c2ub6q47kfX1sOBLwIQwzBVt8qmOAARyo/9Fqs9NU=
github.com/go-openapi/validate v0.22.1/go.mod h1:rjnrwK57VJ7A8xqfpAOEKRH8yQSGUriMu5/zuPSQ1hg=
github.com/go-resty/resty/v2 v2.7.0 h1:me+K9p3uhSmXtrBZ4k9jcEAfJmuC8IivWHwaLZwPrFY=
github.com/go-resty/resty/v2 v2.7.0/go.mod h1:9PWDzw47qPphMRFfhsyk0NnSgvluHcljSMVIq3w7q0I=
github.com/go-resty/resty/v2 v2.10.0 h1:Qla4W/+TMmv0fOeeRqzEpXPLfTUnR5HZ1+lGs+CkiCo=
github.com/go-resty/resty/v2 v2.10.0/go.mod h1:iiP/OpA0CkcL3IGt1O0+/SIItFUbkkyw5BGXiVdTu+A=
github.com/go-sql-driver/mysql v1.4.0/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w=
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
@ -487,8 +489,8 @@ github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvW
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
github.com/klauspost/compress v1.17.1 h1:NE3C767s2ak2bweCZo3+rdP4U/HoyVXLv/X9f2gPS5g=
github.com/klauspost/compress v1.17.1/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/klauspost/compress v1.17.2 h1:RlWWUY/Dr4fL8qk9YG7DTZ7PDgME2V4csBXA8L/ixi4=
github.com/klauspost/compress v1.17.2/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/kolo/xmlrpc v0.0.0-20220921171641-a4b6fa1dd06b h1:udzkj9S/zlT5X367kqJis0QP7YMxobob6zhzq6Yre00=
github.com/kolo/xmlrpc v0.0.0-20220921171641-a4b6fa1dd06b/go.mod h1:pcaDhQK0/NJZEvtCO0qQPPropqV0sJOJ6YW7X+9kRwM=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
@ -506,8 +508,8 @@ github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/lightstep/lightstep-tracer-common/golang/gogo v0.0.0-20190605223551-bc2310a04743/go.mod h1:qklhhLq1aX+mtWk9cPHPzaBjWImj5ULL6C7HFJtXQMM=
github.com/lightstep/lightstep-tracer-go v0.18.1/go.mod h1:jlF1pusYV4pidLvZ+XD0UBX0ZE6WURAspgAczcDHrL4=
github.com/linode/linodego v1.23.0 h1:s0ReCZtuN9Z1IoUN9w1RLeYO1dMZUGPwOQ/IBFsBHtU=
github.com/linode/linodego v1.23.0/go.mod h1:0U7wj/UQOqBNbKv1FYTXiBUXueR8DY4HvIotwE0ENgg=
github.com/linode/linodego v1.24.0 h1:zO+bMdTE6wPccqP7QIkbxAfACX7DjSX6DW9JE/qOKDQ=
github.com/linode/linodego v1.24.0/go.mod h1:cq/ty5BCEQnsO6OjMqD7Q03KCCyB8CNM5E3MNg0LV6M=
github.com/lyft/protoc-gen-validate v0.0.13/go.mod h1:XbGvPuh87YZc5TdIa2/I4pLk0QoUACkjt2znoq26NVQ=
github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
@ -768,6 +770,8 @@ go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
go.opentelemetry.io/collector/featuregate v0.77.0 h1:m1/IzaXoQh6SgF6CM80vrBOCf5zSJ2GVISfA27fYzGU=
go.opentelemetry.io/collector/featuregate v0.77.0/go.mod h1:/kVAsGUCyJXIDSgHftCN63QiwAEVHRLX2Kh/S+dqgHY=
go.opentelemetry.io/collector/pdata v1.0.0-rcv0017 h1:AgALhc2VenoA5l1DvTdg7mkzaBGqoTSuMkAtjsttBFo=
go.opentelemetry.io/collector/pdata v1.0.0-rcv0017/go.mod h1:Rv9fOclA5AtM/JGm0d4jBOIAo1+jBA13UT5Bx0ovXi4=
go.opentelemetry.io/collector/semconv v0.88.0 h1:8TVP4hYaUC87S6CCLKNoSxsUE0ChldE4vqotvNHHUnE=
@ -819,8 +823,9 @@ golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPh
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
golang.org/x/crypto v0.0.0-20220829220503-c86fa9a7ed90/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc=
golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=
golang.org/x/crypto v0.15.0 h1:frVn1TEaCEaZcn3Tmd7Y2b5KKPaZ+I32Q2OA3kYp5TA=
golang.org/x/crypto v0.15.0/go.mod h1:4ChreQoLWfG3xLDer1WdlH5NdlQ3+mwnQq1YTKY+72g=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
@ -831,8 +836,8 @@ golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u0
golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4=
golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM=
golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU=
golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI=
golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo=
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa h1:FRnLl4eNAQl8hwxVVC17teOw8kdjVDVAiFMtgUdTSRQ=
golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa/go.mod h1:zk2irFbV9DP96SEBUUAy67IdHUaZuSnrz1n472HUCLE=
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
@ -854,8 +859,9 @@ golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzB
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY=
golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.14.0 h1:dGoOF9QVLYng8IHTm7BAyWqCqSheQ5pYWGhzW00YJr0=
golang.org/x/mod v0.14.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@ -896,12 +902,14 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v
golang.org/x/net v0.0.0-20210410081132-afb366fc7cd1/go.mod h1:9tjilg8BloeKEkVJvy7fQ90B1CfIiPueXVOjqfkSzI8=
golang.org/x/net v0.0.0-20210421230115-4e50805a0758/go.mod h1:72T/g9IO56b78aLF+1Kcs5dz7/ng1VjMUvfKvpfy+jM=
golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20211029224645-99673261e6eb/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco=
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/net v0.18.0 h1:mIYleuAkSbHh0tCv7RvjL3F6ZVbLjq4+R7zbOn3Kokg=
golang.org/x/net v0.18.0/go.mod h1:/czyP5RqHAH4odGYxBJ1qz0+CE5WZ+2j1YgoEo8F2jQ=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
@ -923,8 +931,9 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.4.0 h1:zxkM55ReGkDlKSM+Fu41A+zmbZuaPVbGMzvvdUPznYQ=
golang.org/x/sync v0.4.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE=
golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@ -991,14 +1000,20 @@ golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20221010170243-090e33056c14/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.14.0 h1:Vz7Qs629MkJkGyHxUlRHizWJRG2j8fbQKjELVSNhy7Q=
golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
golang.org/x/term v0.14.0 h1:LGK9IlZ8T9jvdy6cTdfKUCltatMFOehAQo9SRC46UQ8=
golang.org/x/term v0.14.0/go.mod h1:TySc+nGkYR6qt8km8wUhuFRTVSMIX3XPR58y2lC8vww=
golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
@ -1010,8 +1025,10 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
@ -1071,8 +1088,9 @@ golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roY
golang.org/x/tools v0.0.0-20200825202427-b303f430e36d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.14.0 h1:jvNa2pY0M4r62jkRQ6RwEZZyPcymeL9XZMLBbV7U2nc=
golang.org/x/tools v0.14.0/go.mod h1:uYBEerGOWcJyEORxN+Ek8+TT266gXkNlHdJBwexUsBg=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/tools v0.15.0 h1:zdAyfUGbYmuVokhzVmghFl2ZJh5QhcfebBgmVPFYA+8=
golang.org/x/tools v0.15.0/go.mod h1:hpksKq4dtpQWS1uQ61JkdqWM3LscIS6Slf+VVkm+wQk=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
@ -1217,12 +1235,12 @@ honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWh
honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg=
honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
k8s.io/api v0.28.2 h1:9mpl5mOb6vXZvqbQmankOfPIGiudghwCoLl1EYfUZbw=
k8s.io/api v0.28.2/go.mod h1:RVnJBsjU8tcMq7C3iaRSGMeaKt2TWEUXcpIt/90fjEg=
k8s.io/apimachinery v0.28.2 h1:KCOJLrc6gu+wV1BYgwik4AF4vXOlVJPdiqn0yAWWwXQ=
k8s.io/apimachinery v0.28.2/go.mod h1:RdzF87y/ngqk9H4z3EL2Rppv5jj95vGS/HaFXrLDApU=
k8s.io/client-go v0.28.2 h1:DNoYI1vGq0slMBN/SWKMZMw0Rq+0EQW6/AK4v9+3VeY=
k8s.io/client-go v0.28.2/go.mod h1:sMkApowspLuc7omj1FOSUxSoqjr+d5Q0Yc0LOFnYFJY=
k8s.io/api v0.28.3 h1:Gj1HtbSdB4P08C8rs9AR94MfSGpRhJgsS+GF9V26xMM=
k8s.io/api v0.28.3/go.mod h1:MRCV/jr1dW87/qJnZ57U5Pak65LGmQVkKTzf3AtKFHc=
k8s.io/apimachinery v0.28.3 h1:B1wYx8txOaCQG0HmYF6nbpU8dg6HvA06x5tEffvOe7A=
k8s.io/apimachinery v0.28.3/go.mod h1:uQTKmIqs+rAYaq+DFaoD2X7pcjLOqbQX2AOiO0nIpb8=
k8s.io/client-go v0.28.3 h1:2OqNb72ZuTZPKCl+4gTKvqao0AMOl9f3o2ijbAj3LI4=
k8s.io/client-go v0.28.3/go.mod h1:LTykbBp9gsA7SwqirlCXBWtK0guzfhpoW4qSm7i9dxo=
k8s.io/kube-openapi v0.0.0-20230717233707-2695361300d9 h1:LyMgNKD2P8Wn1iAwQU5OhxCKlKJy0sHc+PcDwFB24dQ=
k8s.io/kube-openapi v0.0.0-20230717233707-2695361300d9/go.mod h1:wZK2AVp1uHCp4VamDVgBP2COHZjqD1T68Rf0CM3YjSM=
k8s.io/utils v0.0.0-20230711102312-30195339c3c7 h1:ZgnF1KZsYxWIifwSNZFZgNtWE89WI5yiP5WwlfDoIyc=

View file

@ -48,3 +48,18 @@ func (e Exemplar) Equals(e2 Exemplar) bool {
return e.Value == e2.Value
}
// Compare orders exemplars first by timestamp, then value, then labels.
func Compare(a, b Exemplar) int {
if a.Ts < b.Ts {
return -1
} else if a.Ts > b.Ts {
return 1
}
if a.Value < b.Value {
return -1
} else if a.Value > b.Value {
return 1
}
return labels.Compare(a.Labels, b.Labels)
}
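
A brief usage sketch (not part of this diff): Compare can be passed directly to slices.SortFunc to order a slice of exemplars; the package name and the helper sortExemplars are made up for illustration.

package exemplarsort

import (
	"golang.org/x/exp/slices"

	"github.com/prometheus/prometheus/model/exemplar"
)

// sortExemplars orders exemplars by timestamp, then value, then labels,
// exactly as Compare defines.
func sortExemplars(es []exemplar.Exemplar) {
	slices.SortFunc(es, exemplar.Compare)
}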

View file

@ -94,8 +94,8 @@ func (h *FloatHistogram) CopyToSchema(targetSchema int32) *FloatHistogram {
Sum: h.Sum,
}
c.PositiveSpans, c.PositiveBuckets = mergeToSchema(h.PositiveSpans, h.PositiveBuckets, h.Schema, targetSchema)
c.NegativeSpans, c.NegativeBuckets = mergeToSchema(h.NegativeSpans, h.NegativeBuckets, h.Schema, targetSchema)
c.PositiveSpans, c.PositiveBuckets = reduceResolution(h.PositiveSpans, h.PositiveBuckets, h.Schema, targetSchema, false)
c.NegativeSpans, c.NegativeBuckets = reduceResolution(h.NegativeSpans, h.NegativeBuckets, h.Schema, targetSchema, false)
return &c
}
@ -268,17 +268,23 @@ func (h *FloatHistogram) Add(other *FloatHistogram) *FloatHistogram {
h.Count += other.Count
h.Sum += other.Sum
otherPositiveSpans := other.PositiveSpans
otherPositiveBuckets := other.PositiveBuckets
otherNegativeSpans := other.NegativeSpans
otherNegativeBuckets := other.NegativeBuckets
if other.Schema != h.Schema {
otherPositiveSpans, otherPositiveBuckets = mergeToSchema(other.PositiveSpans, other.PositiveBuckets, other.Schema, h.Schema)
otherNegativeSpans, otherNegativeBuckets = mergeToSchema(other.NegativeSpans, other.NegativeBuckets, other.Schema, h.Schema)
var (
otherPositiveSpans = other.PositiveSpans
otherPositiveBuckets = other.PositiveBuckets
otherNegativeSpans = other.NegativeSpans
otherNegativeBuckets = other.NegativeBuckets
)
if other.Schema < h.Schema {
panic(fmt.Errorf("cannot add histogram with schema %d to %d", other.Schema, h.Schema))
} else if other.Schema > h.Schema {
otherPositiveSpans, otherPositiveBuckets = reduceResolution(otherPositiveSpans, otherPositiveBuckets, other.Schema, h.Schema, false)
otherNegativeSpans, otherNegativeBuckets = reduceResolution(otherNegativeSpans, otherNegativeBuckets, other.Schema, h.Schema, false)
}
h.PositiveSpans, h.PositiveBuckets = addBuckets(h.Schema, h.ZeroThreshold, false, h.PositiveSpans, h.PositiveBuckets, otherPositiveSpans, otherPositiveBuckets)
h.NegativeSpans, h.NegativeBuckets = addBuckets(h.Schema, h.ZeroThreshold, false, h.NegativeSpans, h.NegativeBuckets, otherNegativeSpans, otherNegativeBuckets)
return h
}
@ -289,17 +295,23 @@ func (h *FloatHistogram) Sub(other *FloatHistogram) *FloatHistogram {
h.Count -= other.Count
h.Sum -= other.Sum
otherPositiveSpans := other.PositiveSpans
otherPositiveBuckets := other.PositiveBuckets
otherNegativeSpans := other.NegativeSpans
otherNegativeBuckets := other.NegativeBuckets
if other.Schema != h.Schema {
otherPositiveSpans, otherPositiveBuckets = mergeToSchema(other.PositiveSpans, other.PositiveBuckets, other.Schema, h.Schema)
otherNegativeSpans, otherNegativeBuckets = mergeToSchema(other.NegativeSpans, other.NegativeBuckets, other.Schema, h.Schema)
var (
otherPositiveSpans = other.PositiveSpans
otherPositiveBuckets = other.PositiveBuckets
otherNegativeSpans = other.NegativeSpans
otherNegativeBuckets = other.NegativeBuckets
)
if other.Schema < h.Schema {
panic(fmt.Errorf("cannot subtract histogram with schema %d to %d", other.Schema, h.Schema))
} else if other.Schema > h.Schema {
otherPositiveSpans, otherPositiveBuckets = reduceResolution(otherPositiveSpans, otherPositiveBuckets, other.Schema, h.Schema, false)
otherNegativeSpans, otherNegativeBuckets = reduceResolution(otherNegativeSpans, otherNegativeBuckets, other.Schema, h.Schema, false)
}
h.PositiveSpans, h.PositiveBuckets = addBuckets(h.Schema, h.ZeroThreshold, true, h.PositiveSpans, h.PositiveBuckets, otherPositiveSpans, otherPositiveBuckets)
h.NegativeSpans, h.NegativeBuckets = addBuckets(h.Schema, h.ZeroThreshold, true, h.NegativeSpans, h.NegativeBuckets, otherNegativeSpans, otherNegativeBuckets)
return h
}
@ -466,25 +478,25 @@ func (h *FloatHistogram) DetectReset(previous *FloatHistogram) bool {
}
currIt := h.floatBucketIterator(true, h.ZeroThreshold, h.Schema)
prevIt := previous.floatBucketIterator(true, h.ZeroThreshold, h.Schema)
if detectReset(currIt, prevIt) {
if detectReset(&currIt, &prevIt) {
return true
}
currIt = h.floatBucketIterator(false, h.ZeroThreshold, h.Schema)
prevIt = previous.floatBucketIterator(false, h.ZeroThreshold, h.Schema)
return detectReset(currIt, prevIt)
return detectReset(&currIt, &prevIt)
}
func detectReset(currIt, prevIt BucketIterator[float64]) bool {
func detectReset(currIt, prevIt *floatBucketIterator) bool {
if !prevIt.Next() {
return false // If no buckets in previous histogram, nothing can be reset.
}
prevBucket := prevIt.At()
prevBucket := prevIt.strippedAt()
if !currIt.Next() {
// No bucket in current, but at least one in previous
// histogram. Check if any of those are non-zero, in which case
// this is a reset.
for {
if prevBucket.Count != 0 {
if prevBucket.count != 0 {
return true
}
if !prevIt.Next() {
@ -492,10 +504,10 @@ func detectReset(currIt, prevIt BucketIterator[float64]) bool {
}
}
}
currBucket := currIt.At()
currBucket := currIt.strippedAt()
for {
// Forward currIt until we find the bucket corresponding to prevBucket.
for currBucket.Index < prevBucket.Index {
for currBucket.index < prevBucket.index {
if !currIt.Next() {
// Reached end of currIt early, therefore
// previous histogram has a bucket that the
@ -503,7 +515,7 @@ func detectReset(currIt, prevIt BucketIterator[float64]) bool {
// remaining buckets in the previous histogram
// are unpopulated, this is a reset.
for {
if prevBucket.Count != 0 {
if prevBucket.count != 0 {
return true
}
if !prevIt.Next() {
@ -511,18 +523,18 @@ func detectReset(currIt, prevIt BucketIterator[float64]) bool {
}
}
}
currBucket = currIt.At()
currBucket = currIt.strippedAt()
}
if currBucket.Index > prevBucket.Index {
if currBucket.index > prevBucket.index {
// Previous histogram has a bucket the current one does
// not have. If it's populated, it's a reset.
if prevBucket.Count != 0 {
if prevBucket.count != 0 {
return true
}
} else {
// We have reached corresponding buckets in both iterators.
// We can finally compare the counts.
if currBucket.Count < prevBucket.Count {
if currBucket.count < prevBucket.count {
return true
}
}
@ -530,35 +542,39 @@ func detectReset(currIt, prevIt BucketIterator[float64]) bool {
// Reached end of prevIt without finding offending buckets.
return false
}
prevBucket = prevIt.At()
prevBucket = prevIt.strippedAt()
}
}
// PositiveBucketIterator returns a BucketIterator to iterate over all positive
// buckets in ascending order (starting next to the zero bucket and going up).
func (h *FloatHistogram) PositiveBucketIterator() BucketIterator[float64] {
return h.floatBucketIterator(true, 0, h.Schema)
it := h.floatBucketIterator(true, 0, h.Schema)
return &it
}
// NegativeBucketIterator returns a BucketIterator to iterate over all negative
// buckets in descending order (starting next to the zero bucket and going
// down).
func (h *FloatHistogram) NegativeBucketIterator() BucketIterator[float64] {
return h.floatBucketIterator(false, 0, h.Schema)
it := h.floatBucketIterator(false, 0, h.Schema)
return &it
}
// PositiveReverseBucketIterator returns a BucketIterator to iterate over all
// positive buckets in descending order (starting at the highest bucket and
// going down towards the zero bucket).
func (h *FloatHistogram) PositiveReverseBucketIterator() BucketIterator[float64] {
return newReverseFloatBucketIterator(h.PositiveSpans, h.PositiveBuckets, h.Schema, true)
it := newReverseFloatBucketIterator(h.PositiveSpans, h.PositiveBuckets, h.Schema, true)
return &it
}
// NegativeReverseBucketIterator returns a BucketIterator to iterate over all
// negative buckets in ascending order (starting at the lowest bucket and going
// up towards the zero bucket).
func (h *FloatHistogram) NegativeReverseBucketIterator() BucketIterator[float64] {
return newReverseFloatBucketIterator(h.NegativeSpans, h.NegativeBuckets, h.Schema, false)
it := newReverseFloatBucketIterator(h.NegativeSpans, h.NegativeBuckets, h.Schema, false)
return &it
}
// AllBucketIterator returns a BucketIterator to iterate over all negative,
@ -569,8 +585,8 @@ func (h *FloatHistogram) NegativeReverseBucketIterator() BucketIterator[float64]
func (h *FloatHistogram) AllBucketIterator() BucketIterator[float64] {
return &allFloatBucketIterator{
h: h,
leftIter: h.NegativeReverseBucketIterator(),
rightIter: h.PositiveBucketIterator(),
leftIter: newReverseFloatBucketIterator(h.NegativeSpans, h.NegativeBuckets, h.Schema, false),
rightIter: h.floatBucketIterator(true, 0, h.Schema),
state: -1,
}
}
@ -583,12 +599,37 @@ func (h *FloatHistogram) AllBucketIterator() BucketIterator[float64] {
func (h *FloatHistogram) AllReverseBucketIterator() BucketIterator[float64] {
return &allFloatBucketIterator{
h: h,
leftIter: h.PositiveReverseBucketIterator(),
rightIter: h.NegativeBucketIterator(),
leftIter: newReverseFloatBucketIterator(h.PositiveSpans, h.PositiveBuckets, h.Schema, true),
rightIter: h.floatBucketIterator(false, 0, h.Schema),
state: -1,
}
}
// Validate validates consistency between span and bucket slices. Also, buckets are checked
// against negative values.
// We do not check for h.Count being at least as large as the sum of the
// counts in the buckets because floating point precision issues can
// create false positives here.
func (h *FloatHistogram) Validate() error {
if err := checkHistogramSpans(h.NegativeSpans, len(h.NegativeBuckets)); err != nil {
return fmt.Errorf("negative side: %w", err)
}
if err := checkHistogramSpans(h.PositiveSpans, len(h.PositiveBuckets)); err != nil {
return fmt.Errorf("positive side: %w", err)
}
var nCount, pCount float64
err := checkHistogramBuckets(h.NegativeBuckets, &nCount, false)
if err != nil {
return fmt.Errorf("negative side: %w", err)
}
err = checkHistogramBuckets(h.PositiveBuckets, &pCount, false)
if err != nil {
return fmt.Errorf("positive side: %w", err)
}
return nil
}
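
A minimal sketch of calling the new Validate method on hand-built float histograms; the values are chosen here purely for illustration.

package main

import (
	"fmt"

	"github.com/prometheus/prometheus/model/histogram"
)

func main() {
	ok := &histogram.FloatHistogram{
		Schema:          0,
		Count:           3,
		Sum:             5,
		PositiveSpans:   []histogram.Span{{Offset: 0, Length: 2}},
		PositiveBuckets: []float64{1, 2},
	}
	// Passes: the span covers exactly two buckets and no count is negative.
	fmt.Println(ok.Validate()) // <nil>

	bad := &histogram.FloatHistogram{
		PositiveSpans:   []histogram.Span{{Offset: 0, Length: 2}},
		PositiveBuckets: []float64{1},
	}
	// Fails: the span claims two buckets but only one is provided.
	// Note that Count is deliberately not cross-checked for float histograms.
	fmt.Println(bad.Validate())
}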
// zeroCountForLargerThreshold returns what the histogram's zero count would be
// if the ZeroThreshold had the provided larger (or equal) value. If the
// provided value is less than the histogram's ZeroThreshold, the method panics.
@ -715,11 +756,11 @@ func (h *FloatHistogram) reconcileZeroBuckets(other *FloatHistogram) float64 {
// targetSchema prior to iterating (without mutating FloatHistogram).
func (h *FloatHistogram) floatBucketIterator(
positive bool, absoluteStartValue float64, targetSchema int32,
) *floatBucketIterator {
) floatBucketIterator {
if targetSchema > h.Schema {
panic(fmt.Errorf("cannot merge from schema %d to %d", h.Schema, targetSchema))
}
i := &floatBucketIterator{
i := floatBucketIterator{
baseBucketIterator: baseBucketIterator[float64, float64]{
schema: h.Schema,
positive: positive,
@ -737,11 +778,11 @@ func (h *FloatHistogram) floatBucketIterator(
return i
}
// reverseFloatbucketiterator is a low-level constructor for reverse bucket iterators.
// newReverseFloatBucketIterator is a low-level constructor for reverse bucket iterators.
func newReverseFloatBucketIterator(
spans []Span, buckets []float64, schema int32, positive bool,
) *reverseFloatBucketIterator {
r := &reverseFloatBucketIterator{
) reverseFloatBucketIterator {
r := reverseFloatBucketIterator{
baseBucketIterator: baseBucketIterator[float64, float64]{
schema: schema,
spans: spans,
@ -769,6 +810,8 @@ type floatBucketIterator struct {
targetSchema int32 // targetSchema is the schema to merge to and must be ≤ schema.
origIdx int32 // The bucket index within the original schema.
absoluteStartValue float64 // Never return buckets with an upper bound ≤ this value.
boundReachedStartValue bool // Has getBound reached absoluteStartValue already?
}
func (i *floatBucketIterator) At() Bucket[float64] {
@ -832,9 +875,10 @@ mergeLoop: // Merge together all buckets from the original schema that fall into
}
// Skip buckets before absoluteStartValue.
// TODO(beorn7): Maybe do something more efficient than this recursive call.
if getBound(i.currIdx, i.targetSchema) <= i.absoluteStartValue {
if !i.boundReachedStartValue && getBound(i.currIdx, i.targetSchema) <= i.absoluteStartValue {
return i.Next()
}
i.boundReachedStartValue = true
return true
}
@ -875,8 +919,9 @@ func (i *reverseFloatBucketIterator) Next() bool {
}
type allFloatBucketIterator struct {
h *FloatHistogram
leftIter, rightIter BucketIterator[float64]
h *FloatHistogram
leftIter reverseFloatBucketIterator
rightIter floatBucketIterator
// -1 means we are iterating negative buckets.
// 0 means it is time for the zero bucket.
// 1 means we are iterating positive buckets.
@ -942,69 +987,6 @@ func targetIdx(idx, originSchema, targetSchema int32) int32 {
return ((idx - 1) >> (originSchema - targetSchema)) + 1
}
// mergeToSchema is used to merge a FloatHistogram's Spans and Buckets (no matter if
// positive or negative) from the original schema to the target schema.
// The target schema must be smaller than the original schema.
func mergeToSchema(originSpans []Span, originBuckets []float64, originSchema, targetSchema int32) ([]Span, []float64) {
var (
targetSpans []Span // The spans in the target schema.
targetBuckets []float64 // The buckets in the target schema.
bucketIdx int32 // The index of bucket in the origin schema.
lastTargetBucketIdx int32 // The index of the last added target bucket.
origBucketIdx int // The position of a bucket in originBuckets slice.
)
for _, span := range originSpans {
// Determine the index of the first bucket in this span.
bucketIdx += span.Offset
for j := 0; j < int(span.Length); j++ {
// Determine the index of the bucket in the target schema from the index in the original schema.
targetBucketIdx := targetIdx(bucketIdx, originSchema, targetSchema)
switch {
case len(targetSpans) == 0:
// This is the first span in the targetSpans.
span := Span{
Offset: targetBucketIdx,
Length: 1,
}
targetSpans = append(targetSpans, span)
targetBuckets = append(targetBuckets, originBuckets[0])
lastTargetBucketIdx = targetBucketIdx
case lastTargetBucketIdx == targetBucketIdx:
// The current bucket has to be merged into the same target bucket as the previous bucket.
targetBuckets[len(targetBuckets)-1] += originBuckets[origBucketIdx]
case (lastTargetBucketIdx + 1) == targetBucketIdx:
// The current bucket has to go into a new target bucket,
// and that bucket is next to the previous target bucket,
// so we add it to the current target span.
targetSpans[len(targetSpans)-1].Length++
targetBuckets = append(targetBuckets, originBuckets[origBucketIdx])
lastTargetBucketIdx++
case (lastTargetBucketIdx + 1) < targetBucketIdx:
// The current bucket has to go into a new target bucket,
// and that bucket is separated by a gap from the previous target bucket,
// so we need to add a new target span.
span := Span{
Offset: targetBucketIdx - lastTargetBucketIdx - 1,
Length: 1,
}
targetSpans = append(targetSpans, span)
targetBuckets = append(targetBuckets, originBuckets[origBucketIdx])
lastTargetBucketIdx = targetBucketIdx
}
bucketIdx++
origBucketIdx++
}
}
return targetSpans, targetBuckets
}
// addBuckets adds the buckets described by spansB/bucketsB to the buckets described by spansA/bucketsA,
// creating missing buckets in spansA/bucketsA as needed.
// It returns the resulting spans/buckets (which must be used instead of the original spansA/bucketsA,
@ -1146,3 +1128,16 @@ func floatBucketsMatch(b1, b2 []float64) bool {
}
return true
}
// ReduceResolution reduces the float histogram's spans and buckets to the target schema.
// The target schema must be smaller than the current float histogram's schema.
func (h *FloatHistogram) ReduceResolution(targetSchema int32) *FloatHistogram {
if targetSchema >= h.Schema {
panic(fmt.Errorf("cannot reduce resolution from schema %d to %d", h.Schema, targetSchema))
}
h.PositiveSpans, h.PositiveBuckets = reduceResolution(h.PositiveSpans, h.PositiveBuckets, h.Schema, targetSchema, false)
h.NegativeSpans, h.NegativeBuckets = reduceResolution(h.NegativeSpans, h.NegativeBuckets, h.Schema, targetSchema, false)
h.Schema = targetSchema
return h
}
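
As a hedged illustration of the in-place semantics, using the positive side of the test case that appears later in this diff:

package main

import (
	"fmt"

	"github.com/prometheus/prometheus/model/histogram"
)

func main() {
	fh := &histogram.FloatHistogram{
		Schema: 0, // base-2 buckets
		PositiveSpans: []histogram.Span{
			{Offset: 0, Length: 4},
			{Offset: 0, Length: 0},
			{Offset: 3, Length: 2},
		},
		PositiveBuckets: []float64{1, 3, 1, 2, 1, 1},
	}
	// ReduceResolution mutates fh and returns it; schema -1 means base-4 buckets.
	fh.ReduceResolution(-1)
	fmt.Println(fh.Schema, fh.PositiveBuckets) // -1 [1 4 2 2]
	// A target schema >= the current schema would panic.
}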

View file

@ -16,6 +16,7 @@ package histogram
import (
"fmt"
"math"
"math/rand"
"testing"
"github.com/stretchr/testify/require"
@ -1572,9 +1573,12 @@ func TestFloatHistogramAdd(t *testing.T) {
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
in2Copy := c.in2.Copy()
require.Equal(t, c.expected, c.in1.Add(c.in2))
// Has it also happened in-place?
require.Equal(t, c.expected, c.in1)
// Check that the argument was not mutated.
require.Equal(t, in2Copy, c.in2)
})
}
}
@ -1658,9 +1662,12 @@ func TestFloatHistogramSub(t *testing.T) {
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
in2Copy := c.in2.Copy()
require.Equal(t, c.expected, c.in1.Sub(c.in2))
// Has it also happened in-place?
require.Equal(t, c.expected, c.in1)
// Check that the argument was not mutated.
require.Equal(t, in2Copy, c.in2)
})
}
}
@ -2393,3 +2400,94 @@ func TestFloatHistogramSize(t *testing.T) {
})
}
}
func BenchmarkFloatHistogramAllBucketIterator(b *testing.B) {
rng := rand.New(rand.NewSource(0))
fh := createRandomFloatHistogram(rng, 50)
b.ReportAllocs() // the current implementation reports 1 alloc
b.ResetTimer()
for n := 0; n < b.N; n++ {
for it := fh.AllBucketIterator(); it.Next(); {
}
}
}
func BenchmarkFloatHistogramDetectReset(b *testing.B) {
rng := rand.New(rand.NewSource(0))
fh := createRandomFloatHistogram(rng, 50)
b.ReportAllocs() // the current implementation reports 0 allocs
b.ResetTimer()
for n := 0; n < b.N; n++ {
// Detect against itself (no resets is the worst-case input).
fh.DetectReset(fh)
}
}
func createRandomFloatHistogram(rng *rand.Rand, spanNum int32) *FloatHistogram {
f := &FloatHistogram{}
f.PositiveSpans, f.PositiveBuckets = createRandomSpans(rng, spanNum)
f.NegativeSpans, f.NegativeBuckets = createRandomSpans(rng, spanNum)
return f
}
func createRandomSpans(rng *rand.Rand, spanNum int32) ([]Span, []float64) {
Spans := make([]Span, spanNum)
Buckets := make([]float64, 0)
for i := 0; i < int(spanNum); i++ {
Spans[i].Offset = rng.Int31n(spanNum) + 1
Spans[i].Length = uint32(rng.Int31n(spanNum) + 1)
for j := 0; j < int(Spans[i].Length); j++ {
Buckets = append(Buckets, float64(rng.Int31n(spanNum)+1))
}
}
return Spans, Buckets
}
func TestFloatHistogramReduceResolution(t *testing.T) {
tcs := map[string]struct {
origin *FloatHistogram
target *FloatHistogram
}{
"valid float histogram": {
origin: &FloatHistogram{
Schema: 0,
PositiveSpans: []Span{
{Offset: 0, Length: 4},
{Offset: 0, Length: 0},
{Offset: 3, Length: 2},
},
PositiveBuckets: []float64{1, 3, 1, 2, 1, 1},
NegativeSpans: []Span{
{Offset: 0, Length: 4},
{Offset: 0, Length: 0},
{Offset: 3, Length: 2},
},
NegativeBuckets: []float64{1, 3, 1, 2, 1, 1},
},
target: &FloatHistogram{
Schema: -1,
PositiveSpans: []Span{
{Offset: 0, Length: 3},
{Offset: 1, Length: 1},
},
PositiveBuckets: []float64{1, 4, 2, 2},
NegativeSpans: []Span{
{Offset: 0, Length: 3},
{Offset: 1, Length: 1},
},
NegativeBuckets: []float64{1, 4, 2, 2},
},
},
}
for _, tc := range tcs {
target := tc.origin.ReduceResolution(tc.target.Schema)
require.Equal(t, tc.target, target)
}
}

View file

@ -14,11 +14,20 @@
package histogram
import (
"errors"
"fmt"
"math"
"strings"
)
var (
ErrHistogramCountNotBigEnough = errors.New("histogram's observation count should be at least the number of observations found in the buckets")
ErrHistogramCountMismatch = errors.New("histogram's observation count should equal the number of observations found in the buckets (in absence of NaN)")
ErrHistogramNegativeBucketCount = errors.New("histogram has a bucket whose observation count is negative")
ErrHistogramSpanNegativeOffset = errors.New("histogram has a span whose offset is negative")
ErrHistogramSpansBucketsMismatch = errors.New("histogram spans specify different number of buckets than provided")
)
// BucketCount is a type constraint for the count in a bucket, which can be
// float64 (for type FloatHistogram) or uint64 (for type Histogram).
type BucketCount interface {
@ -53,6 +62,13 @@ type Bucket[BC BucketCount] struct {
Index int32
}
// strippedBucket is Bucket without bound values (which are expensive to calculate
// and not used in certain use cases).
type strippedBucket[BC BucketCount] struct {
count BC
index int32
}
// String returns a string representation of a Bucket, using the usual
// mathematical notation of '['/']' for inclusive bounds and '('/')' for
// non-inclusive bounds.
@ -101,13 +117,12 @@ type baseBucketIterator[BC BucketCount, IBC InternalBucketCount] struct {
currIdx int32 // The actual bucket index.
}
func (b baseBucketIterator[BC, IBC]) At() Bucket[BC] {
func (b *baseBucketIterator[BC, IBC]) At() Bucket[BC] {
return b.at(b.schema)
}
// at is an internal version of the exported At to enable using a different
// schema.
func (b baseBucketIterator[BC, IBC]) at(schema int32) Bucket[BC] {
// at is an internal version of the exported At to enable using a different schema.
func (b *baseBucketIterator[BC, IBC]) at(schema int32) Bucket[BC] {
bucket := Bucket[BC]{
Count: BC(b.currCount),
Index: b.currIdx,
@ -124,6 +139,14 @@ func (b baseBucketIterator[BC, IBC]) at(schema int32) Bucket[BC] {
return bucket
}
// strippedAt returns the current strippedBucket (which lacks bucket bounds but is cheaper to compute).
func (b *baseBucketIterator[BC, IBC]) strippedAt() strippedBucket[BC] {
return strippedBucket[BC]{
count: BC(b.currCount),
index: b.currIdx,
}
}
// compactBuckets is a generic function used by both Histogram.Compact and
// FloatHistogram.Compact. Set deltaBuckets to true if the provided buckets are
// deltas. Set it to false if the buckets contain absolute counts.
@ -333,6 +356,43 @@ func compactBuckets[IBC InternalBucketCount](buckets []IBC, spans []Span, maxEmp
return buckets, spans
}
func checkHistogramSpans(spans []Span, numBuckets int) error {
var spanBuckets int
for n, span := range spans {
if n > 0 && span.Offset < 0 {
return fmt.Errorf("span number %d with offset %d: %w", n+1, span.Offset, ErrHistogramSpanNegativeOffset)
}
spanBuckets += int(span.Length)
}
if spanBuckets != numBuckets {
return fmt.Errorf("spans need %d buckets, have %d buckets: %w", spanBuckets, numBuckets, ErrHistogramSpansBucketsMismatch)
}
return nil
}
func checkHistogramBuckets[BC BucketCount, IBC InternalBucketCount](buckets []IBC, count *BC, deltas bool) error {
if len(buckets) == 0 {
return nil
}
var last IBC
for i := 0; i < len(buckets); i++ {
var c IBC
if deltas {
c = last + buckets[i]
} else {
c = buckets[i]
}
if c < 0 {
return fmt.Errorf("bucket number %d has observation count of %v: %w", i+1, c, ErrHistogramNegativeBucketCount)
}
last = c
*count += BC(c)
}
return nil
}
func getBound(idx, schema int32) float64 {
// Here a bit of context about the behavior for the last bucket counting
// regular numbers (called simply "last bucket" below) and the bucket
@ -540,3 +600,90 @@ var exponentialBounds = [][]float64{
0.9892280131939752, 0.9919100824251095, 0.9945994234836328, 0.9972960560854698,
},
}
// reduceResolution reduces the input spans and buckets from the origin schema to the corresponding spans and buckets in the target schema.
// The target schema must be smaller than the original schema.
// Set deltaBuckets to true if the provided buckets are
// deltas. Set it to false if the buckets contain absolute counts.
func reduceResolution[IBC InternalBucketCount](originSpans []Span, originBuckets []IBC, originSchema, targetSchema int32, deltaBuckets bool) ([]Span, []IBC) {
var (
targetSpans []Span // The spans in the target schema.
targetBuckets []IBC // The bucket counts in the target schema.
bucketIdx int32 // The index of bucket in the origin schema.
bucketCountIdx int // The position of a bucket in origin bucket count slice `originBuckets`.
targetBucketIdx int32 // The index of bucket in the target schema.
lastBucketCount IBC // The last visited bucket's count in the origin schema.
lastTargetBucketIdx int32 // The index of the last added target bucket.
lastTargetBucketCount IBC
)
for _, span := range originSpans {
// Determine the index of the first bucket in this span.
bucketIdx += span.Offset
for j := 0; j < int(span.Length); j++ {
// Determine the index of the bucket in the target schema from the index in the original schema.
targetBucketIdx = targetIdx(bucketIdx, originSchema, targetSchema)
switch {
case len(targetSpans) == 0:
// This is the first span in the targetSpans.
span := Span{
Offset: targetBucketIdx,
Length: 1,
}
targetSpans = append(targetSpans, span)
targetBuckets = append(targetBuckets, originBuckets[bucketCountIdx])
lastTargetBucketIdx = targetBucketIdx
lastBucketCount = originBuckets[bucketCountIdx]
lastTargetBucketCount = originBuckets[bucketCountIdx]
case lastTargetBucketIdx == targetBucketIdx:
// The current bucket has to be merged into the same target bucket as the previous bucket.
if deltaBuckets {
lastBucketCount += originBuckets[bucketCountIdx]
targetBuckets[len(targetBuckets)-1] += lastBucketCount
lastTargetBucketCount += lastBucketCount
} else {
targetBuckets[len(targetBuckets)-1] += originBuckets[bucketCountIdx]
}
case (lastTargetBucketIdx + 1) == targetBucketIdx:
// The current bucket has to go into a new target bucket,
// and that bucket is next to the previous target bucket,
// so we add it to the current target span.
targetSpans[len(targetSpans)-1].Length++
lastTargetBucketIdx++
if deltaBuckets {
lastBucketCount += originBuckets[bucketCountIdx]
targetBuckets = append(targetBuckets, lastBucketCount-lastTargetBucketCount)
lastTargetBucketCount = lastBucketCount
} else {
targetBuckets = append(targetBuckets, originBuckets[bucketCountIdx])
}
case (lastTargetBucketIdx + 1) < targetBucketIdx:
// The current bucket has to go into a new target bucket,
// and that bucket is separated by a gap from the previous target bucket,
// so we need to add a new target span.
span := Span{
Offset: targetBucketIdx - lastTargetBucketIdx - 1,
Length: 1,
}
targetSpans = append(targetSpans, span)
lastTargetBucketIdx = targetBucketIdx
if deltaBuckets {
lastBucketCount += originBuckets[bucketCountIdx]
targetBuckets = append(targetBuckets, lastBucketCount-lastTargetBucketCount)
lastTargetBucketCount = lastBucketCount
} else {
targetBuckets = append(targetBuckets, originBuckets[bucketCountIdx])
}
}
bucketIdx++
bucketCountIdx++
}
}
return targetSpans, targetBuckets
}
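
To make the deltaBuckets branch concrete, here is the arithmetic for the delta-encoded test case further down in this diff: absolute counts are recovered by cumulative summing, merged into the coarser buckets, then re-encoded as deltas.

origin deltas (schema 0):     1, 2, -2, 1, -1, 0   -> absolute counts 1, 3, 1, 2, 1, 1
merged absolute (schema -1):  1, 3+1, 2, 1+1       =  1, 4, 2, 2  (empty gap buckets contribute 0)
target deltas (schema -1):    1, 4-1, 2-4, 2-2     =  1, 3, -2, 0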

View file

@ -110,3 +110,73 @@ func TestGetBound(t *testing.T) {
}
}
}
func TestReduceResolutionHistogram(t *testing.T) {
cases := []struct {
spans []Span
buckets []int64
schema int32
targetSchema int32
expectedSpans []Span
expectedBuckets []int64
}{
{
spans: []Span{
{Offset: 0, Length: 4},
{Offset: 0, Length: 0},
{Offset: 3, Length: 2},
},
buckets: []int64{1, 2, -2, 1, -1, 0},
schema: 0,
targetSchema: -1,
expectedSpans: []Span{
{Offset: 0, Length: 3},
{Offset: 1, Length: 1},
},
expectedBuckets: []int64{1, 3, -2, 0},
// schema 0, base 2: { (0.5,1]:1, (1,2]:3, (2,4]:1, (4,8]:2, (8,16]:0, (16,32]:0, (32,64]:0, (64,128]:1, (128,256]:1 }
// schema -1, base 4: { (0.25,1]:1, (1,4]:4, (4,16]:2, (16,64]:0, (64,256]:2 }
},
}
for _, tc := range cases {
spans, buckets := reduceResolution(tc.spans, tc.buckets, tc.schema, tc.targetSchema, true)
require.Equal(t, tc.expectedSpans, spans)
require.Equal(t, tc.expectedBuckets, buckets)
}
}
func TestReduceResolutionFloatHistogram(t *testing.T) {
cases := []struct {
spans []Span
buckets []float64
schema int32
targetSchema int32
expectedSpans []Span
expectedBuckets []float64
}{
{
spans: []Span{
{Offset: 0, Length: 4},
{Offset: 0, Length: 0},
{Offset: 3, Length: 2},
},
buckets: []float64{1, 3, 1, 2, 1, 1},
schema: 0,
targetSchema: -1,
expectedSpans: []Span{
{Offset: 0, Length: 3},
{Offset: 1, Length: 1},
},
expectedBuckets: []float64{1, 4, 2, 2},
// schema 0, base 2: { (0.5,1]:1, (1,2]:3, (2,4]:1, (4,8]:2, (8,16]:0, (16,32]:0, (32,64]:0, (64,128]:1, (128,256]:1 }
// schema -1, base 4: { (0.25,1]:1, (1,4]:4, (4,16]:2, (16,64]:0, (64,256]:2 }
},
}
for _, tc := range cases {
spans, buckets := reduceResolution(tc.spans, tc.buckets, tc.schema, tc.targetSchema, false)
require.Equal(t, tc.expectedSpans, spans)
require.Equal(t, tc.expectedBuckets, buckets)
}
}

View file

@ -150,13 +150,15 @@ func (h *Histogram) ZeroBucket() Bucket[uint64] {
// PositiveBucketIterator returns a BucketIterator to iterate over all positive
// buckets in ascending order (starting next to the zero bucket and going up).
func (h *Histogram) PositiveBucketIterator() BucketIterator[uint64] {
return newRegularBucketIterator(h.PositiveSpans, h.PositiveBuckets, h.Schema, true)
it := newRegularBucketIterator(h.PositiveSpans, h.PositiveBuckets, h.Schema, true)
return &it
}
// NegativeBucketIterator returns a BucketIterator to iterate over all negative
// buckets in descending order (starting next to the zero bucket and going down).
func (h *Histogram) NegativeBucketIterator() BucketIterator[uint64] {
return newRegularBucketIterator(h.NegativeSpans, h.NegativeBuckets, h.Schema, false)
it := newRegularBucketIterator(h.NegativeSpans, h.NegativeBuckets, h.Schema, false)
return &it
}
// CumulativeBucketIterator returns a BucketIterator to iterate over a
@ -326,18 +328,56 @@ func (h *Histogram) ToFloat() *FloatHistogram {
}
}
// Validate validates consistency between span and bucket slices. Also, buckets are checked
// against negative values.
// For histograms that have not observed any NaN values (based on IsNaN(h.Sum) check), a
// strict h.Count = nCount + pCount + h.ZeroCount check is performed.
// Otherwise, only a lower bound check will be done (h.Count >= nCount + pCount + h.ZeroCount),
// because NaN observations do not increment the values of buckets (but they do increment
// the total h.Count).
func (h *Histogram) Validate() error {
if err := checkHistogramSpans(h.NegativeSpans, len(h.NegativeBuckets)); err != nil {
return fmt.Errorf("negative side: %w", err)
}
if err := checkHistogramSpans(h.PositiveSpans, len(h.PositiveBuckets)); err != nil {
return fmt.Errorf("positive side: %w", err)
}
var nCount, pCount uint64
err := checkHistogramBuckets(h.NegativeBuckets, &nCount, true)
if err != nil {
return fmt.Errorf("negative side: %w", err)
}
err = checkHistogramBuckets(h.PositiveBuckets, &pCount, true)
if err != nil {
return fmt.Errorf("positive side: %w", err)
}
sumOfBuckets := nCount + pCount + h.ZeroCount
if math.IsNaN(h.Sum) {
if sumOfBuckets > h.Count {
return fmt.Errorf("%d observations found in buckets, but the Count field is %d: %w", sumOfBuckets, h.Count, ErrHistogramCountNotBigEnough)
}
} else {
if sumOfBuckets != h.Count {
return fmt.Errorf("%d observations found in buckets, but the Count field is %d: %w", sumOfBuckets, h.Count, ErrHistogramCountMismatch)
}
}
return nil
}
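
A small sketch of the NaN-sensitive check described above, reusing the shape of the test cases further down in this diff:

package main

import (
	"fmt"
	"math"

	"github.com/prometheus/prometheus/model/histogram"
)

func main() {
	h := &histogram.Histogram{
		Count:           4,
		ZeroCount:       2,
		Sum:             math.NaN(),
		PositiveSpans:   []histogram.Span{{Offset: 0, Length: 1}},
		PositiveBuckets: []int64{1}, // delta-encoded; one observation in the bucket
	}
	// Sum is NaN, so only the lower-bound check applies: Count (4) >= 2+1 passes.
	fmt.Println(h.Validate()) // <nil>

	h.Sum = 333
	// Without NaN, Count must equal ZeroCount plus the bucket counts (3), so this fails.
	fmt.Println(h.Validate())
}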
type regularBucketIterator struct {
baseBucketIterator[uint64, int64]
}
func newRegularBucketIterator(spans []Span, buckets []int64, schema int32, positive bool) *regularBucketIterator {
func newRegularBucketIterator(spans []Span, buckets []int64, schema int32, positive bool) regularBucketIterator {
i := baseBucketIterator[uint64, int64]{
schema: schema,
spans: spans,
buckets: buckets,
positive: positive,
}
return &regularBucketIterator{i}
return regularBucketIterator{i}
}
func (r *regularBucketIterator) Next() bool {
@ -453,3 +493,20 @@ func (c *cumulativeBucketIterator) At() Bucket[uint64] {
Index: c.currIdx - 1,
}
}
// ReduceResolution reduces the histogram's spans and buckets to the target schema.
// The target schema must be smaller than the current histogram's schema.
func (h *Histogram) ReduceResolution(targetSchema int32) *Histogram {
if targetSchema >= h.Schema {
panic(fmt.Errorf("cannot reduce resolution from schema %d to %d", h.Schema, targetSchema))
}
h.PositiveSpans, h.PositiveBuckets = reduceResolution(
h.PositiveSpans, h.PositiveBuckets, h.Schema, targetSchema, true,
)
h.NegativeSpans, h.NegativeBuckets = reduceResolution(
h.NegativeSpans, h.NegativeBuckets, h.Schema, targetSchema, true,
)
h.Schema = targetSchema
return h
}
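
The integer counterpart keeps the delta encoding; a hedged sketch using the delta-encoded test case below:

package main

import (
	"fmt"

	"github.com/prometheus/prometheus/model/histogram"
)

func main() {
	h := &histogram.Histogram{
		Schema: 0,
		PositiveSpans: []histogram.Span{
			{Offset: 0, Length: 4},
			{Offset: 0, Length: 0},
			{Offset: 3, Length: 2},
		},
		// Delta encoding of absolute counts 1, 3, 1, 2, 1, 1.
		PositiveBuckets: []int64{1, 2, -2, 1, -1, 0},
	}
	h.ReduceResolution(-1)
	fmt.Println(h.Schema, h.PositiveBuckets) // -1 [1 3 -2 0], i.e. absolute counts 1, 4, 2, 2
}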

View file

@ -811,3 +811,202 @@ func TestHistogramCompact(t *testing.T) {
})
}
}
func TestHistogramValidation(t *testing.T) {
tests := map[string]struct {
h *Histogram
errMsg string
skipFloat bool
}{
"valid histogram": {
h: &Histogram{
Count: 12,
ZeroCount: 2,
ZeroThreshold: 0.001,
Sum: 19.4,
Schema: 1,
PositiveSpans: []Span{
{Offset: 0, Length: 2},
{Offset: 1, Length: 2},
},
PositiveBuckets: []int64{1, 1, -1, 0},
NegativeSpans: []Span{
{Offset: 0, Length: 2},
{Offset: 1, Length: 2},
},
NegativeBuckets: []int64{1, 1, -1, 0},
},
},
"valid histogram with NaN observations that has its Count (4) higher than the actual total of buckets (2 + 1)": {
// This case is possible if NaN values (which do not fall into any bucket) are observed.
h: &Histogram{
ZeroCount: 2,
Count: 4,
Sum: math.NaN(),
PositiveSpans: []Span{{Offset: 0, Length: 1}},
PositiveBuckets: []int64{1},
},
},
"rejects histogram without NaN observations that has its Count (4) higher than the actual total of buckets (2 + 1)": {
h: &Histogram{
ZeroCount: 2,
Count: 4,
Sum: 333,
PositiveSpans: []Span{{Offset: 0, Length: 1}},
PositiveBuckets: []int64{1},
},
errMsg: `3 observations found in buckets, but the Count field is 4: histogram's observation count should equal the number of observations found in the buckets (in absence of NaN)`,
skipFloat: true,
},
"rejects histogram that has too few negative buckets": {
h: &Histogram{
NegativeSpans: []Span{{Offset: 0, Length: 1}},
NegativeBuckets: []int64{},
},
errMsg: `negative side: spans need 1 buckets, have 0 buckets: histogram spans specify different number of buckets than provided`,
},
"rejects histogram that has too few positive buckets": {
h: &Histogram{
PositiveSpans: []Span{{Offset: 0, Length: 1}},
PositiveBuckets: []int64{},
},
errMsg: `positive side: spans need 1 buckets, have 0 buckets: histogram spans specify different number of buckets than provided`,
},
"rejects histogram that has too many negative buckets": {
h: &Histogram{
NegativeSpans: []Span{{Offset: 0, Length: 1}},
NegativeBuckets: []int64{1, 2},
},
errMsg: `negative side: spans need 1 buckets, have 2 buckets: histogram spans specify different number of buckets than provided`,
},
"rejects histogram that has too many positive buckets": {
h: &Histogram{
PositiveSpans: []Span{{Offset: 0, Length: 1}},
PositiveBuckets: []int64{1, 2},
},
errMsg: `positive side: spans need 1 buckets, have 2 buckets: histogram spans specify different number of buckets than provided`,
},
"rejects a histogram that has a negative span with a negative offset": {
h: &Histogram{
NegativeSpans: []Span{{Offset: -1, Length: 1}, {Offset: -1, Length: 1}},
NegativeBuckets: []int64{1, 2},
},
errMsg: `negative side: span number 2 with offset -1: histogram has a span whose offset is negative`,
},
"rejects a histogram which has a positive span with a negative offset": {
h: &Histogram{
PositiveSpans: []Span{{Offset: -1, Length: 1}, {Offset: -1, Length: 1}},
PositiveBuckets: []int64{1, 2},
},
errMsg: `positive side: span number 2 with offset -1: histogram has a span whose offset is negative`,
},
"rejects a histogram that has a negative bucket with a negative count": {
h: &Histogram{
NegativeSpans: []Span{{Offset: -1, Length: 1}},
NegativeBuckets: []int64{-1},
},
errMsg: `negative side: bucket number 1 has observation count of -1: histogram has a bucket whose observation count is negative`,
},
"rejects a histogram that has a positive bucket with a negative count": {
h: &Histogram{
PositiveSpans: []Span{{Offset: -1, Length: 1}},
PositiveBuckets: []int64{-1},
},
errMsg: `positive side: bucket number 1 has observation count of -1: histogram has a bucket whose observation count is negative`,
},
"rejects a histogram that has a lower count than count in buckets": {
h: &Histogram{
Count: 0,
NegativeSpans: []Span{{Offset: -1, Length: 1}},
PositiveSpans: []Span{{Offset: -1, Length: 1}},
NegativeBuckets: []int64{1},
PositiveBuckets: []int64{1},
},
errMsg: `2 observations found in buckets, but the Count field is 0: histogram's observation count should equal the number of observations found in the buckets (in absence of NaN)`,
skipFloat: true,
},
"rejects a histogram that doesn't count the zero bucket in its count": {
h: &Histogram{
Count: 2,
ZeroCount: 1,
NegativeSpans: []Span{{Offset: -1, Length: 1}},
PositiveSpans: []Span{{Offset: -1, Length: 1}},
NegativeBuckets: []int64{1},
PositiveBuckets: []int64{1},
},
errMsg: `3 observations found in buckets, but the Count field is 2: histogram's observation count should equal the number of observations found in the buckets (in absence of NaN)`,
skipFloat: true,
},
}
for testName, tc := range tests {
t.Run(testName, func(t *testing.T) {
if err := tc.h.Validate(); tc.errMsg != "" {
require.EqualError(t, err, tc.errMsg)
} else {
require.NoError(t, err)
}
if tc.skipFloat {
return
}
fh := tc.h.ToFloat()
if err := fh.Validate(); tc.errMsg != "" {
require.EqualError(t, err, tc.errMsg)
} else {
require.NoError(t, err)
}
})
}
}
func BenchmarkHistogramValidation(b *testing.B) {
histograms := GenerateBigTestHistograms(b.N, 500)
b.ResetTimer()
for _, h := range histograms {
require.NoError(b, h.Validate())
}
}
func TestHistogramReduceResolution(t *testing.T) {
tcs := map[string]struct {
origin *Histogram
target *Histogram
}{
"valid histogram": {
origin: &Histogram{
Schema: 0,
PositiveSpans: []Span{
{Offset: 0, Length: 4},
{Offset: 0, Length: 0},
{Offset: 3, Length: 2},
},
PositiveBuckets: []int64{1, 2, -2, 1, -1, 0},
NegativeSpans: []Span{
{Offset: 0, Length: 4},
{Offset: 0, Length: 0},
{Offset: 3, Length: 2},
},
NegativeBuckets: []int64{1, 2, -2, 1, -1, 0},
},
target: &Histogram{
Schema: -1,
PositiveSpans: []Span{
{Offset: 0, Length: 3},
{Offset: 1, Length: 1},
},
PositiveBuckets: []int64{1, 3, -2, 0},
NegativeSpans: []Span{
{Offset: 0, Length: 3},
{Offset: 1, Length: 1},
},
NegativeBuckets: []int64{1, 3, -2, 0},
},
},
}
for _, tc := range tcs {
target := tc.origin.ReduceResolution(tc.target.Schema)
require.Equal(t, tc.target, target)
}
}

View file

@ -0,0 +1,52 @@
// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package histogram
// GenerateBigTestHistograms generates a slice of numHistograms histograms, each with the given number of buckets.
func GenerateBigTestHistograms(numHistograms, numBuckets int) []*Histogram {
numSpans := numBuckets / 10
bucketsPerSide := numBuckets / 2
spanLength := uint32(bucketsPerSide / numSpans)
// With all bucket deltas equal to 1, the absolute counts per side are 1..bucketsPerSide, so both sides together sum to bucketsPerSide * (1 + bucketsPerSide).
observationCount := bucketsPerSide * (1 + bucketsPerSide)
var histograms []*Histogram
for i := 0; i < numHistograms; i++ {
h := &Histogram{
Count: uint64(i + observationCount),
ZeroCount: uint64(i),
ZeroThreshold: 1e-128,
Sum: 18.4 * float64(i+1),
Schema: 2,
NegativeSpans: make([]Span, numSpans),
PositiveSpans: make([]Span, numSpans),
NegativeBuckets: make([]int64, bucketsPerSide),
PositiveBuckets: make([]int64, bucketsPerSide),
}
for j := 0; j < numSpans; j++ {
s := Span{Offset: 1, Length: spanLength}
h.NegativeSpans[j] = s
h.PositiveSpans[j] = s
}
for j := 0; j < bucketsPerSide; j++ {
h.NegativeBuckets[j] = 1
h.PositiveBuckets[j] = 1
}
histograms = append(histograms, h)
}
return histograms
}
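
As a quick sanity check of the sizing arithmetic above, take numBuckets = 10: numSpans = 1, bucketsPerSide = 5, spanLength = 5, and observationCount = 5 * (1 + 5) = 30. With all deltas equal to 1, each side holds absolute counts 1..5 (sum 15), so both sides together account for the 30 observations, and histogram i then reports Count = i + 30 and ZeroCount = i.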

View file

@ -17,32 +17,12 @@ package labels
import (
"bytes"
"encoding/json"
"strconv"
"strings"
"github.com/cespare/xxhash/v2"
"github.com/prometheus/common/model"
"golang.org/x/exp/slices"
)
// Well-known label names used by Prometheus components.
const (
MetricName = "__name__"
AlertName = "alertname"
BucketLabel = "le"
InstanceName = "instance"
labelSep = '\xfe'
)
var seps = []byte{'\xff'}
// Label is a key/value pair of strings.
type Label struct {
Name, Value string
}
// Labels is a sorted set of labels. Order has to be guaranteed upon
// instantiation.
type Labels []Label
@ -51,23 +31,6 @@ func (ls Labels) Len() int { return len(ls) }
func (ls Labels) Swap(i, j int) { ls[i], ls[j] = ls[j], ls[i] }
func (ls Labels) Less(i, j int) bool { return ls[i].Name < ls[j].Name }
func (ls Labels) String() string {
var b bytes.Buffer
b.WriteByte('{')
for i, l := range ls {
if i > 0 {
b.WriteByte(',')
b.WriteByte(' ')
}
b.WriteString(l.Name)
b.WriteByte('=')
b.WriteString(strconv.Quote(l.Value))
}
b.WriteByte('}')
return b.String()
}
// Bytes returns ls as a byte slice.
// It uses an invalid byte character as a separator and so should not be used for printing.
func (ls Labels) Bytes(buf []byte) []byte {
@ -84,40 +47,6 @@ func (ls Labels) Bytes(buf []byte) []byte {
return b.Bytes()
}
// MarshalJSON implements json.Marshaler.
func (ls Labels) MarshalJSON() ([]byte, error) {
return json.Marshal(ls.Map())
}
// UnmarshalJSON implements json.Unmarshaler.
func (ls *Labels) UnmarshalJSON(b []byte) error {
var m map[string]string
if err := json.Unmarshal(b, &m); err != nil {
return err
}
*ls = FromMap(m)
return nil
}
// MarshalYAML implements yaml.Marshaler.
func (ls Labels) MarshalYAML() (interface{}, error) {
return ls.Map(), nil
}
// UnmarshalYAML implements yaml.Unmarshaler.
func (ls *Labels) UnmarshalYAML(unmarshal func(interface{}) error) error {
var m map[string]string
if err := unmarshal(&m); err != nil {
return err
}
*ls = FromMap(m)
return nil
}
// MatchLabels returns a subset of Labels that matches/does not match with the provided label names based on the 'on' boolean.
// If on is set to true, it returns the subset of labels that match with the provided label names and its inverse when 'on' is set to false.
func (ls Labels) MatchLabels(on bool, names ...string) Labels {
@ -318,19 +247,6 @@ func (ls Labels) WithoutEmpty() Labels {
return ls
}
// IsValid checks if the metric name or label names are valid.
func (ls Labels) IsValid() bool {
for _, l := range ls {
if l.Name == model.MetricNameLabel && !model.IsValidMetricName(model.LabelValue(l.Value)) {
return false
}
if !model.LabelName(l.Name).IsValid() || !model.LabelValue(l.Value).IsValid() {
return false
}
}
return true
}
// Equal returns whether the two label sets are equal.
func Equal(ls, o Labels) bool {
if len(ls) != len(o) {
@ -344,15 +260,6 @@ func Equal(ls, o Labels) bool {
return true
}
// Map returns a string map of the labels.
func (ls Labels) Map() map[string]string {
m := make(map[string]string, len(ls))
for _, l := range ls {
m[l.Name] = l.Value
}
return m
}
// EmptyLabels returns an empty Labels value, for convenience.
func EmptyLabels() Labels {
return Labels{}
@ -368,15 +275,6 @@ func New(ls ...Label) Labels {
return set
}
// FromMap returns new sorted Labels from the given map.
func FromMap(m map[string]string) Labels {
l := make([]Label, 0, len(m))
for k, v := range m {
l = append(l, Label{Name: k, Value: v})
}
return New(l...)
}
// FromStrings creates new labels from pairs of strings.
func FromStrings(ss ...string) Labels {
if len(ss)%2 != 0 {
@ -460,118 +358,6 @@ func (ls Labels) ReleaseStrings(release func(string)) {
}
}
// Builder allows modifying Labels.
type Builder struct {
base Labels
del []string
add []Label
}
// NewBuilder returns a new LabelsBuilder.
func NewBuilder(base Labels) *Builder {
b := &Builder{
del: make([]string, 0, 5),
add: make([]Label, 0, 5),
}
b.Reset(base)
return b
}
// Reset clears all current state for the builder.
func (b *Builder) Reset(base Labels) {
b.base = base
b.del = b.del[:0]
b.add = b.add[:0]
for _, l := range b.base {
if l.Value == "" {
b.del = append(b.del, l.Name)
}
}
}
// Del deletes the label of the given name.
func (b *Builder) Del(ns ...string) *Builder {
for _, n := range ns {
for i, a := range b.add {
if a.Name == n {
b.add = append(b.add[:i], b.add[i+1:]...)
}
}
b.del = append(b.del, n)
}
return b
}
// Keep removes all labels from the base except those with the given names.
func (b *Builder) Keep(ns ...string) *Builder {
Outer:
for _, l := range b.base {
for _, n := range ns {
if l.Name == n {
continue Outer
}
}
b.del = append(b.del, l.Name)
}
return b
}
// Set the name/value pair as a label. A value of "" means delete that label.
func (b *Builder) Set(n, v string) *Builder {
if v == "" {
// Empty labels are the same as missing labels.
return b.Del(n)
}
for i, a := range b.add {
if a.Name == n {
b.add[i].Value = v
return b
}
}
b.add = append(b.add, Label{Name: n, Value: v})
return b
}
func (b *Builder) Get(n string) string {
// Del() removes entries from .add but Set() does not remove from .del, so check .add first.
for _, a := range b.add {
if a.Name == n {
return a.Value
}
}
if slices.Contains(b.del, n) {
return ""
}
return b.base.Get(n)
}
// Range calls f on each label in the Builder.
func (b *Builder) Range(f func(l Label)) {
// Stack-based arrays to avoid heap allocation in most cases.
var addStack [128]Label
var delStack [128]string
// Take a copy of add and del, so they are unaffected by calls to Set() or Del().
origAdd, origDel := append(addStack[:0], b.add...), append(delStack[:0], b.del...)
b.base.Range(func(l Label) {
if !slices.Contains(origDel, l.Name) && !contains(origAdd, l.Name) {
f(l)
}
})
for _, a := range origAdd {
f(a)
}
}
func contains(s []Label, n string) bool {
for _, a := range s {
if a.Name == n {
return true
}
}
return false
}
// Labels returns the labels from the builder.
// If no modifications were made, the original labels are returned.
func (b *Builder) Labels() Labels {
@ -617,6 +403,13 @@ func (b *ScratchBuilder) Add(name, value string) {
b.add = append(b.add, Label{Name: name, Value: value})
}
// Add a name/value pair, using []byte instead of string.
// The '-tags stringlabels' version of this function is unsafe, hence the name.
// This version is safe - it copies the strings immediately - but we keep the same name so everything compiles.
func (b *ScratchBuilder) UnsafeAddBytes(name, value []byte) {
b.add = append(b.add, Label{Name: string(name), Value: string(value)})
}
// Sort the labels added so far by name.
func (b *ScratchBuilder) Sort() {
slices.SortFunc(b.add, func(a, b Label) int { return strings.Compare(a.Name, b.Name) })

View file

@ -0,0 +1,235 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package labels
import (
"bytes"
"encoding/json"
"strconv"
"github.com/prometheus/common/model"
"golang.org/x/exp/slices"
)
const (
MetricName = "__name__"
AlertName = "alertname"
BucketLabel = "le"
InstanceName = "instance"
labelSep = '\xfe'
)
var seps = []byte{'\xff'}
// Label is a key/value pair of strings.
type Label struct {
Name, Value string
}
func (ls Labels) String() string {
var b bytes.Buffer
b.WriteByte('{')
i := 0
ls.Range(func(l Label) {
if i > 0 {
b.WriteByte(',')
b.WriteByte(' ')
}
b.WriteString(l.Name)
b.WriteByte('=')
b.WriteString(strconv.Quote(l.Value))
i++
})
b.WriteByte('}')
return b.String()
}
// MarshalJSON implements json.Marshaler.
func (ls Labels) MarshalJSON() ([]byte, error) {
return json.Marshal(ls.Map())
}
// UnmarshalJSON implements json.Unmarshaler.
func (ls *Labels) UnmarshalJSON(b []byte) error {
var m map[string]string
if err := json.Unmarshal(b, &m); err != nil {
return err
}
*ls = FromMap(m)
return nil
}
// MarshalYAML implements yaml.Marshaler.
func (ls Labels) MarshalYAML() (interface{}, error) {
return ls.Map(), nil
}
// UnmarshalYAML implements yaml.Unmarshaler.
func (ls *Labels) UnmarshalYAML(unmarshal func(interface{}) error) error {
var m map[string]string
if err := unmarshal(&m); err != nil {
return err
}
*ls = FromMap(m)
return nil
}
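For illustration, a minimal standalone sketch of what these map-based marshalers produce, assuming only FromStrings and the MarshalJSON method shown above (encoding/json sorts the map keys):

package example

import (
	"encoding/json"
	"fmt"

	"github.com/prometheus/prometheus/model/labels"
)

func main() {
	ls := labels.FromStrings("job", "api", "env", "prod")
	b, err := json.Marshal(ls) // goes through Labels.MarshalJSON, i.e. ls.Map()
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b)) // {"env":"prod","job":"api"} - keys sorted by encoding/json
}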
// IsValid checks if the metric name or label names are valid.
func (ls Labels) IsValid() bool {
err := ls.Validate(func(l Label) error {
if l.Name == model.MetricNameLabel && !model.IsValidMetricName(model.LabelValue(l.Value)) {
return strconv.ErrSyntax
}
if !model.LabelName(l.Name).IsValid() || !model.LabelValue(l.Value).IsValid() {
return strconv.ErrSyntax
}
return nil
})
return err == nil
}
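A quick sketch of the exported behaviour, assuming only FromStrings and IsValid as defined in this diff:

package example

import (
	"fmt"

	"github.com/prometheus/prometheus/model/labels"
)

func main() {
	ok := labels.FromStrings("__name__", "http_requests_total", "job", "api")
	bad := labels.FromStrings("__name__", "http requests", "job", "api") // space is not allowed in a metric name

	fmt.Println(ok.IsValid())  // true
	fmt.Println(bad.IsValid()) // false
}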
// Map returns a string map of the labels.
func (ls Labels) Map() map[string]string {
m := make(map[string]string)
ls.Range(func(l Label) {
m[l.Name] = l.Value
})
return m
}
// FromMap returns new sorted Labels from the given map.
func FromMap(m map[string]string) Labels {
l := make([]Label, 0, len(m))
for k, v := range m {
l = append(l, Label{Name: k, Value: v})
}
return New(l...)
}
// Builder allows modifying Labels.
type Builder struct {
base Labels
del []string
add []Label
}
// NewBuilder returns a new LabelsBuilder.
func NewBuilder(base Labels) *Builder {
b := &Builder{
del: make([]string, 0, 5),
add: make([]Label, 0, 5),
}
b.Reset(base)
return b
}
// Reset clears all current state for the builder.
func (b *Builder) Reset(base Labels) {
b.base = base
b.del = b.del[:0]
b.add = b.add[:0]
b.base.Range(func(l Label) {
if l.Value == "" {
b.del = append(b.del, l.Name)
}
})
}
// Del deletes the label of the given name.
func (b *Builder) Del(ns ...string) *Builder {
for _, n := range ns {
for i, a := range b.add {
if a.Name == n {
b.add = append(b.add[:i], b.add[i+1:]...)
}
}
b.del = append(b.del, n)
}
return b
}
// Keep removes all labels from the base except those with the given names.
func (b *Builder) Keep(ns ...string) *Builder {
b.base.Range(func(l Label) {
for _, n := range ns {
if l.Name == n {
return
}
}
b.del = append(b.del, l.Name)
})
return b
}
// Set the name/value pair as a label. A value of "" means delete that label.
func (b *Builder) Set(n, v string) *Builder {
if v == "" {
// Empty labels are the same as missing labels.
return b.Del(n)
}
for i, a := range b.add {
if a.Name == n {
b.add[i].Value = v
return b
}
}
b.add = append(b.add, Label{Name: n, Value: v})
return b
}
func (b *Builder) Get(n string) string {
// Del() removes entries from .add but Set() does not remove from .del, so check .add first.
for _, a := range b.add {
if a.Name == n {
return a.Value
}
}
if slices.Contains(b.del, n) {
return ""
}
return b.base.Get(n)
}
// Range calls f on each label in the Builder.
func (b *Builder) Range(f func(l Label)) {
// Stack-based arrays to avoid heap allocation in most cases.
var addStack [128]Label
var delStack [128]string
// Take a copy of add and del, so they are unaffected by calls to Set() or Del().
origAdd, origDel := append(addStack[:0], b.add...), append(delStack[:0], b.del...)
b.base.Range(func(l Label) {
if !slices.Contains(origDel, l.Name) && !contains(origAdd, l.Name) {
f(l)
}
})
for _, a := range origAdd {
f(a)
}
}
func contains(s []Label, n string) bool {
for _, a := range s {
if a.Name == n {
return true
}
}
return false
}
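To round off the Builder section, a minimal usage sketch that relies only on the exported Builder API shown in this diff (NewBuilder, Set, Del, Labels):

package example

import (
	"fmt"

	"github.com/prometheus/prometheus/model/labels"
)

func main() {
	base := labels.FromStrings("__name__", "http_requests_total", "job", "api", "env", "dev")

	// Per the Set docs above, Set("env", "") would delete the label instead.
	ls := labels.NewBuilder(base).
		Set("env", "prod").
		Del("job").
		Labels()

	fmt.Println(ls) // {__name__="http_requests_total", env="prod"}
}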

View file

@ -16,33 +16,14 @@
package labels
import (
"bytes"
"encoding/json"
"reflect"
"strconv"
"strings"
"unsafe"
"github.com/cespare/xxhash/v2"
"github.com/prometheus/common/model"
"golang.org/x/exp/slices"
)
// Well-known label names used by Prometheus components.
const (
MetricName = "__name__"
AlertName = "alertname"
BucketLabel = "le"
InstanceName = "instance"
)
var seps = []byte{'\xff'}
// Label is a key/value pair of strings.
type Label struct {
Name, Value string
}
// Labels is implemented by a single flat string holding name/value pairs.
// Each name and value is preceded by its length in varint encoding.
// Names are in order.
@ -77,26 +58,6 @@ func decodeString(data string, index int) (string, int) {
return data[index : index+size], index + size
}
func (ls Labels) String() string {
var b bytes.Buffer
b.WriteByte('{')
for i := 0; i < len(ls.data); {
if i > 0 {
b.WriteByte(',')
b.WriteByte(' ')
}
var name, value string
name, i = decodeString(ls.data, i)
value, i = decodeString(ls.data, i)
b.WriteString(name)
b.WriteByte('=')
b.WriteString(strconv.Quote(value))
}
b.WriteByte('}')
return b.String()
}
// Bytes returns ls as a byte slice.
// It uses non-printing characters and so should not be used for printing.
func (ls Labels) Bytes(buf []byte) []byte {
@ -109,45 +70,11 @@ func (ls Labels) Bytes(buf []byte) []byte {
return buf
}
// MarshalJSON implements json.Marshaler.
func (ls Labels) MarshalJSON() ([]byte, error) {
return json.Marshal(ls.Map())
}
// UnmarshalJSON implements json.Unmarshaler.
func (ls *Labels) UnmarshalJSON(b []byte) error {
var m map[string]string
if err := json.Unmarshal(b, &m); err != nil {
return err
}
*ls = FromMap(m)
return nil
}
// MarshalYAML implements yaml.Marshaler.
func (ls Labels) MarshalYAML() (interface{}, error) {
return ls.Map(), nil
}
// IsZero implements yaml.IsZeroer - if we don't have this then 'omitempty' fields are always omitted.
func (ls Labels) IsZero() bool {
return len(ls.data) == 0
}
// UnmarshalYAML implements yaml.Unmarshaler.
func (ls *Labels) UnmarshalYAML(unmarshal func(interface{}) error) error {
var m map[string]string
if err := unmarshal(&m); err != nil {
return err
}
*ls = FromMap(m)
return nil
}
// MatchLabels returns a subset of Labels that matches/does not match with the provided label names based on the 'on' boolean.
// If on is set to true, it returns the subset of labels that match with the provided label names and its inverse when 'on' is set to false.
// TODO: This is only used in printing an error message
@ -364,37 +291,11 @@ func (ls Labels) WithoutEmpty() Labels {
return ls
}
// IsValid checks if the metric name or label names are valid.
func (ls Labels) IsValid() bool {
err := ls.Validate(func(l Label) error {
if l.Name == model.MetricNameLabel && !model.IsValidMetricName(model.LabelValue(l.Value)) {
return strconv.ErrSyntax
}
if !model.LabelName(l.Name).IsValid() || !model.LabelValue(l.Value).IsValid() {
return strconv.ErrSyntax
}
return nil
})
return err == nil
}
// Equal returns whether the two label sets are equal.
func Equal(ls, o Labels) bool {
return ls.data == o.data
}
// Map returns a string map of the labels.
func (ls Labels) Map() map[string]string {
m := make(map[string]string, len(ls.data)/10)
for i := 0; i < len(ls.data); {
var lName, lValue string
lName, i = decodeString(ls.data, i)
lValue, i = decodeString(ls.data, i)
m[lName] = lValue
}
return m
}
// EmptyLabels returns an empty Labels value, for convenience.
func EmptyLabels() Labels {
return Labels{}
@ -420,15 +321,6 @@ func New(ls ...Label) Labels {
return Labels{data: yoloString(buf)}
}
// FromMap returns new sorted Labels from the given map.
func FromMap(m map[string]string) Labels {
l := make([]Label, 0, len(m))
for k, v := range m {
l = append(l, Label{Name: k, Value: v})
}
return New(l...)
}
// FromStrings creates new labels from pairs of strings.
func FromStrings(ss ...string) Labels {
if len(ss)%2 != 0 {
@ -547,124 +439,6 @@ func (ls Labels) ReleaseStrings(release func(string)) {
release(ls.data)
}
// Builder allows modifying Labels.
type Builder struct {
base Labels
del []string
add []Label
}
// NewBuilder returns a new LabelsBuilder.
func NewBuilder(base Labels) *Builder {
b := &Builder{
del: make([]string, 0, 5),
add: make([]Label, 0, 5),
}
b.Reset(base)
return b
}
// Reset clears all current state for the builder.
func (b *Builder) Reset(base Labels) {
b.base = base
b.del = b.del[:0]
b.add = b.add[:0]
for i := 0; i < len(base.data); {
var lName, lValue string
lName, i = decodeString(base.data, i)
lValue, i = decodeString(base.data, i)
if lValue == "" {
b.del = append(b.del, lName)
}
}
}
// Del deletes the label of the given name.
func (b *Builder) Del(ns ...string) *Builder {
for _, n := range ns {
for i, a := range b.add {
if a.Name == n {
b.add = append(b.add[:i], b.add[i+1:]...)
}
}
b.del = append(b.del, n)
}
return b
}
// Keep removes all labels from the base except those with the given names.
func (b *Builder) Keep(ns ...string) *Builder {
Outer:
for i := 0; i < len(b.base.data); {
var lName string
lName, i = decodeString(b.base.data, i)
_, i = decodeString(b.base.data, i)
for _, n := range ns {
if lName == n {
continue Outer
}
}
b.del = append(b.del, lName)
}
return b
}
// Set the name/value pair as a label. A value of "" means delete that label.
func (b *Builder) Set(n, v string) *Builder {
if v == "" {
// Empty labels are the same as missing labels.
return b.Del(n)
}
for i, a := range b.add {
if a.Name == n {
b.add[i].Value = v
return b
}
}
b.add = append(b.add, Label{Name: n, Value: v})
return b
}
func (b *Builder) Get(n string) string {
// Del() removes entries from .add but Set() does not remove from .del, so check .add first.
for _, a := range b.add {
if a.Name == n {
return a.Value
}
}
if slices.Contains(b.del, n) {
return ""
}
return b.base.Get(n)
}
// Range calls f on each label in the Builder.
func (b *Builder) Range(f func(l Label)) {
// Stack-based arrays to avoid heap allocation in most cases.
var addStack [128]Label
var delStack [128]string
// Take a copy of add and del, so they are unaffected by calls to Set() or Del().
origAdd, origDel := append(addStack[:0], b.add...), append(delStack[:0], b.del...)
b.base.Range(func(l Label) {
if !slices.Contains(origDel, l.Name) && !contains(origAdd, l.Name) {
f(l)
}
})
for _, a := range origAdd {
f(a)
}
}
func contains(s []Label, n string) bool {
for _, a := range s {
if a.Name == n {
return true
}
}
return false
}
// Labels returns the labels from the builder.
// If no modifications were made, the original labels are returned.
func (b *Builder) Labels() Labels {
@ -829,6 +603,12 @@ func (b *ScratchBuilder) Add(name, value string) {
b.add = append(b.add, Label{Name: name, Value: value})
}
// Add a name/value pair, using []byte instead of string to reduce memory allocations.
// The values must remain live until Labels() is called.
func (b *ScratchBuilder) UnsafeAddBytes(name, value []byte) {
b.add = append(b.add, Label{Name: yoloString(name), Value: yoloString(value)})
}
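To make the liveness contract above concrete, a short sketch of a caller that feeds byte buffers in; it assumes the usual ScratchBuilder flow (NewScratchBuilder, Sort, Labels) shown elsewhere in this diff:

package example

import "github.com/prometheus/prometheus/model/labels"

// buildFromBytes aliases name and value via UnsafeAddBytes (no copy under
// -tags stringlabels), so the caller may only overwrite or reuse the buffers
// after Labels() has produced the final Labels value.
func buildFromBytes(name, value []byte) labels.Labels {
	b := labels.NewScratchBuilder(1)
	b.UnsafeAddBytes(name, value)
	b.Sort()
	return b.Labels()
}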
// Sort the labels added so far by name.
func (b *ScratchBuilder) Sort() {
slices.SortFunc(b.add, func(a, b Label) int { return strings.Compare(a.Name, b.Name) })

View file

@ -16,6 +16,7 @@ package textparse
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"math"
@ -24,7 +25,6 @@ import (
"github.com/gogo/protobuf/proto"
"github.com/gogo/protobuf/types"
"github.com/pkg/errors"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/model/exemplar"
@ -317,22 +317,28 @@ func (p *ProtobufParser) Exemplar(ex *exemplar.Exemplar) bool {
exProto = m.GetCounter().GetExemplar()
case dto.MetricType_HISTOGRAM, dto.MetricType_GAUGE_HISTOGRAM:
bb := m.GetHistogram().GetBucket()
isClassic := p.state == EntrySeries
if p.fieldPos < 0 {
if p.state == EntrySeries {
if isClassic {
return false // At _count or _sum.
}
p.fieldPos = 0 // Start at 1st bucket for native histograms.
}
for p.fieldPos < len(bb) {
exProto = bb[p.fieldPos].GetExemplar()
if p.state == EntrySeries {
if isClassic {
break
}
p.fieldPos++
if exProto != nil {
break
// We deliberately drop exemplars with no timestamp only for native histograms.
if exProto != nil && (isClassic || exProto.GetTimestamp() != nil) {
break // Found a classic histogram exemplar or a native histogram exemplar with a timestamp.
}
}
// If the last exemplar for native histograms has no timestamp, ignore it.
if !isClassic && exProto.GetTimestamp() == nil {
return false
}
default:
return false
}
@ -396,10 +402,10 @@ func (p *ProtobufParser) Next() (Entry, error) {
// into metricBytes and validate only name, help, and type for now.
name := p.mf.GetName()
if !model.IsValidMetricName(model.LabelValue(name)) {
return EntryInvalid, errors.Errorf("invalid metric name: %s", name)
return EntryInvalid, fmt.Errorf("invalid metric name: %s", name)
}
if help := p.mf.GetHelp(); !utf8.ValidString(help) {
return EntryInvalid, errors.Errorf("invalid help for metric %q: %s", name, help)
return EntryInvalid, fmt.Errorf("invalid help for metric %q: %s", name, help)
}
switch p.mf.GetType() {
case dto.MetricType_COUNTER,
@ -410,7 +416,7 @@ func (p *ProtobufParser) Next() (Entry, error) {
dto.MetricType_UNTYPED:
// All good.
default:
return EntryInvalid, errors.Errorf("unknown metric type for metric %q: %s", name, p.mf.GetType())
return EntryInvalid, fmt.Errorf("unknown metric type for metric %q: %s", name, p.mf.GetType())
}
p.metricBytes.Reset()
p.metricBytes.WriteString(name)
@ -463,7 +469,7 @@ func (p *ProtobufParser) Next() (Entry, error) {
return EntryInvalid, err
}
default:
return EntryInvalid, errors.Errorf("invalid protobuf parsing state: %d", p.state)
return EntryInvalid, fmt.Errorf("invalid protobuf parsing state: %d", p.state)
}
return p.state, nil
}
@ -476,13 +482,13 @@ func (p *ProtobufParser) updateMetricBytes() error {
b.WriteByte(model.SeparatorByte)
n := lp.GetName()
if !model.LabelName(n).IsValid() {
return errors.Errorf("invalid label name: %s", n)
return fmt.Errorf("invalid label name: %s", n)
}
b.WriteString(n)
b.WriteByte(model.SeparatorByte)
v := lp.GetValue()
if !utf8.ValidString(v) {
return errors.Errorf("invalid label value: %s", v)
return fmt.Errorf("invalid label value: %s", v)
}
b.WriteString(v)
}
@ -557,7 +563,7 @@ func readDelimited(b []byte, mf *dto.MetricFamily) (n int, err error) {
}
totalLength := varIntLength + int(messageLength)
if totalLength > len(b) {
return 0, errors.Errorf("protobufparse: insufficient length of buffer, expected at least %d bytes, got %d bytes", totalLength, len(b))
return 0, fmt.Errorf("protobufparse: insufficient length of buffer, expected at least %d bytes, got %d bytes", totalLength, len(b))
}
mf.Reset()
return totalLength, mf.Unmarshal(b[varIntLength:totalLength])

View file

@ -729,7 +729,6 @@ func TestProtobufParse(t *testing.T) {
),
e: []exemplar.Exemplar{
{Labels: labels.FromStrings("dummyID", "59727"), Value: -0.00039, HasTs: true, Ts: 1625851155146},
{Labels: labels.FromStrings("dummyID", "5617"), Value: -0.00029, HasTs: false},
},
},
{
@ -766,7 +765,6 @@ func TestProtobufParse(t *testing.T) {
),
e: []exemplar.Exemplar{
{Labels: labels.FromStrings("dummyID", "59727"), Value: -0.00039, HasTs: true, Ts: 1625851155146},
{Labels: labels.FromStrings("dummyID", "5617"), Value: -0.00029, HasTs: false},
},
},
{
@ -802,7 +800,6 @@ func TestProtobufParse(t *testing.T) {
),
e: []exemplar.Exemplar{
{Labels: labels.FromStrings("dummyID", "59727"), Value: -0.00039, HasTs: true, Ts: 1625851155146},
{Labels: labels.FromStrings("dummyID", "5617"), Value: -0.00029, HasTs: false},
},
},
{
@ -839,7 +836,6 @@ func TestProtobufParse(t *testing.T) {
),
e: []exemplar.Exemplar{
{Labels: labels.FromStrings("dummyID", "59727"), Value: -0.00039, HasTs: true, Ts: 1625851155146},
{Labels: labels.FromStrings("dummyID", "5617"), Value: -0.00029, HasTs: false},
},
},
{
@ -1233,7 +1229,6 @@ func TestProtobufParse(t *testing.T) {
),
e: []exemplar.Exemplar{
{Labels: labels.FromStrings("dummyID", "59727"), Value: -0.00039, HasTs: true, Ts: 1625851155146},
{Labels: labels.FromStrings("dummyID", "5617"), Value: -0.00029, HasTs: false},
},
},
{ // 12
@ -1328,7 +1323,6 @@ func TestProtobufParse(t *testing.T) {
),
e: []exemplar.Exemplar{
{Labels: labels.FromStrings("dummyID", "59727"), Value: -0.00039, HasTs: true, Ts: 1625851155146},
{Labels: labels.FromStrings("dummyID", "5617"), Value: -0.00029, HasTs: false},
},
},
{ // 21
@ -1422,7 +1416,6 @@ func TestProtobufParse(t *testing.T) {
),
e: []exemplar.Exemplar{
{Labels: labels.FromStrings("dummyID", "59727"), Value: -0.00039, HasTs: true, Ts: 1625851155146},
{Labels: labels.FromStrings("dummyID", "5617"), Value: -0.00029, HasTs: false},
},
},
{ // 30
@ -1517,7 +1510,6 @@ func TestProtobufParse(t *testing.T) {
),
e: []exemplar.Exemplar{
{Labels: labels.FromStrings("dummyID", "59727"), Value: -0.00039, HasTs: true, Ts: 1625851155146},
{Labels: labels.FromStrings("dummyID", "5617"), Value: -0.00029, HasTs: false},
},
},
{ // 39

View file

@ -18,5 +18,6 @@
- github.com/prometheus/prometheus/discovery/scaleway
- github.com/prometheus/prometheus/discovery/triton
- github.com/prometheus/prometheus/discovery/uyuni
- github.com/prometheus/prometheus/discovery/vultr
- github.com/prometheus/prometheus/discovery/xds
- github.com/prometheus/prometheus/discovery/zookeeper

View file

@ -21,9 +21,11 @@ import (
"testing"
"time"
"github.com/prometheus/prometheus/model/histogram"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/promql/parser"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/tsdb/tsdbutil"
"github.com/prometheus/prometheus/util/teststorage"
)
@ -269,6 +271,99 @@ func BenchmarkRangeQuery(b *testing.B) {
}
}
func BenchmarkNativeHistograms(b *testing.B) {
testStorage := teststorage.New(b)
defer testStorage.Close()
app := testStorage.Appender(context.TODO())
if err := generateNativeHistogramSeries(app, 3000); err != nil {
b.Fatal(err)
}
if err := app.Commit(); err != nil {
b.Fatal(err)
}
start := time.Unix(0, 0)
end := start.Add(2 * time.Hour)
step := time.Second * 30
cases := []struct {
name string
query string
}{
{
name: "sum",
query: "sum(native_histogram_series)",
},
{
name: "sum rate",
query: "sum(rate(native_histogram_series[1m]))",
},
}
opts := EngineOpts{
Logger: nil,
Reg: nil,
MaxSamples: 50000000,
Timeout: 100 * time.Second,
EnableAtModifier: true,
EnableNegativeOffset: true,
}
b.ResetTimer()
b.ReportAllocs()
for _, tc := range cases {
b.Run(tc.name, func(b *testing.B) {
ng := NewEngine(opts)
for i := 0; i < b.N; i++ {
qry, err := ng.NewRangeQuery(context.Background(), testStorage, nil, tc.query, start, end, step)
if err != nil {
b.Fatal(err)
}
if result := qry.Exec(context.Background()); result.Err != nil {
b.Fatal(result.Err)
}
}
})
}
}
func generateNativeHistogramSeries(app storage.Appender, numSeries int) error {
commonLabels := []string{labels.MetricName, "native_histogram_series", "foo", "bar"}
series := make([][]*histogram.Histogram, numSeries)
for i := range series {
series[i] = tsdbutil.GenerateTestHistograms(2000)
}
higherSchemaHist := &histogram.Histogram{
Schema: 3,
PositiveSpans: []histogram.Span{
{Offset: -5, Length: 2}, // -5 -4
{Offset: 2, Length: 3}, // -1 0 1
{Offset: 2, Length: 2}, // 4 5
},
PositiveBuckets: []int64{1, 2, -2, 1, -1, 0, 3},
Count: 13,
}
for sid, histograms := range series {
seriesLabels := labels.FromStrings(append(commonLabels, "h", strconv.Itoa(sid))...)
for i := range histograms {
ts := time.Unix(int64(i*15), 0).UnixMilli()
if i == 0 {
// Inject a histogram with a higher schema.
if _, err := app.AppendHistogram(0, seriesLabels, ts, higherSchemaHist, nil); err != nil {
return err
}
}
if _, err := app.AppendHistogram(0, seriesLabels, ts, histograms[i], nil); err != nil {
return err
}
}
}
return nil
}
func BenchmarkParser(b *testing.B) {
cases := []string{
"a",

View file

@ -3420,7 +3420,7 @@ func TestNativeHistogram_HistogramStdDevVar(t *testing.T) {
{
name: "-50, -8, 0, 3, 8, 9, 100, +Inf",
h: &histogram.Histogram{
Count: 8,
Count: 7,
ZeroCount: 1,
Sum: math.Inf(1),
Schema: 3,
@ -3720,7 +3720,7 @@ func TestNativeHistogram_HistogramQuantile(t *testing.T) {
require.Len(t, vector, 1)
require.Nil(t, vector[0].H)
require.True(t, almostEqual(sc.value, vector[0].F))
require.True(t, almostEqual(sc.value, vector[0].F, defaultEpsilon))
})
}
idx++

View file

@ -1163,7 +1163,7 @@ func funcHistogramQuantile(vals []parser.Value, args parser.Expressions, enh *Ev
for _, mb := range enh.signatureToMetricWithBuckets {
if len(mb.buckets) > 0 {
res, forcedMonotonicity := bucketQuantile(q, mb.buckets)
res, forcedMonotonicity, _ := bucketQuantile(q, mb.buckets)
enh.Out = append(enh.Out, Sample{
Metric: mb.metric,
F: res,

View file

@ -16,12 +16,16 @@ package parser
// Function represents a function of the expression language and is
// used by function nodes.
type Function struct {
Name string
ArgTypes []ValueType
Variadic int
ReturnType ValueType
Name string
ArgTypes []ValueType
Variadic int
ReturnType ValueType
Experimental bool
}
// EnableExperimentalFunctions controls whether experimental functions are enabled.
var EnableExperimentalFunctions bool
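A hedged sketch of how an embedding application might flip this switch around parsing; ParseExpr is the standard parser entry point, and which functions count as experimental depends on the Experimental field in the Functions table below:

package example

import (
	"fmt"

	"github.com/prometheus/prometheus/promql/parser"
)

// parseWithExperimental enables experimental PromQL functions for one parse.
// While the flag is false, a call to any function whose entry sets
// Experimental: true is rejected with `function "<name>" is not enabled`.
// Note that toggling a package-level flag like this is not safe if other
// goroutines parse concurrently.
func parseWithExperimental(q string) (parser.Expr, error) {
	parser.EnableExperimentalFunctions = true
	defer func() { parser.EnableExperimentalFunctions = false }()

	expr, err := parser.ParseExpr(q)
	if err != nil {
		return nil, fmt.Errorf("parse %q: %w", q, err)
	}
	return expr, nil
}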
// Functions is a list of all functions supported by PromQL, including their types.
var Functions = map[string]*Function{
"abs": {

View file

@ -22,7 +22,7 @@ import (
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/model/value"
"github.com/prometheus/prometheus/model/histogram"
"github.com/prometheus/prometheus/promql/parser/posrange"
"github.com/prometheus/prometheus/promql/parser/posrange"
)
%}
@ -369,6 +369,9 @@ function_call : IDENTIFIER function_call_body
if !exist{
yylex.(*parser).addParseErrf($1.PositionRange(),"unknown function with name %q", $1.Val)
}
if fn != nil && fn.Experimental && !EnableExperimentalFunctions {
yylex.(*parser).addParseErrf($1.PositionRange(),"function %q is not enabled", $1.Val)
}
$$ = &Call{
Func: fn,
Args: $2.(Expressions),

View file

@ -230,7 +230,7 @@ const yyEofCode = 1
const yyErrCode = 2
const yyInitialStackSize = 16
//line promql/parser/generated_parser.y:916
//line promql/parser/generated_parser.y:919
//line yacctab:1
var yyExca = [...]int16{
@ -1277,6 +1277,9 @@ yydefault:
if !exist {
yylex.(*parser).addParseErrf(yyDollar[1].item.PositionRange(), "unknown function with name %q", yyDollar[1].item.Val)
}
if fn != nil && fn.Experimental && !EnableExperimentalFunctions {
yylex.(*parser).addParseErrf(yyDollar[1].item.PositionRange(), "function %q is not enabled", yyDollar[1].item.Val)
}
yyVAL.node = &Call{
Func: fn,
Args: yyDollar[2].node.(Expressions),
@ -1288,86 +1291,86 @@ yydefault:
}
case 61:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:384
//line promql/parser/generated_parser.y:387
{
yyVAL.node = yyDollar[2].node
}
case 62:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:386
//line promql/parser/generated_parser.y:389
{
yyVAL.node = Expressions{}
}
case 63:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:390
//line promql/parser/generated_parser.y:393
{
yyVAL.node = append(yyDollar[1].node.(Expressions), yyDollar[3].node.(Expr))
}
case 64:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:392
//line promql/parser/generated_parser.y:395
{
yyVAL.node = Expressions{yyDollar[1].node.(Expr)}
}
case 65:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:394
//line promql/parser/generated_parser.y:397
{
yylex.(*parser).addParseErrf(yyDollar[2].item.PositionRange(), "trailing commas not allowed in function call args")
yyVAL.node = yyDollar[1].node
}
case 66:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:405
//line promql/parser/generated_parser.y:408
{
yyVAL.node = &ParenExpr{Expr: yyDollar[2].node.(Expr), PosRange: mergeRanges(&yyDollar[1].item, &yyDollar[3].item)}
}
case 67:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:413
//line promql/parser/generated_parser.y:416
{
yylex.(*parser).addOffset(yyDollar[1].node, yyDollar[3].duration)
yyVAL.node = yyDollar[1].node
}
case 68:
yyDollar = yyS[yypt-4 : yypt+1]
//line promql/parser/generated_parser.y:418
//line promql/parser/generated_parser.y:421
{
yylex.(*parser).addOffset(yyDollar[1].node, -yyDollar[4].duration)
yyVAL.node = yyDollar[1].node
}
case 69:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:423
//line promql/parser/generated_parser.y:426
{
yylex.(*parser).unexpected("offset", "duration")
yyVAL.node = yyDollar[1].node
}
case 70:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:430
//line promql/parser/generated_parser.y:433
{
yylex.(*parser).setTimestamp(yyDollar[1].node, yyDollar[3].float)
yyVAL.node = yyDollar[1].node
}
case 71:
yyDollar = yyS[yypt-5 : yypt+1]
//line promql/parser/generated_parser.y:435
//line promql/parser/generated_parser.y:438
{
yylex.(*parser).setAtModifierPreprocessor(yyDollar[1].node, yyDollar[3].item)
yyVAL.node = yyDollar[1].node
}
case 72:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:440
//line promql/parser/generated_parser.y:443
{
yylex.(*parser).unexpected("@", "timestamp")
yyVAL.node = yyDollar[1].node
}
case 75:
yyDollar = yyS[yypt-4 : yypt+1]
//line promql/parser/generated_parser.y:450
//line promql/parser/generated_parser.y:453
{
var errMsg string
vs, ok := yyDollar[1].node.(*VectorSelector)
@ -1392,7 +1395,7 @@ yydefault:
}
case 76:
yyDollar = yyS[yypt-6 : yypt+1]
//line promql/parser/generated_parser.y:475
//line promql/parser/generated_parser.y:478
{
yyVAL.node = &SubqueryExpr{
Expr: yyDollar[1].node.(Expr),
@ -1404,35 +1407,35 @@ yydefault:
}
case 77:
yyDollar = yyS[yypt-6 : yypt+1]
//line promql/parser/generated_parser.y:485
//line promql/parser/generated_parser.y:488
{
yylex.(*parser).unexpected("subquery selector", "\"]\"")
yyVAL.node = yyDollar[1].node
}
case 78:
yyDollar = yyS[yypt-5 : yypt+1]
//line promql/parser/generated_parser.y:487
//line promql/parser/generated_parser.y:490
{
yylex.(*parser).unexpected("subquery selector", "duration or \"]\"")
yyVAL.node = yyDollar[1].node
}
case 79:
yyDollar = yyS[yypt-4 : yypt+1]
//line promql/parser/generated_parser.y:489
//line promql/parser/generated_parser.y:492
{
yylex.(*parser).unexpected("subquery or range", "\":\" or \"]\"")
yyVAL.node = yyDollar[1].node
}
case 80:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:491
//line promql/parser/generated_parser.y:494
{
yylex.(*parser).unexpected("subquery selector", "duration")
yyVAL.node = yyDollar[1].node
}
case 81:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:501
//line promql/parser/generated_parser.y:504
{
if nl, ok := yyDollar[2].node.(*NumberLiteral); ok {
if yyDollar[1].item.Typ == SUB {
@ -1446,7 +1449,7 @@ yydefault:
}
case 82:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:519
//line promql/parser/generated_parser.y:522
{
vs := yyDollar[2].node.(*VectorSelector)
vs.PosRange = mergeRanges(&yyDollar[1].item, vs)
@ -1456,7 +1459,7 @@ yydefault:
}
case 83:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:527
//line promql/parser/generated_parser.y:530
{
vs := &VectorSelector{
Name: yyDollar[1].item.Val,
@ -1468,7 +1471,7 @@ yydefault:
}
case 84:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:537
//line promql/parser/generated_parser.y:540
{
vs := yyDollar[1].node.(*VectorSelector)
yylex.(*parser).assembleVectorSelector(vs)
@ -1476,7 +1479,7 @@ yydefault:
}
case 85:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:545
//line promql/parser/generated_parser.y:548
{
yyVAL.node = &VectorSelector{
LabelMatchers: yyDollar[2].matchers,
@ -1485,7 +1488,7 @@ yydefault:
}
case 86:
yyDollar = yyS[yypt-4 : yypt+1]
//line promql/parser/generated_parser.y:552
//line promql/parser/generated_parser.y:555
{
yyVAL.node = &VectorSelector{
LabelMatchers: yyDollar[2].matchers,
@ -1494,7 +1497,7 @@ yydefault:
}
case 87:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:559
//line promql/parser/generated_parser.y:562
{
yyVAL.node = &VectorSelector{
LabelMatchers: []*labels.Matcher{},
@ -1503,7 +1506,7 @@ yydefault:
}
case 88:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:568
//line promql/parser/generated_parser.y:571
{
if yyDollar[1].matchers != nil {
yyVAL.matchers = append(yyDollar[1].matchers, yyDollar[3].matcher)
@ -1513,47 +1516,47 @@ yydefault:
}
case 89:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:576
//line promql/parser/generated_parser.y:579
{
yyVAL.matchers = []*labels.Matcher{yyDollar[1].matcher}
}
case 90:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:578
//line promql/parser/generated_parser.y:581
{
yylex.(*parser).unexpected("label matching", "\",\" or \"}\"")
yyVAL.matchers = yyDollar[1].matchers
}
case 91:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:582
//line promql/parser/generated_parser.y:585
{
yyVAL.matcher = yylex.(*parser).newLabelMatcher(yyDollar[1].item, yyDollar[2].item, yyDollar[3].item)
}
case 92:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:584
//line promql/parser/generated_parser.y:587
{
yylex.(*parser).unexpected("label matching", "string")
yyVAL.matcher = nil
}
case 93:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:586
//line promql/parser/generated_parser.y:589
{
yylex.(*parser).unexpected("label matching", "label matching operator")
yyVAL.matcher = nil
}
case 94:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:588
//line promql/parser/generated_parser.y:591
{
yylex.(*parser).unexpected("label matching", "identifier or \"}\"")
yyVAL.matcher = nil
}
case 95:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:596
//line promql/parser/generated_parser.y:599
{
b := labels.NewBuilder(yyDollar[2].labels)
b.Set(labels.MetricName, yyDollar[1].item.Val)
@ -1561,83 +1564,83 @@ yydefault:
}
case 96:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:598
//line promql/parser/generated_parser.y:601
{
yyVAL.labels = yyDollar[1].labels
}
case 119:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:605
//line promql/parser/generated_parser.y:608
{
yyVAL.labels = labels.New(yyDollar[2].lblList...)
}
case 120:
yyDollar = yyS[yypt-4 : yypt+1]
//line promql/parser/generated_parser.y:607
//line promql/parser/generated_parser.y:610
{
yyVAL.labels = labels.New(yyDollar[2].lblList...)
}
case 121:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:609
//line promql/parser/generated_parser.y:612
{
yyVAL.labels = labels.New()
}
case 122:
yyDollar = yyS[yypt-0 : yypt+1]
//line promql/parser/generated_parser.y:611
//line promql/parser/generated_parser.y:614
{
yyVAL.labels = labels.New()
}
case 123:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:615
//line promql/parser/generated_parser.y:618
{
yyVAL.lblList = append(yyDollar[1].lblList, yyDollar[3].label)
}
case 124:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:617
//line promql/parser/generated_parser.y:620
{
yyVAL.lblList = []labels.Label{yyDollar[1].label}
}
case 125:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:619
//line promql/parser/generated_parser.y:622
{
yylex.(*parser).unexpected("label set", "\",\" or \"}\"")
yyVAL.lblList = yyDollar[1].lblList
}
case 126:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:624
//line promql/parser/generated_parser.y:627
{
yyVAL.label = labels.Label{Name: yyDollar[1].item.Val, Value: yylex.(*parser).unquoteString(yyDollar[3].item.Val)}
}
case 127:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:626
//line promql/parser/generated_parser.y:629
{
yylex.(*parser).unexpected("label set", "string")
yyVAL.label = labels.Label{}
}
case 128:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:628
//line promql/parser/generated_parser.y:631
{
yylex.(*parser).unexpected("label set", "\"=\"")
yyVAL.label = labels.Label{}
}
case 129:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:630
//line promql/parser/generated_parser.y:633
{
yylex.(*parser).unexpected("label set", "identifier or \"}\"")
yyVAL.label = labels.Label{}
}
case 130:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:641
//line promql/parser/generated_parser.y:644
{
yylex.(*parser).generatedParserResult = &seriesDescription{
labels: yyDollar[1].labels,
@ -1646,38 +1649,38 @@ yydefault:
}
case 131:
yyDollar = yyS[yypt-0 : yypt+1]
//line promql/parser/generated_parser.y:650
//line promql/parser/generated_parser.y:653
{
yyVAL.series = []SequenceValue{}
}
case 132:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:652
//line promql/parser/generated_parser.y:655
{
yyVAL.series = append(yyDollar[1].series, yyDollar[3].series...)
}
case 133:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:654
//line promql/parser/generated_parser.y:657
{
yyVAL.series = yyDollar[1].series
}
case 134:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:656
//line promql/parser/generated_parser.y:659
{
yylex.(*parser).unexpected("series values", "")
yyVAL.series = nil
}
case 135:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:660
//line promql/parser/generated_parser.y:663
{
yyVAL.series = []SequenceValue{{Omitted: true}}
}
case 136:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:662
//line promql/parser/generated_parser.y:665
{
yyVAL.series = []SequenceValue{}
for i := uint64(0); i < yyDollar[3].uint; i++ {
@ -1686,13 +1689,13 @@ yydefault:
}
case 137:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:669
//line promql/parser/generated_parser.y:672
{
yyVAL.series = []SequenceValue{{Value: yyDollar[1].float}}
}
case 138:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:671
//line promql/parser/generated_parser.y:674
{
yyVAL.series = []SequenceValue{}
// Add an additional value for time 0, which we ignore in tests.
@ -1702,7 +1705,7 @@ yydefault:
}
case 139:
yyDollar = yyS[yypt-4 : yypt+1]
//line promql/parser/generated_parser.y:679
//line promql/parser/generated_parser.y:682
{
yyVAL.series = []SequenceValue{}
// Add an additional value for time 0, which we ignore in tests.
@ -1713,13 +1716,13 @@ yydefault:
}
case 140:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:689
//line promql/parser/generated_parser.y:692
{
yyVAL.series = []SequenceValue{{Histogram: yyDollar[1].histogram}}
}
case 141:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:693
//line promql/parser/generated_parser.y:696
{
yyVAL.series = []SequenceValue{}
// Add an additional value for time 0, which we ignore in tests.
@ -1730,7 +1733,7 @@ yydefault:
}
case 142:
yyDollar = yyS[yypt-5 : yypt+1]
//line promql/parser/generated_parser.y:702
//line promql/parser/generated_parser.y:705
{
val, err := yylex.(*parser).histogramsIncreaseSeries(yyDollar[1].histogram, yyDollar[3].histogram, yyDollar[5].uint)
if err != nil {
@ -1740,7 +1743,7 @@ yydefault:
}
case 143:
yyDollar = yyS[yypt-5 : yypt+1]
//line promql/parser/generated_parser.y:710
//line promql/parser/generated_parser.y:713
{
val, err := yylex.(*parser).histogramsDecreaseSeries(yyDollar[1].histogram, yyDollar[3].histogram, yyDollar[5].uint)
if err != nil {
@ -1750,7 +1753,7 @@ yydefault:
}
case 144:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:720
//line promql/parser/generated_parser.y:723
{
if yyDollar[1].item.Val != "stale" {
yylex.(*parser).unexpected("series values", "number or \"stale\"")
@ -1759,138 +1762,138 @@ yydefault:
}
case 147:
yyDollar = yyS[yypt-4 : yypt+1]
//line promql/parser/generated_parser.y:732
//line promql/parser/generated_parser.y:735
{
yyVAL.histogram = yylex.(*parser).buildHistogramFromMap(&yyDollar[2].descriptors)
}
case 148:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:736
//line promql/parser/generated_parser.y:739
{
yyVAL.histogram = yylex.(*parser).buildHistogramFromMap(&yyDollar[2].descriptors)
}
case 149:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:740
//line promql/parser/generated_parser.y:743
{
m := yylex.(*parser).newMap()
yyVAL.histogram = yylex.(*parser).buildHistogramFromMap(&m)
}
case 150:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:745
//line promql/parser/generated_parser.y:748
{
m := yylex.(*parser).newMap()
yyVAL.histogram = yylex.(*parser).buildHistogramFromMap(&m)
}
case 151:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:753
//line promql/parser/generated_parser.y:756
{
yyVAL.descriptors = *(yylex.(*parser).mergeMaps(&yyDollar[1].descriptors, &yyDollar[3].descriptors))
}
case 152:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:757
//line promql/parser/generated_parser.y:760
{
yyVAL.descriptors = yyDollar[1].descriptors
}
case 153:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:760
//line promql/parser/generated_parser.y:763
{
yylex.(*parser).unexpected("histogram description", "histogram description key, e.g. buckets:[5 10 7]")
}
case 154:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:767
//line promql/parser/generated_parser.y:770
{
yyVAL.descriptors = yylex.(*parser).newMap()
yyVAL.descriptors["schema"] = yyDollar[3].int
}
case 155:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:772
//line promql/parser/generated_parser.y:775
{
yyVAL.descriptors = yylex.(*parser).newMap()
yyVAL.descriptors["sum"] = yyDollar[3].float
}
case 156:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:777
//line promql/parser/generated_parser.y:780
{
yyVAL.descriptors = yylex.(*parser).newMap()
yyVAL.descriptors["count"] = yyDollar[3].float
}
case 157:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:782
//line promql/parser/generated_parser.y:785
{
yyVAL.descriptors = yylex.(*parser).newMap()
yyVAL.descriptors["z_bucket"] = yyDollar[3].float
}
case 158:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:787
//line promql/parser/generated_parser.y:790
{
yyVAL.descriptors = yylex.(*parser).newMap()
yyVAL.descriptors["z_bucket_w"] = yyDollar[3].float
}
case 159:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:792
//line promql/parser/generated_parser.y:795
{
yyVAL.descriptors = yylex.(*parser).newMap()
yyVAL.descriptors["buckets"] = yyDollar[3].bucket_set
}
case 160:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:797
//line promql/parser/generated_parser.y:800
{
yyVAL.descriptors = yylex.(*parser).newMap()
yyVAL.descriptors["offset"] = yyDollar[3].int
}
case 161:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:802
//line promql/parser/generated_parser.y:805
{
yyVAL.descriptors = yylex.(*parser).newMap()
yyVAL.descriptors["n_buckets"] = yyDollar[3].bucket_set
}
case 162:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:807
//line promql/parser/generated_parser.y:810
{
yyVAL.descriptors = yylex.(*parser).newMap()
yyVAL.descriptors["n_offset"] = yyDollar[3].int
}
case 163:
yyDollar = yyS[yypt-4 : yypt+1]
//line promql/parser/generated_parser.y:814
//line promql/parser/generated_parser.y:817
{
yyVAL.bucket_set = yyDollar[2].bucket_set
}
case 164:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:818
//line promql/parser/generated_parser.y:821
{
yyVAL.bucket_set = yyDollar[2].bucket_set
}
case 165:
yyDollar = yyS[yypt-3 : yypt+1]
//line promql/parser/generated_parser.y:824
//line promql/parser/generated_parser.y:827
{
yyVAL.bucket_set = append(yyDollar[1].bucket_set, yyDollar[3].float)
}
case 166:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:828
//line promql/parser/generated_parser.y:831
{
yyVAL.bucket_set = []float64{yyDollar[1].float}
}
case 213:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:853
//line promql/parser/generated_parser.y:856
{
yyVAL.node = &NumberLiteral{
Val: yylex.(*parser).number(yyDollar[1].item.Val),
@ -1899,25 +1902,25 @@ yydefault:
}
case 214:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:861
//line promql/parser/generated_parser.y:864
{
yyVAL.float = yylex.(*parser).number(yyDollar[1].item.Val)
}
case 215:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:863
//line promql/parser/generated_parser.y:866
{
yyVAL.float = yyDollar[2].float
}
case 216:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:864
//line promql/parser/generated_parser.y:867
{
yyVAL.float = -yyDollar[2].float
}
case 219:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:870
//line promql/parser/generated_parser.y:873
{
var err error
yyVAL.uint, err = strconv.ParseUint(yyDollar[1].item.Val, 10, 64)
@ -1927,19 +1930,19 @@ yydefault:
}
case 220:
yyDollar = yyS[yypt-2 : yypt+1]
//line promql/parser/generated_parser.y:879
//line promql/parser/generated_parser.y:882
{
yyVAL.int = -int64(yyDollar[2].uint)
}
case 221:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:880
//line promql/parser/generated_parser.y:883
{
yyVAL.int = int64(yyDollar[1].uint)
}
case 222:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:884
//line promql/parser/generated_parser.y:887
{
var err error
yyVAL.duration, err = parseDuration(yyDollar[1].item.Val)
@ -1949,7 +1952,7 @@ yydefault:
}
case 223:
yyDollar = yyS[yypt-1 : yypt+1]
//line promql/parser/generated_parser.y:895
//line promql/parser/generated_parser.y:898
{
yyVAL.node = &StringLiteral{
Val: yylex.(*parser).unquoteString(yyDollar[1].item.Val),
@ -1958,13 +1961,13 @@ yydefault:
}
case 224:
yyDollar = yyS[yypt-0 : yypt+1]
//line promql/parser/generated_parser.y:908
//line promql/parser/generated_parser.y:911
{
yyVAL.duration = 0
}
case 226:
yyDollar = yyS[yypt-0 : yypt+1]
//line promql/parser/generated_parser.y:912
//line promql/parser/generated_parser.y:915
{
yyVAL.strings = nil
}

View file

@ -23,6 +23,25 @@ import (
"github.com/prometheus/prometheus/model/labels"
)
// smallDeltaTolerance is the threshold for relative deltas between classic
// histogram buckets that will be ignored by the histogram_quantile function
// because they are most likely artifacts of floating point precision issues.
// Testing on 2 sets of real data with bugs arising from small deltas,
// the safe ranges were from:
// - 1e-05 to 1e-15
// - 1e-06 to 1e-15
// Anything to the left of that would cause non-query-sharded data to have
// small deltas ignored (unnecessary and we should avoid this), and anything
// to the right of that would cause query-sharded data to not have its small
// deltas ignored (so the problem won't be fixed).
// For context, query sharding triggers these float precision errors in Mimir.
// To illustrate, with a relative deviation of 1e-12, we need to have 1e12
// observations in the bucket so that the change of one observation is small
// enough to get ignored. With the usual observation rate even of very busy
// services, this will hardly be reached in timeframes that matter for
// monitoring.
const smallDeltaTolerance = 1e-12
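As a back-of-the-envelope check of the numbers above, a standalone sketch; the relative delta here mirrors how almostEqual applies the tolerance further down, as |a-b| divided by |a|+|b|:

package example

import "math"

// relativeDelta mirrors how smallDeltaTolerance is applied below: a difference
// between successive bucket counts is treated as float-precision noise when
// this value is under 1e-12.
func relativeDelta(a, b float64) float64 {
	return math.Abs(a-b) / (math.Abs(a) + math.Abs(b))
}

// For the bucket counts 15 and 15.00000000001 (see the test file below) the
// relative delta is about 3.3e-13, which is under 1e-12 and gets ignored,
// whereas a genuine decrease such as 15 -> 14 gives about 3.4e-2 and is
// handled as a monotonicity violation instead.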
// Helpers to calculate quantiles.
// excludedLabels are the labels to exclude from signature calculation for
@ -72,16 +91,19 @@ type metricWithBuckets struct {
//
// If q>1, +Inf is returned.
//
// We also return a bool to indicate if monotonicity needed to be forced.
func bucketQuantile(q float64, buckets buckets) (float64, bool) {
// We also return a bool to indicate if monotonicity needed to be forced,
// and another bool to indicate if small differences between buckets (that
// are likely artifacts of floating point precision issues) have been
// ignored.
func bucketQuantile(q float64, buckets buckets) (float64, bool, bool) {
if math.IsNaN(q) {
return math.NaN(), false
return math.NaN(), false, false
}
if q < 0 {
return math.Inf(-1), false
return math.Inf(-1), false, false
}
if q > 1 {
return math.Inf(+1), false
return math.Inf(+1), false, false
}
slices.SortFunc(buckets, func(a, b bucket) int {
// We don't expect the bucket boundary to be a NaN.
@ -94,27 +116,27 @@ func bucketQuantile(q float64, buckets buckets) (float64, bool) {
return 0
})
if !math.IsInf(buckets[len(buckets)-1].upperBound, +1) {
return math.NaN(), false
return math.NaN(), false, false
}
buckets = coalesceBuckets(buckets)
forcedMonotonic := ensureMonotonic(buckets)
forcedMonotonic, fixedPrecision := ensureMonotonicAndIgnoreSmallDeltas(buckets, smallDeltaTolerance)
if len(buckets) < 2 {
return math.NaN(), false
return math.NaN(), false, false
}
observations := buckets[len(buckets)-1].count
if observations == 0 {
return math.NaN(), false
return math.NaN(), false, false
}
rank := q * observations
b := sort.Search(len(buckets)-1, func(i int) bool { return buckets[i].count >= rank })
if b == len(buckets)-1 {
return buckets[len(buckets)-2].upperBound, forcedMonotonic
return buckets[len(buckets)-2].upperBound, forcedMonotonic, fixedPrecision
}
if b == 0 && buckets[0].upperBound <= 0 {
return buckets[0].upperBound, forcedMonotonic
return buckets[0].upperBound, forcedMonotonic, fixedPrecision
}
var (
bucketStart float64
@ -126,7 +148,7 @@ func bucketQuantile(q float64, buckets buckets) (float64, bool) {
count -= buckets[b-1].count
rank -= buckets[b-1].count
}
return bucketStart + (bucketEnd-bucketStart)*(rank/count), forcedMonotonic
return bucketStart + (bucketEnd-bucketStart)*(rank/count), forcedMonotonic, fixedPrecision
}
// histogramQuantile calculates the quantile 'q' based on the given histogram.
@ -348,6 +370,7 @@ func coalesceBuckets(buckets buckets) buckets {
// - Ingestion via the remote write receiver that Prometheus implements.
// - Optimisation of query execution where precision is sacrificed for other
// benefits, not by Prometheus but by systems built on top of it.
// - Circumstances where floating point precision errors accumulate.
//
// Monotonicity is usually guaranteed because if a bucket with upper bound
// u1 has count c1, then any bucket with a higher upper bound u > u1 must
@ -357,22 +380,42 @@ func coalesceBuckets(buckets buckets) buckets {
// bucket with the φ-quantile count, so breaking the monotonicity
// guarantee causes bucketQuantile() to return undefined (nonsense) results.
//
// As a somewhat hacky solution, we calculate the "envelope" of the histogram
// buckets, essentially removing any decreases in the count between successive
// buckets. We return a bool to indicate if this monotonicity was forced or not.
func ensureMonotonic(buckets buckets) bool {
forced := false
max := buckets[0].count
// As a somewhat hacky solution, we first silently ignore any numerically
// insignificant (relative delta below the requested tolerance and likely to
// be from floating point precision errors) differences between successive
// buckets regardless of the direction. Then we calculate the "envelope" of
// the histogram buckets, essentially removing any decreases in the count
// between successive buckets.
//
// We return a bool to indicate if this monotonicity was forced or not, and
// another bool to indicate if small deltas were ignored or not.
func ensureMonotonicAndIgnoreSmallDeltas(buckets buckets, tolerance float64) (bool, bool) {
var forcedMonotonic, fixedPrecision bool
prev := buckets[0].count
for i := 1; i < len(buckets); i++ {
switch {
case buckets[i].count > max:
max = buckets[i].count
case buckets[i].count < max:
buckets[i].count = max
forced = true
curr := buckets[i].count // Assumed always positive.
if curr == prev {
// No correction needed if the counts are identical between buckets.
continue
}
if almostEqual(prev, curr, tolerance) {
// Silently correct numerically insignificant differences from floating
// point precision errors, regardless of direction.
// Do not update the 'prev' value as we are ignoring the difference.
buckets[i].count = prev
fixedPrecision = true
continue
}
if curr < prev {
// Force monotonicity by removing any decreases regardless of magnitude.
// Do not update the 'prev' value as we are ignoring the decrease.
buckets[i].count = prev
forcedMonotonic = true
continue
}
prev = curr
}
return forced
return forcedMonotonic, fixedPrecision
}
// quantile calculates the given quantile of a vector of samples.

promql/quantile_test.go Normal file
View file

@ -0,0 +1,318 @@
// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package promql
import (
"math"
"testing"
"github.com/stretchr/testify/require"
)
func TestBucketQuantile_ForcedMonotonicity(t *testing.T) {
eps := 1e-12
for name, tc := range map[string]struct {
getInput func() buckets // The buckets can be modified in-place so return a new one each time.
expectedForced bool
expectedFixed bool
expectedValues map[float64]float64
}{
"simple - monotonic": {
getInput: func() buckets {
return buckets{
{
upperBound: 10,
count: 10,
}, {
upperBound: 15,
count: 15,
}, {
upperBound: 20,
count: 15,
}, {
upperBound: 30,
count: 15,
}, {
upperBound: math.Inf(1),
count: 15,
},
}
},
expectedForced: false,
expectedFixed: false,
expectedValues: map[float64]float64{
1: 15.,
0.99: 14.85,
0.9: 13.5,
0.5: 7.5,
},
},
"simple - non-monotonic middle": {
getInput: func() buckets {
return buckets{
{
upperBound: 10,
count: 10,
}, {
upperBound: 15,
count: 15,
}, {
upperBound: 20,
count: 15.00000000001, // Simulate the case where there's a small imprecision in float64.
}, {
upperBound: 30,
count: 15,
}, {
upperBound: math.Inf(1),
count: 15,
},
}
},
expectedForced: false,
expectedFixed: true,
expectedValues: map[float64]float64{
1: 15.,
0.99: 14.85,
0.9: 13.5,
0.5: 7.5,
},
},
"real example - monotonic": {
getInput: func() buckets {
return buckets{
{
upperBound: 1,
count: 6454661.3014166197,
}, {
upperBound: 5,
count: 8339611.2001912938,
}, {
upperBound: 10,
count: 14118319.2444762159,
}, {
upperBound: 25,
count: 14130031.5272856522,
}, {
upperBound: 50,
count: 46001270.3030008152,
}, {
upperBound: 64,
count: 46008473.8585563600,
}, {
upperBound: 80,
count: 46008473.8585563600,
}, {
upperBound: 100,
count: 46008473.8585563600,
}, {
upperBound: 250,
count: 46008473.8585563600,
}, {
upperBound: 1000,
count: 46008473.8585563600,
}, {
upperBound: math.Inf(1),
count: 46008473.8585563600,
},
}
},
expectedForced: false,
expectedFixed: false,
expectedValues: map[float64]float64{
1: 64.,
0.99: 49.64475715376406,
0.9: 46.39671690938454,
0.5: 31.96098248992002,
},
},
"real example - non-monotonic": {
getInput: func() buckets {
return buckets{
{
upperBound: 1,
count: 6454661.3014166225,
}, {
upperBound: 5,
count: 8339611.2001912957,
}, {
upperBound: 10,
count: 14118319.2444762159,
}, {
upperBound: 25,
count: 14130031.5272856504,
}, {
upperBound: 50,
count: 46001270.3030008227,
}, {
upperBound: 64,
count: 46008473.8585563824,
}, {
upperBound: 80,
count: 46008473.8585563898,
}, {
upperBound: 100,
count: 46008473.8585563824,
}, {
upperBound: 250,
count: 46008473.8585563824,
}, {
upperBound: 1000,
count: 46008473.8585563898,
}, {
upperBound: math.Inf(1),
count: 46008473.8585563824,
},
}
},
expectedForced: false,
expectedFixed: true,
expectedValues: map[float64]float64{
1: 64.,
0.99: 49.64475715376406,
0.9: 46.39671690938454,
0.5: 31.96098248992002,
},
},
"real example 2 - monotonic": {
getInput: func() buckets {
return buckets{
{
upperBound: 0.005,
count: 9.6,
}, {
upperBound: 0.01,
count: 9.688888889,
}, {
upperBound: 0.025,
count: 9.755555556,
}, {
upperBound: 0.05,
count: 9.844444444,
}, {
upperBound: 0.1,
count: 9.888888889,
}, {
upperBound: 0.25,
count: 9.888888889,
}, {
upperBound: 0.5,
count: 9.888888889,
}, {
upperBound: 1,
count: 9.888888889,
}, {
upperBound: 2.5,
count: 9.888888889,
}, {
upperBound: 5,
count: 9.888888889,
}, {
upperBound: 10,
count: 9.888888889,
}, {
upperBound: 25,
count: 9.888888889,
}, {
upperBound: 50,
count: 9.888888889,
}, {
upperBound: 100,
count: 9.888888889,
}, {
upperBound: math.Inf(1),
count: 9.888888889,
},
}
},
expectedForced: false,
expectedFixed: false,
expectedValues: map[float64]float64{
1: 0.1,
0.99: 0.03468750000281261,
0.9: 0.00463541666671875,
0.5: 0.0025752314815104174,
},
},
"real example 2 - non-monotonic": {
getInput: func() buckets {
return buckets{
{
upperBound: 0.005,
count: 9.6,
}, {
upperBound: 0.01,
count: 9.688888889,
}, {
upperBound: 0.025,
count: 9.755555556,
}, {
upperBound: 0.05,
count: 9.844444444,
}, {
upperBound: 0.1,
count: 9.888888889,
}, {
upperBound: 0.25,
count: 9.888888889,
}, {
upperBound: 0.5,
count: 9.888888889,
}, {
upperBound: 1,
count: 9.888888889,
}, {
upperBound: 2.5,
count: 9.888888889,
}, {
upperBound: 5,
count: 9.888888889,
}, {
upperBound: 10,
count: 9.888888889001, // Simulate the case where there's a small imprecision in float64.
}, {
upperBound: 25,
count: 9.888888889,
}, {
upperBound: 50,
count: 9.888888888999, // Simulate the case where there's a small imprecision in float64.
}, {
upperBound: 100,
count: 9.888888889,
}, {
upperBound: math.Inf(1),
count: 9.888888889,
},
}
},
expectedForced: false,
expectedFixed: true,
expectedValues: map[float64]float64{
1: 0.1,
0.99: 0.03468750000281261,
0.9: 0.00463541666671875,
0.5: 0.0025752314815104174,
},
},
} {
t.Run(name, func(t *testing.T) {
for q, v := range tc.expectedValues {
res, forced, fixed := bucketQuantile(q, tc.getInput())
require.Equal(t, tc.expectedForced, forced)
require.Equal(t, tc.expectedFixed, fixed)
require.InEpsilon(t, v, res, eps)
}
})
}
}

View file

@ -49,7 +49,7 @@ var (
)
const (
epsilon = 0.000001 // Relative error allowed for sample values.
defaultEpsilon = 0.000001 // Relative error allowed for sample values.
)
var testStartTime = time.Unix(0, 0).UTC()
@ -440,7 +440,7 @@ func (ev *evalCmd) compareResult(result parser.Value) error {
if (expH == nil) != (v.H == nil) || (expH != nil && !expH.Equals(v.H)) {
return fmt.Errorf("expected %v for %s but got %s", HistogramTestExpression(expH), v.Metric, HistogramTestExpression(v.H))
}
if !almostEqual(exp0.Value, v.F) {
if !almostEqual(exp0.Value, v.F, defaultEpsilon) {
return fmt.Errorf("expected %v for %s but got %v", exp0.Value, v.Metric, v.F)
}
@ -464,7 +464,7 @@ func (ev *evalCmd) compareResult(result parser.Value) error {
if exp0.Histogram != nil {
return fmt.Errorf("expected Histogram %v but got scalar %s", exp0.Histogram.TestExpression(), val.String())
}
if !almostEqual(exp0.Value, val.V) {
if !almostEqual(exp0.Value, val.V, defaultEpsilon) {
return fmt.Errorf("expected Scalar %v but got %v", val.V, exp0.Value)
}
@ -663,9 +663,9 @@ func (t *test) clear() {
t.context, t.cancelCtx = context.WithCancel(context.Background())
}
// samplesAlmostEqual returns true if the two sample lines only differ by a
// small relative error in their sample value.
func almostEqual(a, b float64) bool {
// almostEqual returns true if a and b differ by less than their sum
// multiplied by epsilon.
func almostEqual(a, b, epsilon float64) bool {
// NaN has no equality but for testing we still want to know whether both values
// are NaN.
if math.IsNaN(a) && math.IsNaN(b) {
@ -677,12 +677,13 @@ func almostEqual(a, b float64) bool {
return true
}
absSum := math.Abs(a) + math.Abs(b)
diff := math.Abs(a - b)
if a == 0 || b == 0 || diff < minNormal {
if a == 0 || b == 0 || absSum < minNormal {
return diff < epsilon*minNormal
}
return diff/(math.Abs(a)+math.Abs(b)) < epsilon
return diff/math.Min(absSum, math.MaxFloat64) < epsilon
}
func parseNumber(s string) (float64, error) {
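As an illustration of the relative-error comparison introduced above, here is a minimal, self-contained sketch; the helper name, the minNormal constant and the sample values are illustrative, not part of the patch.

package main

import (
    "fmt"
    "math"
)

const minNormal = 0x1p-1022 // smallest positive normal float64

// relativelyEqual mirrors the reworked almostEqual above: the allowed
// difference scales with |a|+|b|, with an absolute fallback near zero
// where a relative error is meaningless.
func relativelyEqual(a, b, epsilon float64) bool {
    if a == b {
        return true
    }
    absSum := math.Abs(a) + math.Abs(b)
    diff := math.Abs(a - b)
    if a == 0 || b == 0 || absSum < minNormal {
        return diff < epsilon*minNormal
    }
    return diff/math.Min(absSum, math.MaxFloat64) < epsilon
}

func main() {
    fmt.Println(relativelyEqual(100000.0, 100000.05, 1e-6)) // true: relative difference is about 2.5e-7
    fmt.Println(relativelyEqual(1.0, 1.01, 1e-6))           // false: relative difference is about 5e-3
}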

View file

@ -14,6 +14,7 @@
package scrape
import (
"errors"
"fmt"
"hash/fnv"
"reflect"
@ -22,7 +23,6 @@ import (
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
config_util "github.com/prometheus/common/config"
"github.com/prometheus/common/model"
@ -32,6 +32,7 @@ import (
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/util/osutil"
"github.com/prometheus/prometheus/util/pool"
)
// NewManager is the Manager constructor.
@ -57,6 +58,7 @@ func NewManager(o *Options, logger log.Logger, app storage.Appendable, registere
graceShut: make(chan struct{}),
triggerReload: make(chan struct{}, 1),
metrics: sm,
buffers: pool.New(1e3, 100e6, 3, func(sz int) interface{} { return make([]byte, 0, sz) }),
}
m.metrics.setTargetMetadataCacheGatherer(m)
@ -94,6 +96,7 @@ type Manager struct {
scrapeConfigs map[string]*config.ScrapeConfig
scrapePools map[string]*scrapePool
targetSets map[string][]*targetgroup.Group
buffers *pool.Pool
triggerReload chan struct{}
@ -156,7 +159,7 @@ func (m *Manager) reload() {
continue
}
m.metrics.targetScrapePools.Inc()
sp, err := newScrapePool(scrapeConfig, m.append, m.offsetSeed, log.With(m.logger, "scrape_pool", setName), m.opts, m.metrics)
sp, err := newScrapePool(scrapeConfig, m.append, m.offsetSeed, log.With(m.logger, "scrape_pool", setName), m.buffers, m.opts, m.metrics)
if err != nil {
m.metrics.targetScrapePoolsFailed.Inc()
level.Error(m.logger).Log("msg", "error creating new scrape pool", "err", err, "scrape_pool", setName)
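The byte-buffer pool that each scrape pool used to create for itself is now owned by the manager and shared across pools. A minimal sketch of the borrow-and-return pattern against such a pool follows; the sizes and payload are illustrative, and the pool API is assumed to match util/pool as used above.

package main

import (
    "fmt"

    "github.com/prometheus/prometheus/util/pool"
)

func main() {
    // Same constructor arguments as above: buckets from 1e3 to 100e6 bytes,
    // each bucket 3 times larger than the previous one.
    buffers := pool.New(1e3, 100e6, 3, func(sz int) interface{} { return make([]byte, 0, sz) })

    b := buffers.Get(16 * 1024).([]byte) // borrow a buffer with at least 16KiB capacity
    b = append(b, []byte("up 1\n")...)   // use it for one scrape body
    fmt.Println(len(b), cap(b) >= 16*1024)
    buffers.Put(b) // hand it back so other scrape loops can reuse the memory
}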

View file

@ -108,6 +108,7 @@ type scrapeLoopOptions struct {
scrapeClassicHistograms bool
mrc []*relabel.Config
cache *scrapeCache
enableCompression bool
}
const maxAheadTime = 10 * time.Minute
@ -115,7 +116,7 @@ const maxAheadTime = 10 * time.Minute
// returning an empty label set is interpreted as "drop".
type labelsMutator func(labels.Labels) labels.Labels
func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, offsetSeed uint64, logger log.Logger, options *Options, metrics *scrapeMetrics) (*scrapePool, error) {
func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, offsetSeed uint64, logger log.Logger, buffers *pool.Pool, options *Options, metrics *scrapeMetrics) (*scrapePool, error) {
if logger == nil {
logger = log.NewNopLogger()
}
@ -125,8 +126,6 @@ func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, offsetSeed
return nil, fmt.Errorf("error creating HTTP client: %w", err)
}
buffers := pool.New(1e3, 100e6, 3, func(sz int) interface{} { return make([]byte, 0, sz) })
ctx, cancel := context.WithCancel(context.Background())
sp := &scrapePool{
cancel: cancel,
@ -162,6 +161,7 @@ func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, offsetSeed
offsetSeed,
opts.honorTimestamps,
opts.trackTimestampsStaleness,
opts.enableCompression,
opts.sampleLimit,
opts.bucketLimit,
opts.labelLimits,
@ -274,6 +274,7 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error {
}
honorLabels = sp.config.HonorLabels
honorTimestamps = sp.config.HonorTimestamps
enableCompression = sp.config.EnableCompression
trackTimestampsStaleness = sp.config.TrackTimestampsStaleness
mrc = sp.config.MetricRelabelConfigs
)
@ -294,11 +295,12 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error {
interval, timeout, err := t.intervalAndTimeout(interval, timeout)
var (
s = &targetScraper{
Target: t,
client: sp.client,
timeout: timeout,
bodySizeLimit: bodySizeLimit,
acceptHeader: acceptHeader(cfg.ScrapeProtocols),
Target: t,
client: sp.client,
timeout: timeout,
bodySizeLimit: bodySizeLimit,
acceptHeader: acceptHeader(cfg.ScrapeProtocols),
acceptEncodingHeader: acceptEncodingHeader(enableCompression),
}
newLoop = sp.newLoop(scrapeLoopOptions{
target: t,
@ -308,6 +310,7 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error {
labelLimits: labelLimits,
honorLabels: honorLabels,
honorTimestamps: honorTimestamps,
enableCompression: enableCompression,
trackTimestampsStaleness: trackTimestampsStaleness,
mrc: mrc,
cache: cache,
@ -402,6 +405,7 @@ func (sp *scrapePool) sync(targets []*Target) {
}
honorLabels = sp.config.HonorLabels
honorTimestamps = sp.config.HonorTimestamps
enableCompression = sp.config.EnableCompression
trackTimestampsStaleness = sp.config.TrackTimestampsStaleness
mrc = sp.config.MetricRelabelConfigs
scrapeClassicHistograms = sp.config.ScrapeClassicHistograms
@ -418,12 +422,13 @@ func (sp *scrapePool) sync(targets []*Target) {
var err error
interval, timeout, err = t.intervalAndTimeout(interval, timeout)
s := &targetScraper{
Target: t,
client: sp.client,
timeout: timeout,
bodySizeLimit: bodySizeLimit,
acceptHeader: acceptHeader(sp.config.ScrapeProtocols),
metrics: sp.metrics,
Target: t,
client: sp.client,
timeout: timeout,
bodySizeLimit: bodySizeLimit,
acceptHeader: acceptHeader(sp.config.ScrapeProtocols),
acceptEncodingHeader: acceptEncodingHeader(enableCompression),
metrics: sp.metrics,
}
l := sp.newLoop(scrapeLoopOptions{
target: t,
@ -433,6 +438,7 @@ func (sp *scrapePool) sync(targets []*Target) {
labelLimits: labelLimits,
honorLabels: honorLabels,
honorTimestamps: honorTimestamps,
enableCompression: enableCompression,
trackTimestampsStaleness: trackTimestampsStaleness,
mrc: mrc,
interval: interval,
@ -646,8 +652,9 @@ type targetScraper struct {
gzipr *gzip.Reader
buf *bufio.Reader
bodySizeLimit int64
acceptHeader string
bodySizeLimit int64
acceptHeader string
acceptEncodingHeader string
metrics *scrapeMetrics
}
@ -669,6 +676,13 @@ func acceptHeader(sps []config.ScrapeProtocol) string {
return strings.Join(vals, ",")
}
func acceptEncodingHeader(enableCompression bool) string {
if enableCompression {
return "gzip"
}
return "identity"
}
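A small standalone sketch of the header values this helper produces; the target URL and the loop are illustrative only.

package main

import (
    "fmt"
    "net/http"
)

func acceptEncodingHeader(enableCompression bool) string {
    if enableCompression {
        return "gzip"
    }
    return "identity"
}

func main() {
    for _, compress := range []bool{true, false} {
        req, err := http.NewRequest(http.MethodGet, "http://localhost:9090/metrics", nil)
        if err != nil {
            panic(err)
        }
        req.Header.Add("Accept-Encoding", acceptEncodingHeader(compress))
        fmt.Printf("compression=%v -> Accept-Encoding: %s\n", compress, req.Header.Get("Accept-Encoding"))
    }
}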
var UserAgent = fmt.Sprintf("Prometheus/%s", version.Version)
func (s *targetScraper) scrape(ctx context.Context) (*http.Response, error) {
@ -678,7 +692,7 @@ func (s *targetScraper) scrape(ctx context.Context) (*http.Response, error) {
return nil, err
}
req.Header.Add("Accept", s.acceptHeader)
req.Header.Add("Accept-Encoding", "gzip")
req.Header.Add("Accept-Encoding", s.acceptEncodingHeader)
req.Header.Set("User-Agent", UserAgent)
req.Header.Set("X-Prometheus-Scrape-Timeout-Seconds", strconv.FormatFloat(s.timeout.Seconds(), 'f', -1, 64))
@ -764,6 +778,7 @@ type scrapeLoop struct {
offsetSeed uint64
honorTimestamps bool
trackTimestampsStaleness bool
enableCompression bool
forcedErr error
forcedErrMtx sync.Mutex
sampleLimit int
@ -1054,6 +1069,7 @@ func newScrapeLoop(ctx context.Context,
offsetSeed uint64,
honorTimestamps bool,
trackTimestampsStaleness bool,
enableCompression bool,
sampleLimit int,
bucketLimit int,
labelLimits *labelLimits,
@ -1101,6 +1117,7 @@ func newScrapeLoop(ctx context.Context,
appenderCtx: appenderCtx,
honorTimestamps: honorTimestamps,
trackTimestampsStaleness: trackTimestampsStaleness,
enableCompression: enableCompression,
sampleLimit: sampleLimit,
bucketLimit: bucketLimit,
labelLimits: labelLimits,
@ -1404,6 +1421,8 @@ func (sl *scrapeLoop) append(app storage.Appender, b []byte, contentType string,
metadataChanged bool
)
exemplars := make([]exemplar.Exemplar, 1)
// updateMetadata updates the current iteration's metadata object and the
// metadataChanged value if we have metadata in the scrape cache AND the
// labelset is for a new series or the metadata for this series has just
@ -1569,18 +1588,46 @@ loop:
// Increment added even if there's an error so we correctly report the
// number of samples remaining after relabeling.
added++
exemplars = exemplars[:0] // Reset and reuse the exemplar slice.
for hasExemplar := p.Exemplar(&e); hasExemplar; hasExemplar = p.Exemplar(&e) {
if !e.HasTs {
if isHistogram {
// We drop exemplars for native histograms if they don't have a timestamp.
// Missing timestamps are deliberately not supported as we want to start
// enforcing timestamps for exemplars as otherwise proper deduplication
// is inefficient and purely based on heuristics: we cannot distinguish
// between repeated exemplars and new instances with the same values.
// This is done silently without logs as it is not an error but out of spec.
// This does not affect classic histograms so that behaviour is unchanged.
e = exemplar.Exemplar{} // Reset for next time round loop.
continue
}
e.Ts = t
}
exemplars = append(exemplars, e)
e = exemplar.Exemplar{} // Reset for next time round loop.
}
// Sort so that checking for duplicates / out of order is more efficient during validation.
slices.SortFunc(exemplars, exemplar.Compare)
outOfOrderExemplars := 0
for _, e := range exemplars {
_, exemplarErr := app.AppendExemplar(ref, lset, e)
exemplarErr = sl.checkAddExemplarError(exemplarErr, e, &appErrs)
if exemplarErr != nil {
switch {
case exemplarErr == nil:
// Do nothing.
case errors.Is(exemplarErr, storage.ErrOutOfOrderExemplar):
outOfOrderExemplars++
default:
// Since exemplar storage is still experimental, we don't fail the scrape on ingestion errors.
level.Debug(sl.l).Log("msg", "Error while adding exemplar in AddExemplar", "exemplar", fmt.Sprintf("%+v", e), "err", exemplarErr)
}
e = exemplar.Exemplar{} // reset for next time round loop
}
if outOfOrderExemplars > 0 && outOfOrderExemplars == len(exemplars) {
// Only report out of order exemplars if all are out of order, otherwise this was a partial update
// to some existing set of exemplars.
appErrs.numExemplarOutOfOrder += outOfOrderExemplars
level.Debug(sl.l).Log("msg", "Out of order exemplars", "count", outOfOrderExemplars, "latest", fmt.Sprintf("%+v", exemplars[len(exemplars)-1]))
sl.metrics.targetScrapeExemplarOutOfOrder.Add(float64(outOfOrderExemplars))
}
if sl.appendMetadataToWAL && metadataChanged {
@ -1673,20 +1720,6 @@ func (sl *scrapeLoop) checkAddError(ce *cacheEntry, met []byte, tp *int64, err e
}
}
func (sl *scrapeLoop) checkAddExemplarError(err error, e exemplar.Exemplar, appErrs *appendErrors) error {
switch {
case errors.Is(err, storage.ErrNotFound):
return storage.ErrNotFound
case errors.Is(err, storage.ErrOutOfOrderExemplar):
appErrs.numExemplarOutOfOrder++
level.Debug(sl.l).Log("msg", "Out of order exemplar", "exemplar", fmt.Sprintf("%+v", e))
sl.metrics.targetScrapeExemplarOutOfOrder.Inc()
return nil
default:
return err
}
}
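The per-exemplar error hook above could be removed because the loop now batches the exemplars of a sample, sorts them and classifies the append results afterwards. Here is a small self-contained sketch of that sort-then-classify ordering; the exemplar type and compare function are simplified stand-ins, not the ones from the exemplar package.

package main

import (
    "fmt"
    "slices"
)

type sketchExemplar struct {
    ts    int64
    value float64
}

// compareExemplars orders by timestamp, then value, so duplicates and
// out-of-order entries end up adjacent and are cheap to detect or count.
func compareExemplars(a, b sketchExemplar) int {
    switch {
    case a.ts != b.ts:
        if a.ts < b.ts {
            return -1
        }
        return 1
    case a.value < b.value:
        return -1
    case a.value > b.value:
        return 1
    default:
        return 0
    }
}

func main() {
    exemplars := []sketchExemplar{{ts: 30, value: 3}, {ts: 10, value: 1}, {ts: 20, value: 2}}
    slices.SortFunc(exemplars, compareExemplars)
    fmt.Println(exemplars) // [{10 1} {20 2} {30 3}]
}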
// The constants are suffixed with the invalid \xff unicode rune to avoid collisions
// with scraped metrics in the cache.
var (

File diff suppressed because it is too large

View file

@ -14,6 +14,7 @@
package scrape
import (
"errors"
"fmt"
"hash/fnv"
"net"
@ -22,7 +23,6 @@ import (
"sync"
"time"
"github.com/pkg/errors"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/config"
@ -289,12 +289,12 @@ func (t *Target) intervalAndTimeout(defaultInterval, defaultDuration time.Durati
intervalLabel := t.labels.Get(model.ScrapeIntervalLabel)
interval, err := model.ParseDuration(intervalLabel)
if err != nil {
return defaultInterval, defaultDuration, errors.Errorf("Error parsing interval label %q: %v", intervalLabel, err)
return defaultInterval, defaultDuration, fmt.Errorf("Error parsing interval label %q: %w", intervalLabel, err)
}
timeoutLabel := t.labels.Get(model.ScrapeTimeoutLabel)
timeout, err := model.ParseDuration(timeoutLabel)
if err != nil {
return defaultInterval, defaultDuration, errors.Errorf("Error parsing timeout label %q: %v", timeoutLabel, err)
return defaultInterval, defaultDuration, fmt.Errorf("Error parsing timeout label %q: %w", timeoutLabel, err)
}
return time.Duration(interval), time.Duration(timeout), nil
@ -444,7 +444,7 @@ func PopulateLabels(lb *labels.Builder, cfg *config.ScrapeConfig, noDefaultPort
case "https":
addr += ":443"
default:
return labels.EmptyLabels(), labels.EmptyLabels(), errors.Errorf("invalid scheme: %q", cfg.Scheme)
return labels.EmptyLabels(), labels.EmptyLabels(), fmt.Errorf("invalid scheme: %q", cfg.Scheme)
}
lb.Set(model.AddressLabel, addr)
}
@ -471,7 +471,7 @@ func PopulateLabels(lb *labels.Builder, cfg *config.ScrapeConfig, noDefaultPort
interval := lb.Get(model.ScrapeIntervalLabel)
intervalDuration, err := model.ParseDuration(interval)
if err != nil {
return labels.EmptyLabels(), labels.EmptyLabels(), errors.Errorf("error parsing scrape interval: %v", err)
return labels.EmptyLabels(), labels.EmptyLabels(), fmt.Errorf("error parsing scrape interval: %w", err)
}
if time.Duration(intervalDuration) == 0 {
return labels.EmptyLabels(), labels.EmptyLabels(), errors.New("scrape interval cannot be 0")
@ -480,14 +480,14 @@ func PopulateLabels(lb *labels.Builder, cfg *config.ScrapeConfig, noDefaultPort
timeout := lb.Get(model.ScrapeTimeoutLabel)
timeoutDuration, err := model.ParseDuration(timeout)
if err != nil {
return labels.EmptyLabels(), labels.EmptyLabels(), errors.Errorf("error parsing scrape timeout: %v", err)
return labels.EmptyLabels(), labels.EmptyLabels(), fmt.Errorf("error parsing scrape timeout: %w", err)
}
if time.Duration(timeoutDuration) == 0 {
return labels.EmptyLabels(), labels.EmptyLabels(), errors.New("scrape timeout cannot be 0")
}
if timeoutDuration > intervalDuration {
return labels.EmptyLabels(), labels.EmptyLabels(), errors.Errorf("scrape timeout cannot be greater than scrape interval (%q > %q)", timeout, interval)
return labels.EmptyLabels(), labels.EmptyLabels(), fmt.Errorf("scrape timeout cannot be greater than scrape interval (%q > %q)", timeout, interval)
}
// Meta labels are deleted after relabelling. Other internal labels propagate to
@ -507,7 +507,7 @@ func PopulateLabels(lb *labels.Builder, cfg *config.ScrapeConfig, noDefaultPort
err = res.Validate(func(l labels.Label) error {
// Check label values are valid, drop the target if not.
if !model.LabelValue(l.Value).IsValid() {
return errors.Errorf("invalid label value for %q: %q", l.Name, l.Value)
return fmt.Errorf("invalid label value for %q: %q", l.Name, l.Value)
}
return nil
})
@ -536,7 +536,7 @@ func TargetsFromGroup(tg *targetgroup.Group, cfg *config.ScrapeConfig, noDefault
lset, origLabels, err := PopulateLabels(lb, cfg, noDefaultPort)
if err != nil {
failures = append(failures, errors.Wrapf(err, "instance %d in group %s", i, tg))
failures = append(failures, fmt.Errorf("instance %d in group %s: %w", i, tg, err))
}
if !lset.IsEmpty() || !origLabels.IsEmpty() {
targets = append(targets, NewTarget(lset, origLabels, cfg.Params))
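The switch from errors.Errorf to fmt.Errorf with %w keeps wrapped errors inspectable with the standard library. A tiny sketch of why the %w verb matters here; the error text and values are illustrative.

package main

import (
    "errors"
    "fmt"
)

var errParse = errors.New("parse failure")

func main() {
    // %w wraps the underlying error so callers can still match it.
    err := fmt.Errorf("instance %d in group %s: %w", 3, "node-exporter", errParse)
    fmt.Println(errors.Is(err, errParse)) // true
    fmt.Println(err)                      // instance 3 in group node-exporter: parse failure
}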

View file

@ -18,11 +18,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608 # v4.1.0
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- name: install Go
uses: actions/setup-go@6edd4406fa81c3da01a34fa6f6343087c207a568 # v3.5.0
with:
go-version: 1.20.x
go-version: 1.21.x
- name: Install snmp_exporter/generator dependencies
run: sudo apt-get update && sudo apt-get -y install libsnmp-dev
if: github.repository == 'prometheus/snmp_exporter'

View file

@ -15,9 +15,9 @@ package storage_test
import (
"context"
"errors"
"testing"
"github.com/pkg/errors"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"

View file

@ -37,16 +37,12 @@ var (
// ErrTooOldSample is when out of order support is enabled but the sample is outside the time window allowed.
ErrTooOldSample = errors.New("too old sample")
// ErrDuplicateSampleForTimestamp is when the sample has same timestamp but different value.
ErrDuplicateSampleForTimestamp = errors.New("duplicate sample for timestamp")
ErrOutOfOrderExemplar = errors.New("out of order exemplar")
ErrDuplicateExemplar = errors.New("duplicate exemplar")
ErrExemplarLabelLength = fmt.Errorf("label length for exemplar exceeds maximum of %d UTF-8 characters", exemplar.ExemplarMaxLabelSetLength)
ErrExemplarsDisabled = fmt.Errorf("exemplar storage is disabled or max exemplars is less than or equal to 0")
ErrNativeHistogramsDisabled = fmt.Errorf("native histograms are disabled")
ErrHistogramCountNotBigEnough = errors.New("histogram's observation count should be at least the number of observations found in the buckets")
ErrHistogramNegativeBucketCount = errors.New("histogram has a bucket whose observation count is negative")
ErrHistogramSpanNegativeOffset = errors.New("histogram has a span whose offset is negative")
ErrHistogramSpansBucketsMismatch = errors.New("histogram spans specify different number of buckets than provided")
ErrDuplicateSampleForTimestamp = errors.New("duplicate sample for timestamp")
ErrOutOfOrderExemplar = errors.New("out of order exemplar")
ErrDuplicateExemplar = errors.New("duplicate exemplar")
ErrExemplarLabelLength = fmt.Errorf("label length for exemplar exceeds maximum of %d UTF-8 characters", exemplar.ExemplarMaxLabelSetLength)
ErrExemplarsDisabled = fmt.Errorf("exemplar storage is disabled or max exemplars is less than or equal to 0")
ErrNativeHistogramsDisabled = fmt.Errorf("native histograms are disabled")
)
// SeriesRef is a generic series reference. In prometheus it is either a

View file

@ -473,10 +473,10 @@ func ChainSampleIteratorFromSeries(it chunkenc.Iterator, series []Series) chunke
return csi
}
func ChainSampleIteratorFromMetas(it chunkenc.Iterator, chunks []chunks.Meta) chunkenc.Iterator {
csi := getChainSampleIterator(it, len(chunks))
for i, c := range chunks {
csi.iterators[i] = c.Chunk.Iterator(csi.iterators[i])
func ChainSampleIteratorFromIterables(it chunkenc.Iterator, iterables []chunkenc.Iterable) chunkenc.Iterator {
csi := getChainSampleIterator(it, len(iterables))
for i, c := range iterables {
csi.iterators[i] = c.Iterator(csi.iterators[i])
}
return csi
}
@ -895,6 +895,9 @@ func (c *concatenatingChunkIterator) Next() bool {
c.curr = c.iterators[c.idx].At()
return true
}
if c.iterators[c.idx].Err() != nil {
return false
}
c.idx++
return c.Next()
}

View file

@ -868,6 +868,65 @@ func TestConcatenatingChunkSeriesMerger(t *testing.T) {
}
}
func TestConcatenatingChunkIterator(t *testing.T) {
chunk1, err := chunks.ChunkFromSamples([]chunks.Sample{fSample{t: 1, f: 10}})
require.NoError(t, err)
chunk2, err := chunks.ChunkFromSamples([]chunks.Sample{fSample{t: 2, f: 20}})
require.NoError(t, err)
chunk3, err := chunks.ChunkFromSamples([]chunks.Sample{fSample{t: 3, f: 30}})
require.NoError(t, err)
testError := errors.New("something went wrong")
testCases := map[string]struct {
iterators []chunks.Iterator
expectedChunks []chunks.Meta
expectedError error
}{
"many successful iterators": {
iterators: []chunks.Iterator{
NewListChunkSeriesIterator(chunk1, chunk2),
NewListChunkSeriesIterator(chunk3),
},
expectedChunks: []chunks.Meta{chunk1, chunk2, chunk3},
},
"single failing iterator": {
iterators: []chunks.Iterator{
errChunksIterator{err: testError},
},
expectedError: testError,
},
"some failing and some successful iterators": {
iterators: []chunks.Iterator{
NewListChunkSeriesIterator(chunk1, chunk2),
errChunksIterator{err: testError},
NewListChunkSeriesIterator(chunk3),
},
expectedChunks: []chunks.Meta{chunk1, chunk2}, // Should stop before advancing to last iterator.
expectedError: testError,
},
}
for name, testCase := range testCases {
t.Run(name, func(t *testing.T) {
it := concatenatingChunkIterator{iterators: testCase.iterators}
var chks []chunks.Meta
for it.Next() {
chks = append(chks, it.At())
}
require.Equal(t, testCase.expectedChunks, chks)
if testCase.expectedError == nil {
require.NoError(t, it.Err())
} else {
require.EqualError(t, it.Err(), testCase.expectedError.Error())
}
})
}
}
type mockQuerier struct {
LabelQuerier

View file

@ -168,3 +168,43 @@ func TestRetryAfterDuration(t *testing.T) {
require.Equal(t, c.expected, retryAfterDuration(c.tInput), c.name)
}
}
func TestClientHeaders(t *testing.T) {
headersToSend := map[string]string{"Foo": "Bar", "Baz": "qux"}
var called bool
server := httptest.NewServer(
http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
called = true
receivedHeaders := r.Header
for name, value := range headersToSend {
require.Equal(
t,
[]string{value},
receivedHeaders.Values(name),
"expected %v to be part of the received headers %v",
headersToSend,
receivedHeaders,
)
}
}),
)
defer server.Close()
serverURL, err := url.Parse(server.URL)
require.NoError(t, err)
conf := &ClientConfig{
URL: &config_util.URL{URL: serverURL},
Timeout: model.Duration(time.Second),
Headers: headersToSend,
}
c, err := NewWriteClient("c", conf)
require.NoError(t, err)
err = c.Store(context.Background(), []byte{}, 0)
require.NoError(t, err)
require.True(t, called, "The remote server wasn't called")
}

View file

@ -1,21 +1,31 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0
package normalize
package prometheus // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometheus"
import (
"strings"
"unicode"
"go.opentelemetry.io/collector/featuregate"
)
// Normalizes the specified label to follow Prometheus label names standard.
var dropSanitizationGate = featuregate.GlobalRegistry().MustRegister(
"pkg.translator.prometheus.PermissiveLabelSanitization",
featuregate.StageAlpha,
featuregate.WithRegisterDescription("Controls whether to change labels starting with '_' to 'key_'."),
featuregate.WithRegisterReferenceURL("https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/8950"),
)
// Normalizes the specified label to follow Prometheus label names standard
//
// See rules at https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels
//
// Labels that start with non-letter rune will be prefixed with "key_".
// Labels that start with non-letter rune will be prefixed with "key_"
//
// Exception is made for double-underscores which are allowed.
// Exception is made for double-underscores which are allowed
func NormalizeLabel(label string) string {
// Trivial case
if len(label) == 0 {
return label
@ -27,12 +37,14 @@ func NormalizeLabel(label string) string {
// If label starts with a number, prepend with "key_"
if unicode.IsDigit(rune(label[0])) {
label = "key_" + label
} else if strings.HasPrefix(label, "_") && !strings.HasPrefix(label, "__") && !dropSanitizationGate.IsEnabled() {
label = "key" + label
}
return label
}
// Return '_' for anything non-alphanumeric.
// Return '_' for anything non-alphanumeric
func sanitizeRune(r rune) rune {
if unicode.IsLetter(r) || unicode.IsDigit(r) {
return r
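To make the label rules above concrete, here is a hedged, standalone sketch with a trimmed re-implementation; normalizeLabel and its permissive flag are illustrative stand-ins for NormalizeLabel and the feature gate, not the vendored code itself.

package main

import (
    "fmt"
    "strings"
    "unicode"
)

// normalizeLabel is a simplified stand-in for NormalizeLabel above; the
// permissive flag plays the role of the PermissiveLabelSanitization gate.
func normalizeLabel(label string, permissive bool) string {
    if len(label) == 0 {
        return label
    }
    label = strings.Map(func(r rune) rune {
        if unicode.IsLetter(r) || unicode.IsDigit(r) {
            return r
        }
        return '_'
    }, label)
    switch {
    case unicode.IsDigit(rune(label[0])):
        label = "key_" + label
    case strings.HasPrefix(label, "_") && !strings.HasPrefix(label, "__") && !permissive:
        label = "key" + label
    }
    return label
}

func main() {
    fmt.Println(normalizeLabel("0test", false))  // key_0test
    fmt.Println(normalizeLabel("_test", false))  // key_test
    fmt.Println(normalizeLabel("_test", true))   // _test (gate enabled: leading underscore kept)
    fmt.Println(normalizeLabel("__test", false)) // __test (double underscore is always kept)
    fmt.Println(normalizeLabel("test_/", false)) // test__
}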

View file

@ -1,19 +0,0 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0
package normalize
import (
"testing"
"github.com/stretchr/testify/require"
)
func TestSanitizeDropSanitization(t *testing.T) {
require.Equal(t, "", NormalizeLabel(""))
require.Equal(t, "_test", NormalizeLabel("_test"))
require.Equal(t, "key_0test", NormalizeLabel("0test"))
require.Equal(t, "test", NormalizeLabel("test"))
require.Equal(t, "test__", NormalizeLabel("test_/"))
require.Equal(t, "__test", NormalizeLabel("__test"))
}

View file

@ -1,21 +1,23 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0
package normalize
package prometheus // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometheus"
import (
"strings"
"unicode"
"go.opentelemetry.io/collector/featuregate"
"go.opentelemetry.io/collector/pdata/pmetric"
)
// The map to translate OTLP units to Prometheus units.
// The map to translate OTLP units to Prometheus units
// OTLP metrics use the c/s notation as specified at https://ucum.org/ucum.html
// (See also https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/semantic_conventions/README.md#instrument-units)
// Prometheus best practices for units: https://prometheus.io/docs/practices/naming/#base-units
// OpenMetrics specification for units: https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#units-and-base-units
var unitMap = map[string]string{
// Time
"d": "days",
"h": "hours",
@ -35,11 +37,6 @@ var unitMap = map[string]string{
"MBy": "megabytes",
"GBy": "gigabytes",
"TBy": "terabytes",
"B": "bytes",
"KB": "kilobytes",
"MB": "megabytes",
"GB": "gigabytes",
"TB": "terabytes",
// SI
"m": "meters",
@ -54,11 +51,10 @@ var unitMap = map[string]string{
"Hz": "hertz",
"1": "",
"%": "percent",
"$": "dollars",
}
// The map that translates the "per" unit.
// Example: s => per second (singular).
// The map that translates the "per" unit
// Example: s => per second (singular)
var perUnitMap = map[string]string{
"s": "second",
"m": "minute",
@ -69,7 +65,14 @@ var perUnitMap = map[string]string{
"y": "year",
}
// Build a Prometheus-compliant metric name for the specified metric.
var normalizeNameGate = featuregate.GlobalRegistry().MustRegister(
"pkg.translator.prometheus.NormalizeName",
featuregate.StageBeta,
featuregate.WithRegisterDescription("Controls whether metrics names are automatically normalized to follow Prometheus naming convention"),
featuregate.WithRegisterReferenceURL("https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/8950"),
)
// BuildCompliantName builds a Prometheus-compliant metric name for the specified metric
//
// Metric name is prefixed with specified namespace and underscore (if any).
// Namespace is not cleaned up. Make sure specified namespace follows Prometheus
@ -77,7 +80,33 @@ var perUnitMap = map[string]string{
//
// See rules at https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels
// and https://prometheus.io/docs/practices/naming/#metric-and-label-naming
func BuildPromCompliantName(metric pmetric.Metric, namespace string) string {
func BuildCompliantName(metric pmetric.Metric, namespace string, addMetricSuffixes bool) string {
var metricName string
// Full normalization following standard Prometheus naming conventions
if addMetricSuffixes && normalizeNameGate.IsEnabled() {
return normalizeName(metric, namespace)
}
// Simple case (no full normalization, no units, etc.), we simply trim out forbidden chars
metricName = RemovePromForbiddenRunes(metric.Name())
// Namespace?
if namespace != "" {
return namespace + "_" + metricName
}
// Metric name starts with a digit? Prefix it with an underscore
if metricName != "" && unicode.IsDigit(rune(metricName[0])) {
metricName = "_" + metricName
}
return metricName
}
// Build a normalized name for the specified metric
func normalizeName(metric pmetric.Metric, namespace string) string {
// Split metric name in "tokens" (remove all non-alphanumeric)
nameTokens := strings.FieldsFunc(
metric.Name(),
@ -202,7 +231,7 @@ func removeSuffix(tokens []string, suffix string) []string {
return tokens
}
// Clean up specified string so it's Prometheus compliant.
// Clean up specified string so it's Prometheus compliant
func CleanUpString(s string) string {
return strings.Join(strings.FieldsFunc(s, func(r rune) bool { return !unicode.IsLetter(r) && !unicode.IsDigit(r) }), "_")
}
@ -211,8 +240,8 @@ func RemovePromForbiddenRunes(s string) string {
return strings.Join(strings.FieldsFunc(s, func(r rune) bool { return !unicode.IsLetter(r) && !unicode.IsDigit(r) && r != '_' && r != ':' }), "_")
}
// Retrieve the Prometheus "basic" unit corresponding to the specified "basic" unit.
// Returns the specified unit if not found in unitMap.
// Retrieve the Prometheus "basic" unit corresponding to the specified "basic" unit
// Returns the specified unit if not found in unitMap
func unitMapGetOrDefault(unit string) string {
if promUnit, ok := unitMap[unit]; ok {
return promUnit
@ -220,8 +249,8 @@ func unitMapGetOrDefault(unit string) string {
return unit
}
// Retrieve the Prometheus "per" unit corresponding to the specified "per" unit.
// Returns the specified unit if not found in perUnitMap.
// Retrieve the Prometheus "per" unit corresponding to the specified "per" unit
// Returns the specified unit if not found in perUnitMap
func perUnitMapGetOrDefault(perUnit string) string {
if promPerUnit, ok := perUnitMap[perUnit]; ok {
return promPerUnit
@ -229,7 +258,7 @@ func perUnitMapGetOrDefault(perUnit string) string {
return perUnit
}
// Returns whether the slice contains the specified value.
// Returns whether the slice contains the specified value
func contains(slice []string, value string) bool {
for _, sliceEntry := range slice {
if sliceEntry == value {
@ -239,7 +268,7 @@ func contains(slice []string, value string) bool {
return false
}
// Remove the specified value from the slice.
// Remove the specified value from the slice
func removeItem(slice []string, value string) []string {
newSlice := make([]string, 0, len(slice))
for _, sliceEntry := range slice {
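A hedged sketch of how the new addMetricSuffixes flag to BuildCompliantName above changes the output, written as if it lived next to that function; sketchCounter mirrors the removed createCounter test helper and exists only for this illustration.

package prometheus

import (
    "testing"

    "github.com/stretchr/testify/require"
    "go.opentelemetry.io/collector/pdata/pmetric"
)

// sketchCounter builds a monotonic sum metric with the given name and unit.
func sketchCounter(name, unit string) pmetric.Metric {
    m := pmetric.NewMetrics().ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty()
    m.SetEmptySum().SetIsMonotonic(true)
    m.SetName(name)
    m.SetUnit(unit)
    return m
}

func TestBuildCompliantNameSketch(t *testing.T) {
    // Full normalization: suffixes enabled and the NormalizeName gate at its Beta default.
    require.Equal(t, "system_io_bytes_total", BuildCompliantName(sketchCounter("system.io", "By"), "", true))

    // No suffix generation: only forbidden runes are replaced and the namespace is prefixed.
    require.Equal(t, "system_io", BuildCompliantName(sketchCounter("system.io", "By"), "", false))
    require.Equal(t, "system_network_io", BuildCompliantName(sketchCounter("network.io", "By"), "system", false))
}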

View file

@ -1,180 +0,0 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0
package normalize
import (
"testing"
"github.com/stretchr/testify/require"
"go.opentelemetry.io/collector/pdata/pmetric"
)
func TestByte(t *testing.T) {
require.Equal(t, "system_filesystem_usage_bytes", BuildPromCompliantName(createGauge("system.filesystem.usage", "By"), ""))
}
func TestByteCounter(t *testing.T) {
require.Equal(t, "system_io_bytes_total", BuildPromCompliantName(createCounter("system.io", "By"), ""))
require.Equal(t, "network_transmitted_bytes_total", BuildPromCompliantName(createCounter("network_transmitted_bytes_total", "By"), ""))
}
func TestWhiteSpaces(t *testing.T) {
require.Equal(t, "system_filesystem_usage_bytes", BuildPromCompliantName(createGauge("\t system.filesystem.usage ", " By\t"), ""))
}
func TestNonStandardUnit(t *testing.T) {
require.Equal(t, "system_network_dropped", BuildPromCompliantName(createGauge("system.network.dropped", "{packets}"), ""))
}
func TestNonStandardUnitCounter(t *testing.T) {
require.Equal(t, "system_network_dropped_total", BuildPromCompliantName(createCounter("system.network.dropped", "{packets}"), ""))
}
func TestBrokenUnit(t *testing.T) {
require.Equal(t, "system_network_dropped_packets", BuildPromCompliantName(createGauge("system.network.dropped", "packets"), ""))
require.Equal(t, "system_network_packets_dropped", BuildPromCompliantName(createGauge("system.network.packets.dropped", "packets"), ""))
require.Equal(t, "system_network_packets", BuildPromCompliantName(createGauge("system.network.packets", "packets"), ""))
}
func TestBrokenUnitCounter(t *testing.T) {
require.Equal(t, "system_network_dropped_packets_total", BuildPromCompliantName(createCounter("system.network.dropped", "packets"), ""))
require.Equal(t, "system_network_packets_dropped_total", BuildPromCompliantName(createCounter("system.network.packets.dropped", "packets"), ""))
require.Equal(t, "system_network_packets_total", BuildPromCompliantName(createCounter("system.network.packets", "packets"), ""))
}
func TestRatio(t *testing.T) {
require.Equal(t, "hw_gpu_memory_utilization_ratio", BuildPromCompliantName(createGauge("hw.gpu.memory.utilization", "1"), ""))
require.Equal(t, "hw_fan_speed_ratio", BuildPromCompliantName(createGauge("hw.fan.speed_ratio", "1"), ""))
require.Equal(t, "objects_total", BuildPromCompliantName(createCounter("objects", "1"), ""))
}
func TestHertz(t *testing.T) {
require.Equal(t, "hw_cpu_speed_limit_hertz", BuildPromCompliantName(createGauge("hw.cpu.speed_limit", "Hz"), ""))
}
func TestPer(t *testing.T) {
require.Equal(t, "broken_metric_speed_km_per_hour", BuildPromCompliantName(createGauge("broken.metric.speed", "km/h"), ""))
require.Equal(t, "astro_light_speed_limit_meters_per_second", BuildPromCompliantName(createGauge("astro.light.speed_limit", "m/s"), ""))
}
func TestPercent(t *testing.T) {
require.Equal(t, "broken_metric_success_ratio_percent", BuildPromCompliantName(createGauge("broken.metric.success_ratio", "%"), ""))
require.Equal(t, "broken_metric_success_percent", BuildPromCompliantName(createGauge("broken.metric.success_percent", "%"), ""))
}
func TestDollar(t *testing.T) {
require.Equal(t, "crypto_bitcoin_value_dollars", BuildPromCompliantName(createGauge("crypto.bitcoin.value", "$"), ""))
require.Equal(t, "crypto_bitcoin_value_dollars", BuildPromCompliantName(createGauge("crypto.bitcoin.value.dollars", "$"), ""))
}
func TestEmpty(t *testing.T) {
require.Equal(t, "test_metric_no_unit", BuildPromCompliantName(createGauge("test.metric.no_unit", ""), ""))
require.Equal(t, "test_metric_spaces", BuildPromCompliantName(createGauge("test.metric.spaces", " \t "), ""))
}
func TestUnsupportedRunes(t *testing.T) {
require.Equal(t, "unsupported_metric_temperature_F", BuildPromCompliantName(createGauge("unsupported.metric.temperature", "°F"), ""))
require.Equal(t, "unsupported_metric_weird", BuildPromCompliantName(createGauge("unsupported.metric.weird", "+=.:,!* & #"), ""))
require.Equal(t, "unsupported_metric_redundant_test_per_C", BuildPromCompliantName(createGauge("unsupported.metric.redundant", "__test $/°C"), ""))
}
func TestOtelReceivers(t *testing.T) {
require.Equal(t, "active_directory_ds_replication_network_io_bytes_total", BuildPromCompliantName(createCounter("active_directory.ds.replication.network.io", "By"), ""))
require.Equal(t, "active_directory_ds_replication_sync_object_pending_total", BuildPromCompliantName(createCounter("active_directory.ds.replication.sync.object.pending", "{objects}"), ""))
require.Equal(t, "active_directory_ds_replication_object_rate_per_second", BuildPromCompliantName(createGauge("active_directory.ds.replication.object.rate", "{objects}/s"), ""))
require.Equal(t, "active_directory_ds_name_cache_hit_rate_percent", BuildPromCompliantName(createGauge("active_directory.ds.name_cache.hit_rate", "%"), ""))
require.Equal(t, "active_directory_ds_ldap_bind_last_successful_time_milliseconds", BuildPromCompliantName(createGauge("active_directory.ds.ldap.bind.last_successful.time", "ms"), ""))
require.Equal(t, "apache_current_connections", BuildPromCompliantName(createGauge("apache.current_connections", "connections"), ""))
require.Equal(t, "apache_workers_connections", BuildPromCompliantName(createGauge("apache.workers", "connections"), ""))
require.Equal(t, "apache_requests_total", BuildPromCompliantName(createCounter("apache.requests", "1"), ""))
require.Equal(t, "bigip_virtual_server_request_count_total", BuildPromCompliantName(createCounter("bigip.virtual_server.request.count", "{requests}"), ""))
require.Equal(t, "system_cpu_utilization_ratio", BuildPromCompliantName(createGauge("system.cpu.utilization", "1"), ""))
require.Equal(t, "system_disk_operation_time_seconds_total", BuildPromCompliantName(createCounter("system.disk.operation_time", "s"), ""))
require.Equal(t, "system_cpu_load_average_15m_ratio", BuildPromCompliantName(createGauge("system.cpu.load_average.15m", "1"), ""))
require.Equal(t, "memcached_operation_hit_ratio_percent", BuildPromCompliantName(createGauge("memcached.operation_hit_ratio", "%"), ""))
require.Equal(t, "mongodbatlas_process_asserts_per_second", BuildPromCompliantName(createGauge("mongodbatlas.process.asserts", "{assertions}/s"), ""))
require.Equal(t, "mongodbatlas_process_journaling_data_files_mebibytes", BuildPromCompliantName(createGauge("mongodbatlas.process.journaling.data_files", "MiBy"), ""))
require.Equal(t, "mongodbatlas_process_network_io_bytes_per_second", BuildPromCompliantName(createGauge("mongodbatlas.process.network.io", "By/s"), ""))
require.Equal(t, "mongodbatlas_process_oplog_rate_gibibytes_per_hour", BuildPromCompliantName(createGauge("mongodbatlas.process.oplog.rate", "GiBy/h"), ""))
require.Equal(t, "mongodbatlas_process_db_query_targeting_scanned_per_returned", BuildPromCompliantName(createGauge("mongodbatlas.process.db.query_targeting.scanned_per_returned", "{scanned}/{returned}"), ""))
require.Equal(t, "nginx_requests", BuildPromCompliantName(createGauge("nginx.requests", "requests"), ""))
require.Equal(t, "nginx_connections_accepted", BuildPromCompliantName(createGauge("nginx.connections_accepted", "connections"), ""))
require.Equal(t, "nsxt_node_memory_usage_kilobytes", BuildPromCompliantName(createGauge("nsxt.node.memory.usage", "KBy"), ""))
require.Equal(t, "redis_latest_fork_microseconds", BuildPromCompliantName(createGauge("redis.latest_fork", "us"), ""))
}
func TestTrimPromSuffixes(t *testing.T) {
require.Equal(t, "active_directory_ds_replication_network_io", TrimPromSuffixes("active_directory_ds_replication_network_io_bytes_total", pmetric.MetricTypeSum, "bytes"))
require.Equal(t, "active_directory_ds_name_cache_hit_rate", TrimPromSuffixes("active_directory_ds_name_cache_hit_rate_percent", pmetric.MetricTypeGauge, "percent"))
require.Equal(t, "active_directory_ds_ldap_bind_last_successful_time", TrimPromSuffixes("active_directory_ds_ldap_bind_last_successful_time_milliseconds", pmetric.MetricTypeGauge, "milliseconds"))
require.Equal(t, "apache_requests", TrimPromSuffixes("apache_requests_total", pmetric.MetricTypeSum, "1"))
require.Equal(t, "system_cpu_utilization", TrimPromSuffixes("system_cpu_utilization_ratio", pmetric.MetricTypeGauge, "ratio"))
require.Equal(t, "mongodbatlas_process_journaling_data_files", TrimPromSuffixes("mongodbatlas_process_journaling_data_files_mebibytes", pmetric.MetricTypeGauge, "mebibytes"))
require.Equal(t, "mongodbatlas_process_network_io", TrimPromSuffixes("mongodbatlas_process_network_io_bytes_per_second", pmetric.MetricTypeGauge, "bytes_per_second"))
require.Equal(t, "mongodbatlas_process_oplog_rate", TrimPromSuffixes("mongodbatlas_process_oplog_rate_gibibytes_per_hour", pmetric.MetricTypeGauge, "gibibytes_per_hour"))
require.Equal(t, "nsxt_node_memory_usage", TrimPromSuffixes("nsxt_node_memory_usage_kilobytes", pmetric.MetricTypeGauge, "kilobytes"))
require.Equal(t, "redis_latest_fork", TrimPromSuffixes("redis_latest_fork_microseconds", pmetric.MetricTypeGauge, "microseconds"))
require.Equal(t, "up", TrimPromSuffixes("up", pmetric.MetricTypeGauge, ""))
// These are not necessarily valid OM units, only tested for the sake of completeness.
require.Equal(t, "active_directory_ds_replication_sync_object_pending", TrimPromSuffixes("active_directory_ds_replication_sync_object_pending_total", pmetric.MetricTypeSum, "{objects}"))
require.Equal(t, "apache_current", TrimPromSuffixes("apache_current_connections", pmetric.MetricTypeGauge, "connections"))
require.Equal(t, "bigip_virtual_server_request_count", TrimPromSuffixes("bigip_virtual_server_request_count_total", pmetric.MetricTypeSum, "{requests}"))
require.Equal(t, "mongodbatlas_process_db_query_targeting_scanned_per_returned", TrimPromSuffixes("mongodbatlas_process_db_query_targeting_scanned_per_returned", pmetric.MetricTypeGauge, "{scanned}/{returned}"))
require.Equal(t, "nginx_connections_accepted", TrimPromSuffixes("nginx_connections_accepted", pmetric.MetricTypeGauge, "connections"))
require.Equal(t, "apache_workers", TrimPromSuffixes("apache_workers_connections", pmetric.MetricTypeGauge, "connections"))
require.Equal(t, "nginx", TrimPromSuffixes("nginx_requests", pmetric.MetricTypeGauge, "requests"))
// Units shouldn't be trimmed if the unit is not a direct match with the suffix, i.e., a suffix "_seconds" shouldn't be removed if unit is "sec" or "s"
require.Equal(t, "system_cpu_load_average_15m_ratio", TrimPromSuffixes("system_cpu_load_average_15m_ratio", pmetric.MetricTypeGauge, "1"))
require.Equal(t, "mongodbatlas_process_asserts_per_second", TrimPromSuffixes("mongodbatlas_process_asserts_per_second", pmetric.MetricTypeGauge, "{assertions}/s"))
require.Equal(t, "memcached_operation_hit_ratio_percent", TrimPromSuffixes("memcached_operation_hit_ratio_percent", pmetric.MetricTypeGauge, "%"))
require.Equal(t, "active_directory_ds_replication_object_rate_per_second", TrimPromSuffixes("active_directory_ds_replication_object_rate_per_second", pmetric.MetricTypeGauge, "{objects}/s"))
require.Equal(t, "system_disk_operation_time_seconds", TrimPromSuffixes("system_disk_operation_time_seconds_total", pmetric.MetricTypeSum, "s"))
}
func TestNamespace(t *testing.T) {
require.Equal(t, "space_test", BuildPromCompliantName(createGauge("test", ""), "space"))
require.Equal(t, "space_test", BuildPromCompliantName(createGauge("#test", ""), "space"))
}
func TestCleanUpString(t *testing.T) {
require.Equal(t, "", CleanUpString(""))
require.Equal(t, "a_b", CleanUpString("a b"))
require.Equal(t, "hello_world", CleanUpString("hello, world!"))
require.Equal(t, "hello_you_2", CleanUpString("hello you 2"))
require.Equal(t, "1000", CleanUpString("$1000"))
require.Equal(t, "", CleanUpString("*+$^=)"))
}
func TestUnitMapGetOrDefault(t *testing.T) {
require.Equal(t, "", unitMapGetOrDefault(""))
require.Equal(t, "seconds", unitMapGetOrDefault("s"))
require.Equal(t, "invalid", unitMapGetOrDefault("invalid"))
}
func TestPerUnitMapGetOrDefault(t *testing.T) {
require.Equal(t, "", perUnitMapGetOrDefault(""))
require.Equal(t, "second", perUnitMapGetOrDefault("s"))
require.Equal(t, "invalid", perUnitMapGetOrDefault("invalid"))
}
func TestRemoveItem(t *testing.T) {
require.Equal(t, []string{}, removeItem([]string{}, "test"))
require.Equal(t, []string{}, removeItem([]string{}, ""))
require.Equal(t, []string{"a", "b", "c"}, removeItem([]string{"a", "b", "c"}, "d"))
require.Equal(t, []string{"a", "b", "c"}, removeItem([]string{"a", "b", "c"}, ""))
require.Equal(t, []string{"a", "b"}, removeItem([]string{"a", "b", "c"}, "c"))
require.Equal(t, []string{"a", "c"}, removeItem([]string{"a", "b", "c"}, "b"))
require.Equal(t, []string{"b", "c"}, removeItem([]string{"a", "b", "c"}, "a"))
}
func TestBuildPromCompliantName(t *testing.T) {
require.Equal(t, "system_io_bytes_total", BuildPromCompliantName(createCounter("system.io", "By"), ""))
require.Equal(t, "system_network_io_bytes_total", BuildPromCompliantName(createCounter("network.io", "By"), "system"))
require.Equal(t, "_3_14_digits", BuildPromCompliantName(createGauge("3.14 digits", ""), ""))
require.Equal(t, "envoy_rule_engine_zlib_buf_error", BuildPromCompliantName(createGauge("envoy__rule_engine_zlib_buf_error", ""), ""))
require.Equal(t, "foo_bar", BuildPromCompliantName(createGauge(":foo::bar", ""), ""))
require.Equal(t, "foo_bar_total", BuildPromCompliantName(createCounter(":foo::bar", ""), ""))
}

View file

@ -1,34 +0,0 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0
package normalize
import (
"go.opentelemetry.io/collector/pdata/pmetric"
)
var ilm pmetric.ScopeMetrics
func init() {
metrics := pmetric.NewMetrics()
resourceMetrics := metrics.ResourceMetrics().AppendEmpty()
ilm = resourceMetrics.ScopeMetrics().AppendEmpty()
}
// Returns a new Metric of type "Gauge" with specified name and unit.
func createGauge(name, unit string) pmetric.Metric {
gauge := ilm.Metrics().AppendEmpty()
gauge.SetName(name)
gauge.SetUnit(unit)
gauge.SetEmptyGauge()
return gauge
}
// Returns a new Metric of type Monotonic Sum with specified name and unit.
func createCounter(name, unit string) pmetric.Metric {
counter := ilm.Metrics().AppendEmpty()
counter.SetEmptySum().SetIsMonotonic(true)
counter.SetName(name)
counter.SetUnit(unit)
return counter
}

View file

@ -0,0 +1,90 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0
package prometheus // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometheus"
import "strings"
var wordToUCUM = map[string]string{
// Time
"days": "d",
"hours": "h",
"minutes": "min",
"seconds": "s",
"milliseconds": "ms",
"microseconds": "us",
"nanoseconds": "ns",
// Bytes
"bytes": "By",
"kibibytes": "KiBy",
"mebibytes": "MiBy",
"gibibytes": "GiBy",
"tibibytes": "TiBy",
"kilobytes": "KBy",
"megabytes": "MBy",
"gigabytes": "GBy",
"terabytes": "TBy",
// SI
"meters": "m",
"volts": "V",
"amperes": "A",
"joules": "J",
"watts": "W",
"grams": "g",
// Misc
"celsius": "Cel",
"hertz": "Hz",
"ratio": "1",
"percent": "%",
}
// The map that translates the "per" unit
// Example: per_second (singular) => /s
var perWordToUCUM = map[string]string{
"second": "s",
"minute": "m",
"hour": "h",
"day": "d",
"week": "w",
"month": "mo",
"year": "y",
}
// UnitWordToUCUM converts english unit words to UCUM units:
// https://ucum.org/ucum#section-Alphabetic-Index-By-Symbol
// It also handles rates, such as meters_per_second, by translating the first
// word to UCUM, and the "per" word to UCUM. It joins them with a "/" between.
func UnitWordToUCUM(unit string) string {
unitTokens := strings.SplitN(unit, "_per_", 2)
if len(unitTokens) == 0 {
return ""
}
ucumUnit := wordToUCUMOrDefault(unitTokens[0])
if len(unitTokens) > 1 && unitTokens[1] != "" {
ucumUnit += "/" + perWordToUCUMOrDefault(unitTokens[1])
}
return ucumUnit
}
// wordToUCUMOrDefault retrieves the Prometheus "basic" unit corresponding to
// the specified "basic" unit. Returns the specified unit if not found in
// wordToUCUM.
func wordToUCUMOrDefault(unit string) string {
if promUnit, ok := wordToUCUM[unit]; ok {
return promUnit
}
return unit
}
// perWordToUCUMOrDefault retrieve the Prometheus "per" unit corresponding to
// the specified "per" unit. Returns the specified unit if not found in perWordToUCUM.
func perWordToUCUMOrDefault(perUnit string) string {
if promPerUnit, ok := perWordToUCUM[perUnit]; ok {
return promPerUnit
}
return perUnit
}
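A few hedged examples of the conversions implied by the maps above, written as a small test-style sketch; unknown words are expected to pass through unchanged.

package prometheus

import (
    "testing"

    "github.com/stretchr/testify/require"
)

func TestUnitWordToUCUMSketch(t *testing.T) {
    require.Equal(t, "ms", UnitWordToUCUM("milliseconds"))
    require.Equal(t, "%", UnitWordToUCUM("percent"))
    require.Equal(t, "m/s", UnitWordToUCUM("meters_per_second"))
    require.Equal(t, "requests/s", UnitWordToUCUM("requests_per_second")) // unknown word passes through
}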

View file

@ -71,8 +71,8 @@ func (a ByLabelName) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
// creates a new TimeSeries in the map if not found and returns the time series signature.
// tsMap will be unmodified if either labels or sample is nil, but can still be modified if the exemplar is nil.
func addSample(tsMap map[string]*prompb.TimeSeries, sample *prompb.Sample, labels []prompb.Label,
datatype string,
) string {
datatype string) string {
if sample == nil || labels == nil || tsMap == nil {
return ""
}
@ -132,7 +132,14 @@ func addExemplar(tsMap map[string]*prompb.TimeSeries, bucketBounds []bucketBound
// the label slice should not contain duplicate label names; this method sorts the slice by label name before creating
// the signature.
func timeSeriesSignature(datatype string, labels *[]prompb.Label) string {
length := len(datatype)
for _, lb := range *labels {
length += 2 + len(lb.GetName()) + len(lb.GetValue())
}
b := strings.Builder{}
b.Grow(length)
b.WriteString(datatype)
sort.Sort(ByLabelName(*labels))
@ -151,8 +158,22 @@ func timeSeriesSignature(datatype string, labels *[]prompb.Label) string {
// Unpaired string value is ignored. String pairs overwrites OTLP labels if collision happens, and the overwrite is
// logged. Resultant label names are sanitized.
func createAttributes(resource pcommon.Resource, attributes pcommon.Map, externalLabels map[string]string, extras ...string) []prompb.Label {
serviceName, haveServiceName := resource.Attributes().Get(conventions.AttributeServiceName)
instance, haveInstanceID := resource.Attributes().Get(conventions.AttributeServiceInstanceID)
// Calculate the maximum possible number of labels we could return so we can preallocate l
maxLabelCount := attributes.Len() + len(externalLabels) + len(extras)/2
if haveServiceName {
maxLabelCount++
}
if haveInstanceID {
maxLabelCount++
}
// map ensures no duplicate label name
l := map[string]prompb.Label{}
l := make(map[string]string, maxLabelCount)
// Ensure attributes are sorted by key for consistent merging of keys which
// collide when sanitized.
@ -164,35 +185,25 @@ func createAttributes(resource pcommon.Resource, attributes pcommon.Map, externa
sort.Stable(ByLabelName(labels))
for _, label := range labels {
finalKey := prometheustranslator.NormalizeLabel(label.Name)
var finalKey = prometheustranslator.NormalizeLabel(label.Name)
if existingLabel, alreadyExists := l[finalKey]; alreadyExists {
existingLabel.Value = existingLabel.Value + ";" + label.Value
l[finalKey] = existingLabel
l[finalKey] = existingLabel + ";" + label.Value
} else {
l[finalKey] = prompb.Label{
Name: finalKey,
Value: label.Value,
}
l[finalKey] = label.Value
}
}
// Map service.name + service.namespace to job
if serviceName, ok := resource.Attributes().Get(conventions.AttributeServiceName); ok {
if haveServiceName {
val := serviceName.AsString()
if serviceNamespace, ok := resource.Attributes().Get(conventions.AttributeServiceNamespace); ok {
val = fmt.Sprintf("%s/%s", serviceNamespace.AsString(), val)
}
l[model.JobLabel] = prompb.Label{
Name: model.JobLabel,
Value: val,
}
l[model.JobLabel] = val
}
// Map service.instance.id to instance
if instance, ok := resource.Attributes().Get(conventions.AttributeServiceInstanceID); ok {
l[model.InstanceLabel] = prompb.Label{
Name: model.InstanceLabel,
Value: instance.AsString(),
}
if haveInstanceID {
l[model.InstanceLabel] = instance.AsString()
}
for key, value := range externalLabels {
// External labels have already been sanitized
@ -200,10 +211,7 @@ func createAttributes(resource pcommon.Resource, attributes pcommon.Map, externa
// Skip external labels if they are overridden by metric attributes
continue
}
l[key] = prompb.Label{
Name: key,
Value: value,
}
l[key] = value
}
for i := 0; i < len(extras); i += 2 {
@ -219,15 +227,12 @@ func createAttributes(resource pcommon.Resource, attributes pcommon.Map, externa
if !(len(name) > 4 && name[:2] == "__" && name[len(name)-2:] == "__") {
name = prometheustranslator.NormalizeLabel(name)
}
l[name] = prompb.Label{
Name: name,
Value: extras[i+1],
}
l[name] = extras[i+1]
}
s := make([]prompb.Label, 0, len(l))
for _, lb := range l {
s = append(s, lb)
for k, v := range l {
s = append(s, prompb.Label{Name: k, Value: v})
}
return s
@ -236,6 +241,7 @@ func createAttributes(resource pcommon.Resource, attributes pcommon.Map, externa
// isValidAggregationTemporality checks whether an OTel metric has a valid
// aggregation temporality for conversion to a Prometheus metric.
func isValidAggregationTemporality(metric pmetric.Metric) bool {
//exhaustive:enforce
switch metric.Type() {
case pmetric.MetricTypeGauge, pmetric.MetricTypeSummary:
return true
@ -254,7 +260,22 @@ func isValidAggregationTemporality(metric pmetric.Metric) bool {
func addSingleHistogramDataPoint(pt pmetric.HistogramDataPoint, resource pcommon.Resource, metric pmetric.Metric, settings Settings, tsMap map[string]*prompb.TimeSeries) {
timestamp := convertTimeStamp(pt.Timestamp())
// sum, count, and buckets of the histogram should append suffix to baseName
baseName := prometheustranslator.BuildPromCompliantName(metric, settings.Namespace)
baseName := prometheustranslator.BuildCompliantName(metric, settings.Namespace, settings.AddMetricSuffixes)
baseLabels := createAttributes(resource, pt.Attributes(), settings.ExternalLabels)
createLabels := func(nameSuffix string, extras ...string) []prompb.Label {
extraLabelCount := len(extras) / 2
labels := make([]prompb.Label, len(baseLabels), len(baseLabels)+extraLabelCount+1) // +1 for name
copy(labels, baseLabels)
for extrasIdx := 0; extrasIdx < extraLabelCount; extrasIdx++ {
labels = append(labels, prompb.Label{Name: extras[extrasIdx], Value: extras[extrasIdx+1]})
}
labels = append(labels, prompb.Label{Name: nameStr, Value: baseName + nameSuffix})
return labels
}
// If the sum is unset, it indicates the _sum metric point should be
// omitted
@ -268,7 +289,7 @@ func addSingleHistogramDataPoint(pt pmetric.HistogramDataPoint, resource pcommon
sum.Value = math.Float64frombits(value.StaleNaN)
}
sumlabels := createAttributes(resource, pt.Attributes(), settings.ExternalLabels, nameStr, baseName+sumStr)
sumlabels := createLabels(sumStr)
addSample(tsMap, sum, sumlabels, metric.Type().String())
}
@ -282,7 +303,7 @@ func addSingleHistogramDataPoint(pt pmetric.HistogramDataPoint, resource pcommon
count.Value = math.Float64frombits(value.StaleNaN)
}
countlabels := createAttributes(resource, pt.Attributes(), settings.ExternalLabels, nameStr, baseName+countStr)
countlabels := createLabels(countStr)
addSample(tsMap, count, countlabels, metric.Type().String())
// cumulative count for conversion to cumulative histogram
@ -304,7 +325,7 @@ func addSingleHistogramDataPoint(pt pmetric.HistogramDataPoint, resource pcommon
bucket.Value = math.Float64frombits(value.StaleNaN)
}
boundStr := strconv.FormatFloat(bound, 'f', -1, 64)
labels := createAttributes(resource, pt.Attributes(), settings.ExternalLabels, nameStr, baseName+bucketStr, leStr, boundStr)
labels := createLabels(bucketStr, leStr, boundStr)
sig := addSample(tsMap, bucket, labels, metric.Type().String())
bucketBounds = append(bucketBounds, bucketBoundsData{sig: sig, bound: bound})
@ -318,7 +339,7 @@ func addSingleHistogramDataPoint(pt pmetric.HistogramDataPoint, resource pcommon
} else {
infBucket.Value = float64(pt.Count())
}
infLabels := createAttributes(resource, pt.Attributes(), settings.ExternalLabels, nameStr, baseName+bucketStr, leStr, pInfStr)
infLabels := createLabels(bucketStr, leStr, pInfStr)
sig := addSample(tsMap, infBucket, infLabels, metric.Type().String())
bucketBounds = append(bucketBounds, bucketBoundsData{sig: sig, bound: math.Inf(1)})
@ -327,14 +348,8 @@ func addSingleHistogramDataPoint(pt pmetric.HistogramDataPoint, resource pcommon
// add _created time series if needed
startTimestamp := pt.StartTimestamp()
if settings.ExportCreatedMetric && startTimestamp != 0 {
createdLabels := createAttributes(
resource,
pt.Attributes(),
settings.ExternalLabels,
nameStr,
baseName+createdSuffix,
)
addCreatedTimeSeriesIfNeeded(tsMap, createdLabels, startTimestamp, metric.Type().String())
labels := createLabels(createdSuffix)
addCreatedTimeSeriesIfNeeded(tsMap, labels, startTimestamp, metric.Type().String())
}
}
@ -402,6 +417,7 @@ func getPromExemplars[T exemplarType](pt T) []prompb.Exemplar {
func mostRecentTimestampInMetric(metric pmetric.Metric) pcommon.Timestamp {
var ts pcommon.Timestamp
// handle individual metric based on type
//exhaustive:enforce
switch metric.Type() {
case pmetric.MetricTypeGauge:
dataPoints := metric.Gauge().DataPoints()
@ -441,11 +457,26 @@ func maxTimestamp(a, b pcommon.Timestamp) pcommon.Timestamp {
// addSingleSummaryDataPoint converts pt to len(QuantileValues) + 2 samples.
func addSingleSummaryDataPoint(pt pmetric.SummaryDataPoint, resource pcommon.Resource, metric pmetric.Metric, settings Settings,
tsMap map[string]*prompb.TimeSeries,
) {
tsMap map[string]*prompb.TimeSeries) {
timestamp := convertTimeStamp(pt.Timestamp())
// sum and count of the summary should append suffix to baseName
baseName := prometheustranslator.BuildPromCompliantName(metric, settings.Namespace)
baseName := prometheustranslator.BuildCompliantName(metric, settings.Namespace, settings.AddMetricSuffixes)
baseLabels := createAttributes(resource, pt.Attributes(), settings.ExternalLabels)
createLabels := func(name string, extras ...string) []prompb.Label {
extraLabelCount := len(extras) / 2
labels := make([]prompb.Label, len(baseLabels), len(baseLabels)+extraLabelCount+1) // +1 for name
copy(labels, baseLabels)
for extrasIdx := 0; extrasIdx < extraLabelCount; extrasIdx++ {
labels = append(labels, prompb.Label{Name: extras[extrasIdx], Value: extras[extrasIdx+1]})
}
labels = append(labels, prompb.Label{Name: nameStr, Value: name})
return labels
}
// treat sum as a sample in an individual TimeSeries
sum := &prompb.Sample{
Value: pt.Sum(),
@ -454,7 +485,7 @@ func addSingleSummaryDataPoint(pt pmetric.SummaryDataPoint, resource pcommon.Res
if pt.Flags().NoRecordedValue() {
sum.Value = math.Float64frombits(value.StaleNaN)
}
sumlabels := createAttributes(resource, pt.Attributes(), settings.ExternalLabels, nameStr, baseName+sumStr)
sumlabels := createLabels(baseName + sumStr)
addSample(tsMap, sum, sumlabels, metric.Type().String())
// treat count as a sample in an individual TimeSeries
@ -465,7 +496,7 @@ func addSingleSummaryDataPoint(pt pmetric.SummaryDataPoint, resource pcommon.Res
if pt.Flags().NoRecordedValue() {
count.Value = math.Float64frombits(value.StaleNaN)
}
countlabels := createAttributes(resource, pt.Attributes(), settings.ExternalLabels, nameStr, baseName+countStr)
countlabels := createLabels(baseName + countStr)
addSample(tsMap, count, countlabels, metric.Type().String())
// process each percentile/quantile
@ -479,20 +510,14 @@ func addSingleSummaryDataPoint(pt pmetric.SummaryDataPoint, resource pcommon.Res
quantile.Value = math.Float64frombits(value.StaleNaN)
}
percentileStr := strconv.FormatFloat(qt.Quantile(), 'f', -1, 64)
qtlabels := createAttributes(resource, pt.Attributes(), settings.ExternalLabels, nameStr, baseName, quantileStr, percentileStr)
qtlabels := createLabels(baseName, quantileStr, percentileStr)
addSample(tsMap, quantile, qtlabels, metric.Type().String())
}
// add _created time series if needed
startTimestamp := pt.StartTimestamp()
if settings.ExportCreatedMetric && startTimestamp != 0 {
createdLabels := createAttributes(
resource,
pt.Attributes(),
settings.ExternalLabels,
nameStr,
baseName+createdSuffix,
)
createdLabels := createLabels(baseName + createdSuffix)
addCreatedTimeSeriesIfNeeded(tsMap, createdLabels, startTimestamp, metric.Type().String())
}
}
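For readers skimming the hunks above: both addSingleHistogramDataPoint and addSingleSummaryDataPoint now build series labels through a small createLabels closure that copies the pre-computed resource labels once per data point and appends only the per-series extras and the metric name, instead of re-running createAttributes for every sample. A minimal, self-contained sketch of that pattern, using a hypothetical Label type in place of prompb.Label:

```go
package main

import "fmt"

// Label stands in for prompb.Label; illustration only.
type Label struct {
	Name, Value string
}

// newCreateLabels mirrors the createLabels closure from the diff: baseLabels
// are copied once, optional extra name/value pairs are appended, and the
// metric name label goes last. A sketch, not the upstream implementation.
func newCreateLabels(baseLabels []Label, nameLabelName string) func(name string, extras ...string) []Label {
	return func(name string, extras ...string) []Label {
		pairs := len(extras) / 2
		labels := make([]Label, len(baseLabels), len(baseLabels)+pairs+1) // +1 for the name label
		copy(labels, baseLabels)
		for i := 0; i < pairs; i++ {
			labels = append(labels, Label{Name: extras[2*i], Value: extras[2*i+1]})
		}
		return append(labels, Label{Name: nameLabelName, Value: name})
	}
}

func main() {
	createLabels := newCreateLabels([]Label{{Name: "job", Value: "otel"}}, "__name__")
	// e.g. the +Inf bucket of a histogram named http_request_duration_bucket.
	fmt.Println(createLabels("http_request_duration_bucket", "le", "+Inf"))
}
```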

View file

@ -60,15 +60,20 @@ func addSingleExponentialHistogramDataPoint(
// to Prometheus Native Histogram.
func exponentialToNativeHistogram(p pmetric.ExponentialHistogramDataPoint) (prompb.Histogram, error) {
scale := p.Scale()
if scale < -4 || scale > 8 {
if scale < -4 {
return prompb.Histogram{},
fmt.Errorf("cannot convert exponential to native histogram."+
" Scale must be <= 8 and >= -4, was %d", scale)
// TODO: downscale to 8 if scale > 8
" Scale must be >= -4, was %d", scale)
}
pSpans, pDeltas := convertBucketsLayout(p.Positive())
nSpans, nDeltas := convertBucketsLayout(p.Negative())
var scaleDown int32
if scale > 8 {
scaleDown = scale - 8
scale = 8
}
pSpans, pDeltas := convertBucketsLayout(p.Positive(), scaleDown)
nSpans, nDeltas := convertBucketsLayout(p.Negative(), scaleDown)
h := prompb.Histogram{
Schema: scale,
@ -106,17 +111,19 @@ func exponentialToNativeHistogram(p pmetric.ExponentialHistogramDataPoint) (prom
// The bucket indexes conversion was adjusted, since OTel exp. histogram bucket
// index 0 corresponds to the range (1, base] while Prometheus bucket index 0
// to the range (base^-1, 1].
func convertBucketsLayout(buckets pmetric.ExponentialHistogramDataPointBuckets) ([]prompb.BucketSpan, []int64) {
//
// scaleDown is the factor by which the buckets are scaled down. In other words 2^scaleDown buckets will be merged into one.
func convertBucketsLayout(buckets pmetric.ExponentialHistogramDataPointBuckets, scaleDown int32) ([]prompb.BucketSpan, []int64) {
bucketCounts := buckets.BucketCounts()
if bucketCounts.Len() == 0 {
return nil, nil
}
var (
spans []prompb.BucketSpan
deltas []int64
prevCount int64
nextBucketIdx int32
spans []prompb.BucketSpan
deltas []int64
count int64
prevCount int64
)
appendDelta := func(count int64) {
@ -125,34 +132,67 @@ func convertBucketsLayout(buckets pmetric.ExponentialHistogramDataPointBuckets)
prevCount = count
}
for i := 0; i < bucketCounts.Len(); i++ {
count := int64(bucketCounts.At(i))
// Let the compiler figure out that this is const during this function by
// moving it into a local variable.
numBuckets := bucketCounts.Len()
// The offset is scaled and adjusted by 1 as described above.
bucketIdx := buckets.Offset()>>scaleDown + 1
spans = append(spans, prompb.BucketSpan{
Offset: bucketIdx,
Length: 0,
})
for i := 0; i < numBuckets; i++ {
// The offset is scaled and adjusted by 1 as described above.
nextBucketIdx := (int32(i)+buckets.Offset())>>scaleDown + 1
if bucketIdx == nextBucketIdx { // We have not collected enough buckets to merge yet.
count += int64(bucketCounts.At(i))
continue
}
if count == 0 {
count = int64(bucketCounts.At(i))
continue
}
// The offset is adjusted by 1 as described above.
bucketIdx := int32(i) + buckets.Offset() + 1
delta := bucketIdx - nextBucketIdx
if i == 0 || delta > 2 {
// We have to create a new span, either because we are
// at the very beginning, or because we have found a gap
gap := nextBucketIdx - bucketIdx - 1
if gap > 2 {
// We have to create a new span, because we have found a gap
// of more than two buckets. The constant 2 is copied from the logic in
// https://github.com/prometheus/client_golang/blob/27f0506d6ebbb117b6b697d0552ee5be2502c5f2/prometheus/histogram.go#L1296
spans = append(spans, prompb.BucketSpan{
Offset: delta,
Offset: gap,
Length: 0,
})
} else {
// We have found a small gap (or no gap at all).
// Insert empty buckets as needed.
for j := int32(0); j < delta; j++ {
for j := int32(0); j < gap; j++ {
appendDelta(0)
}
}
appendDelta(count)
nextBucketIdx = bucketIdx + 1
count = int64(bucketCounts.At(i))
bucketIdx = nextBucketIdx
}
// Need to use the last item's index. The offset is scaled and adjusted by 1 as described above.
gap := (int32(numBuckets)+buckets.Offset()-1)>>scaleDown + 1 - bucketIdx
if gap > 2 {
// We have to create a new span, because we have found a gap
// of more than two buckets. The constant 2 is copied from the logic in
// https://github.com/prometheus/client_golang/blob/27f0506d6ebbb117b6b697d0552ee5be2502c5f2/prometheus/histogram.go#L1296
spans = append(spans, prompb.BucketSpan{
Offset: gap,
Length: 0,
})
} else {
// We have found a small gap (or no gap at all).
// Insert empty buckets as needed.
for j := int32(0); j < gap; j++ {
appendDelta(0)
}
}
appendDelta(count)
return spans, deltas
}
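The downscaling introduced above caps the native histogram schema at 8: when the OTel scale exceeds 8, 2^scaleDown adjacent OTel buckets are merged into one Prometheus bucket by right-shifting the bucket index, and the result is shifted by +1 because OTel bucket 0 covers (1, base] while the matching Prometheus bucket has index 1. A small sketch of just that index arithmetic (illustrative, not the upstream function):

```go
package main

import "fmt"

// promBucketIdx maps an OTel exponential-histogram bucket index (position i
// within the dense bucket array plus the array's offset) to the Prometheus
// native-histogram bucket index used in convertBucketsLayout: merge
// 2^scaleDown buckets via an arithmetic right shift, then shift by +1.
func promBucketIdx(i, offset, scaleDown int32) int32 {
	return (i+offset)>>scaleDown + 1
}

func main() {
	// With scaleDown = 2, groups of four adjacent OTel buckets land in the
	// same Prometheus bucket; their counts are summed before appendDelta runs.
	for i := int32(0); i < 8; i++ {
		fmt.Printf("otel idx %d -> prom idx %d\n", i, promBucketIdx(i, 0, 2))
	}
}
```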

View file

@ -22,6 +22,7 @@ type Settings struct {
ExternalLabels map[string]string
DisableTargetInfo bool
ExportCreatedMetric bool
AddMetricSuffixes bool
}
// FromMetrics converts pmetric.Metrics to prometheus remote write format.
@ -51,6 +52,7 @@ func FromMetrics(md pmetric.Metrics, settings Settings) (tsMap map[string]*promp
}
// handle individual metric based on type
//exhaustive:enforce
switch metric.Type() {
case pmetric.MetricTypeGauge:
dataPoints := metric.Gauge().DataPoints()
@ -81,7 +83,7 @@ func FromMetrics(md pmetric.Metrics, settings Settings) (tsMap map[string]*promp
if dataPoints.Len() == 0 {
errs = multierr.Append(errs, fmt.Errorf("empty data points. %s is dropped", metric.Name()))
}
name := prometheustranslator.BuildPromCompliantName(metric, settings.Namespace)
name := prometheustranslator.BuildCompliantName(metric, settings.Namespace, settings.AddMetricSuffixes)
for x := 0; x < dataPoints.Len(); x++ {
errs = multierr.Append(
errs,
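Settings gains an AddMetricSuffixes field which FromMetrics forwards to BuildCompliantName for every metric, so callers opt in to normalized names with unit/type suffixes. A hedged sketch of a caller follows; the import paths and the exact suffix produced are assumptions based on the translator's usual behaviour, not something stated in this diff:

```go
package main

import (
	"fmt"

	"go.opentelemetry.io/collector/pdata/pmetric"

	"github.com/prometheus/prometheus/storage/remote/otlptranslator/prometheusremotewrite"
)

func main() {
	md := pmetric.NewMetrics()
	m := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty()
	m.SetName("request_duration")
	m.SetUnit("s")
	m.SetEmptyGauge().DataPoints().AppendEmpty().SetDoubleValue(1.5)

	// With AddMetricSuffixes enabled the series name is expected to carry the
	// unit suffix (e.g. request_duration_seconds); with it disabled the name
	// stays request_duration.
	tsMap, err := prometheusremotewrite.FromMetrics(md, prometheusremotewrite.Settings{
		AddMetricSuffixes: true,
	})
	fmt.Println(len(tsMap), err)
}
```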

View file

@ -27,7 +27,7 @@ func addSingleGaugeNumberDataPoint(
settings Settings,
series map[string]*prompb.TimeSeries,
) {
name := prometheustranslator.BuildPromCompliantName(metric, settings.Namespace)
name := prometheustranslator.BuildCompliantName(metric, settings.Namespace, settings.AddMetricSuffixes)
labels := createAttributes(
resource,
pt.Attributes(),
@ -60,7 +60,7 @@ func addSingleSumNumberDataPoint(
settings Settings,
series map[string]*prompb.TimeSeries,
) {
name := prometheustranslator.BuildPromCompliantName(metric, settings.Namespace)
name := prometheustranslator.BuildCompliantName(metric, settings.Namespace, settings.AddMetricSuffixes)
labels := createAttributes(
resource,
pt.Attributes(),

View file

@ -1,6 +1,6 @@
#!/bin/bash
OTEL_VERSION=v0.81.0
OTEL_VERSION=v0.88.0
git clone https://github.com/open-telemetry/opentelemetry-collector-contrib ./tmp
cd ./tmp
@ -8,7 +8,8 @@ git checkout $OTEL_VERSION
cd ..
rm -rf ./prometheusremotewrite/*
cp -r ./tmp/pkg/translator/prometheusremotewrite/*.go ./prometheusremotewrite
rm -rf ./prometheusremotewrite/*_test.go
cp -r ./tmp/pkg/translator/prometheus/*.go ./prometheus
rm -rf ./prometheus/*_test.go
rm -rf ./tmp
sed -i '' 's#github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometheus#github.com/prometheus/prometheus/storage/remote/otlptranslator/prometheus#g' ./prometheusremotewrite/*.go

View file

@ -77,10 +77,7 @@ func NewStorage(l log.Logger, reg prometheus.Registerer, stCallback startTimeCal
}
func (s *Storage) Notify() {
for _, q := range s.rws.queues {
// These should all be non blocking
q.watcher.Notify()
}
s.rws.Notify()
}
// ApplyConfig updates the state as the new config requires.

View file

@ -14,7 +14,9 @@
package remote
import (
"fmt"
"net/url"
"sync"
"testing"
common_config "github.com/prometheus/common/config"
@ -147,3 +149,39 @@ func baseRemoteReadConfig(host string) *config.RemoteReadConfig {
}
return &cfg
}
// TestWriteStorageApplyConfigsDuringCommit helps detect races when
// ApplyConfig runs concurrently with Notify.
// See https://github.com/prometheus/prometheus/issues/12747
func TestWriteStorageApplyConfigsDuringCommit(t *testing.T) {
s := NewStorage(nil, nil, nil, t.TempDir(), defaultFlushDeadline, nil)
var wg sync.WaitGroup
wg.Add(2000)
start := make(chan struct{})
for i := 0; i < 1000; i++ {
go func(i int) {
<-start
conf := &config.Config{
GlobalConfig: config.DefaultGlobalConfig,
RemoteWriteConfigs: []*config.RemoteWriteConfig{
baseRemoteWriteConfig(fmt.Sprintf("http://test-%d.com", i)),
},
}
require.NoError(t, s.ApplyConfig(conf))
wg.Done()
}(i)
}
for i := 0; i < 1000; i++ {
go func() {
<-start
s.Notify()
wg.Done()
}()
}
close(start)
wg.Wait()
}
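The test above follows a standard race-detector recipe: park a large number of goroutines on a shared start channel, release them all at once, and let `go test -race` observe ApplyConfig and Notify running concurrently. A generic sketch of the same pattern with illustrative stand-in functions:

```go
package main

import (
	"fmt"
	"sync"
)

// raceBetween runs two operations concurrently from many goroutines that are
// released simultaneously via a shared start channel, maximising the overlap
// the race detector can observe. A sketch; apply and notify are stand-ins.
func raceBetween(apply, notify func(), n int) {
	var wg sync.WaitGroup
	start := make(chan struct{})
	wg.Add(2 * n)
	for i := 0; i < n; i++ {
		go func() { defer wg.Done(); <-start; apply() }()
		go func() { defer wg.Done(); <-start; notify() }()
	}
	close(start) // release every goroutine at once
	wg.Wait()
}

func main() {
	var mu sync.Mutex
	state := 0
	raceBetween(
		func() { mu.Lock(); state++; mu.Unlock() },
		func() { mu.Lock(); _ = state; mu.Unlock() },
		100,
	)
	fmt.Println("final state:", state)
}
```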

View file

@ -121,6 +121,16 @@ func (rws *WriteStorage) run() {
}
}
func (rws *WriteStorage) Notify() {
rws.mtx.Lock()
defer rws.mtx.Unlock()
for _, q := range rws.queues {
// These should all be non blocking
q.watcher.Notify()
}
}
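The fix itself is simply to read rws.queues under the same mutex that ApplyConfig takes when it swaps queues in and out, so the map is never iterated while it is being rebuilt. A stripped-down sketch of that invariant; field and method names are illustrative, not the upstream types:

```go
package main

import (
	"fmt"
	"sync"
)

type queue struct{ name string }

func (q *queue) Notify() {} // non-blocking, as with the WAL watcher

type writeStorage struct {
	mtx    sync.Mutex
	queues map[string]*queue
}

// Notify iterates the queues under the same mutex ApplyConfig holds while it
// replaces them, so the two can safely run concurrently.
func (ws *writeStorage) Notify() {
	ws.mtx.Lock()
	defer ws.mtx.Unlock()
	for _, q := range ws.queues {
		q.Notify()
	}
}

// ApplyConfig swaps in a new set of queues under the same lock.
func (ws *writeStorage) ApplyConfig(names []string) {
	ws.mtx.Lock()
	defer ws.mtx.Unlock()
	next := make(map[string]*queue, len(names))
	for _, n := range names {
		next[n] = &queue{name: n}
	}
	ws.queues = next
}

func main() {
	ws := &writeStorage{queues: map[string]*queue{}}
	ws.ApplyConfig([]string{"a", "b"})
	ws.Notify()
	fmt.Println(len(ws.queues))
}
```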
// ApplyConfig updates the state as the new config requires.
// Only stop & create queues which have changes.
func (rws *WriteStorage) ApplyConfig(conf *config.Config) error {

View file

@ -207,7 +207,9 @@ func (h *otlpWriteHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
return
}
prwMetricsMap, errs := otlptranslator.FromMetrics(req.Metrics(), otlptranslator.Settings{})
prwMetricsMap, errs := otlptranslator.FromMetrics(req.Metrics(), otlptranslator.Settings{
AddMetricSuffixes: true,
})
if errs != nil {
level.Warn(h.logger).Log("msg", "Error translating OTLP metrics to Prometheus write request", "err", errs)
}

View file

@ -15,6 +15,7 @@ package agent
import (
"context"
"errors"
"fmt"
"math"
"path/filepath"
@ -24,7 +25,6 @@ import (
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
"go.uber.org/atomic"
@ -263,7 +263,7 @@ func Open(l log.Logger, reg prometheus.Registerer, rs *remote.Storage, dir strin
w, err := wlog.NewSize(l, reg, dir, opts.WALSegmentSize, opts.WALCompression)
if err != nil {
return nil, errors.Wrap(err, "creating WAL")
return nil, fmt.Errorf("creating WAL: %w", err)
}
db := &DB{
@ -302,7 +302,7 @@ func Open(l log.Logger, reg prometheus.Registerer, rs *remote.Storage, dir strin
if err := db.replayWAL(); err != nil {
level.Warn(db.logger).Log("msg", "encountered WAL read error, attempting repair", "err", err)
if err := w.Repair(err); err != nil {
return nil, errors.Wrap(err, "repair corrupted WAL")
return nil, fmt.Errorf("repair corrupted WAL: %w", err)
}
level.Info(db.logger).Log("msg", "successfully repaired WAL")
}
@ -352,7 +352,7 @@ func (db *DB) replayWAL() error {
dir, startFrom, err := wlog.LastCheckpoint(db.wal.Dir())
if err != nil && err != record.ErrNotFound {
return errors.Wrap(err, "find last checkpoint")
return fmt.Errorf("find last checkpoint: %w", err)
}
multiRef := map[chunks.HeadSeriesRef]chunks.HeadSeriesRef{}
@ -360,7 +360,7 @@ func (db *DB) replayWAL() error {
if err == nil {
sr, err := wlog.NewSegmentsReader(dir)
if err != nil {
return errors.Wrap(err, "open checkpoint")
return fmt.Errorf("open checkpoint: %w", err)
}
defer func() {
if err := sr.Close(); err != nil {
@ -371,7 +371,7 @@ func (db *DB) replayWAL() error {
// A corrupted checkpoint is a hard error for now and requires user
// intervention. There's likely little data that can be recovered anyway.
if err := db.loadWAL(wlog.NewReader(sr), multiRef); err != nil {
return errors.Wrap(err, "backfill checkpoint")
return fmt.Errorf("backfill checkpoint: %w", err)
}
startFrom++
level.Info(db.logger).Log("msg", "WAL checkpoint loaded")
@ -380,14 +380,14 @@ func (db *DB) replayWAL() error {
// Find the last segment.
_, last, err := wlog.Segments(db.wal.Dir())
if err != nil {
return errors.Wrap(err, "finding WAL segments")
return fmt.Errorf("finding WAL segments: %w", err)
}
// Backfill segments from the most recent checkpoint onwards.
for i := startFrom; i <= last; i++ {
seg, err := wlog.OpenReadSegment(wlog.SegmentName(db.wal.Dir(), i))
if err != nil {
return errors.Wrap(err, fmt.Sprintf("open WAL segment: %d", i))
return fmt.Errorf("open WAL segment: %d: %w", i, err)
}
sr := wlog.NewSegmentBufReader(seg)
@ -432,7 +432,7 @@ func (db *DB) loadWAL(r *wlog.Reader, multiRef map[chunks.HeadSeriesRef]chunks.H
series, err = dec.Series(rec, series)
if err != nil {
errCh <- &wlog.CorruptionErr{
Err: errors.Wrap(err, "decode series"),
Err: fmt.Errorf("decode series: %w", err),
Segment: r.Segment(),
Offset: r.Offset(),
}
@ -444,7 +444,7 @@ func (db *DB) loadWAL(r *wlog.Reader, multiRef map[chunks.HeadSeriesRef]chunks.H
samples, err = dec.Samples(rec, samples)
if err != nil {
errCh <- &wlog.CorruptionErr{
Err: errors.Wrap(err, "decode samples"),
Err: fmt.Errorf("decode samples: %w", err),
Segment: r.Segment(),
Offset: r.Offset(),
}
@ -456,7 +456,7 @@ func (db *DB) loadWAL(r *wlog.Reader, multiRef map[chunks.HeadSeriesRef]chunks.H
histograms, err = dec.HistogramSamples(rec, histograms)
if err != nil {
errCh <- &wlog.CorruptionErr{
Err: errors.Wrap(err, "decode histogram samples"),
Err: fmt.Errorf("decode histogram samples: %w", err),
Segment: r.Segment(),
Offset: r.Offset(),
}
@ -468,7 +468,7 @@ func (db *DB) loadWAL(r *wlog.Reader, multiRef map[chunks.HeadSeriesRef]chunks.H
floatHistograms, err = dec.FloatHistogramSamples(rec, floatHistograms)
if err != nil {
errCh <- &wlog.CorruptionErr{
Err: errors.Wrap(err, "decode float histogram samples"),
Err: fmt.Errorf("decode float histogram samples: %w", err),
Segment: r.Segment(),
Offset: r.Offset(),
}
@ -482,7 +482,7 @@ func (db *DB) loadWAL(r *wlog.Reader, multiRef map[chunks.HeadSeriesRef]chunks.H
continue
default:
errCh <- &wlog.CorruptionErr{
Err: errors.Errorf("invalid record type %v", dec.Type(rec)),
Err: fmt.Errorf("invalid record type %v", dec.Type(rec)),
Segment: r.Segment(),
Offset: r.Offset(),
}
@ -568,7 +568,7 @@ func (db *DB) loadWAL(r *wlog.Reader, multiRef map[chunks.HeadSeriesRef]chunks.H
return err
default:
if r.Err() != nil {
return errors.Wrap(r.Err(), "read records")
return fmt.Errorf("read records: %w", r.Err())
}
return nil
}
@ -622,13 +622,13 @@ func (db *DB) truncate(mint int64) error {
first, last, err := wlog.Segments(db.wal.Dir())
if err != nil {
return errors.Wrap(err, "get segment range")
return fmt.Errorf("get segment range: %w", err)
}
// Start a new segment so low ingestion volume instances don't have more WAL
// than needed.
if _, err := db.wal.NextSegment(); err != nil {
return errors.Wrap(err, "next segment")
return fmt.Errorf("next segment: %w", err)
}
last-- // Never consider most recent segment for checkpoint
@ -656,10 +656,11 @@ func (db *DB) truncate(mint int64) error {
if _, err = wlog.Checkpoint(db.logger, db.wal, first, last, keep, mint); err != nil {
db.metrics.checkpointCreationFail.Inc()
if _, ok := errors.Cause(err).(*wlog.CorruptionErr); ok {
var cerr *wlog.CorruptionErr
if errors.As(err, &cerr) {
db.metrics.walCorruptionsTotal.Inc()
}
return errors.Wrap(err, "create checkpoint")
return fmt.Errorf("create checkpoint: %w", err)
}
if err := db.wal.Truncate(last + 1); err != nil {
// If truncating fails, we'll just try it again at the next checkpoint.
@ -780,11 +781,11 @@ func (a *appender) Append(ref storage.SeriesRef, l labels.Labels, t int64, v flo
// equivalent validation code in the TSDB's headAppender.
l = l.WithoutEmpty()
if l.IsEmpty() {
return 0, errors.Wrap(tsdb.ErrInvalidSample, "empty labelset")
return 0, fmt.Errorf("empty labelset: %w", tsdb.ErrInvalidSample)
}
if lbl, dup := l.HasDuplicateLabelNames(); dup {
return 0, errors.Wrap(tsdb.ErrInvalidSample, fmt.Sprintf(`label name "%s" is not unique`, lbl))
return 0, fmt.Errorf(`label name "%s" is not unique: %w`, lbl, tsdb.ErrInvalidSample)
}
var created bool
@ -841,7 +842,7 @@ func (a *appender) AppendExemplar(ref storage.SeriesRef, _ labels.Labels, e exem
e.Labels = e.Labels.WithoutEmpty()
if lbl, dup := e.Labels.HasDuplicateLabelNames(); dup {
return 0, errors.Wrap(tsdb.ErrInvalidExemplar, fmt.Sprintf(`label name "%s" is not unique`, lbl))
return 0, fmt.Errorf(`label name "%s" is not unique: %w`, lbl, tsdb.ErrInvalidExemplar)
}
// Exemplar label length does not include chars involved in text rendering such as quotes
@ -883,13 +884,13 @@ func (a *appender) AppendExemplar(ref storage.SeriesRef, _ labels.Labels, e exem
func (a *appender) AppendHistogram(ref storage.SeriesRef, l labels.Labels, t int64, h *histogram.Histogram, fh *histogram.FloatHistogram) (storage.SeriesRef, error) {
if h != nil {
if err := tsdb.ValidateHistogram(h); err != nil {
if err := h.Validate(); err != nil {
return 0, err
}
}
if fh != nil {
if err := tsdb.ValidateFloatHistogram(fh); err != nil {
if err := fh.Validate(); err != nil {
return 0, err
}
}
@ -903,11 +904,11 @@ func (a *appender) AppendHistogram(ref storage.SeriesRef, l labels.Labels, t int
// equivalent validation code in the TSDB's headAppender.
l = l.WithoutEmpty()
if l.IsEmpty() {
return 0, errors.Wrap(tsdb.ErrInvalidSample, "empty labelset")
return 0, fmt.Errorf("empty labelset: %w", tsdb.ErrInvalidSample)
}
if lbl, dup := l.HasDuplicateLabelNames(); dup {
return 0, errors.Wrap(tsdb.ErrInvalidSample, fmt.Sprintf(`label name "%s" is not unique`, lbl))
return 0, fmt.Errorf(`label name "%s" is not unique: %w`, lbl, tsdb.ErrInvalidSample)
}
var created bool
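This file is part of the repo-wide move off github.com/pkg/errors: errors.Wrap(err, "msg") becomes fmt.Errorf("msg: %w", err), and errors.Cause plus a type assertion becomes errors.As, which unwraps through any number of %w layers. A small self-contained sketch of both halves of that pattern (path and message are illustrative):

```go
package main

import (
	"errors"
	"fmt"
	"os"
)

// openWAL wraps a low-level failure with context using %w, the stdlib
// replacement for errors.Wrap.
func openWAL(path string) error {
	if _, err := os.Stat(path); err != nil {
		return fmt.Errorf("creating WAL: %w", err)
	}
	return nil
}

func main() {
	err := openWAL("/definitely/not/here")

	// errors.As replaces errors.Cause + type assertion and sees through %w.
	var pathErr *os.PathError
	fmt.Println(errors.As(err, &pathErr)) // true

	// errors.Is matches wrapped sentinel errors the same way.
	fmt.Println(errors.Is(err, os.ErrNotExist)) // true
}
```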

View file

@ -17,6 +17,7 @@ package tsdb
import (
"context"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
@ -128,8 +129,19 @@ type ChunkWriter interface {
// ChunkReader provides reading access of serialized time series data.
type ChunkReader interface {
// Chunk returns the series data chunk with the given reference.
Chunk(meta chunks.Meta) (chunkenc.Chunk, error)
// ChunkOrIterable returns the series data for the given chunks.Meta.
// Either a single chunk will be returned, or an iterable.
// A single chunk should be returned if chunks.Meta maps to a chunk that
// already exists and doesn't need modifications.
// An iterable should be returned if chunks.Meta maps to a subset of the
// samples in a stored chunk, or multiple chunks. (E.g. OOOHeadChunkReader
// could return an iterable where multiple histogram samples have counter
// resets. There can only be one counter reset per histogram chunk so
// multiple chunks would be created from the iterable in this case.)
// Only one of chunk or iterable should be returned. In some cases you may
// always expect a chunk to be returned. You can check that iterable is nil
// in those cases.
ChunkOrIterable(meta chunks.Meta) (chunkenc.Chunk, chunkenc.Iterable, error)
// Close releases all underlying resources of the reader.
Close() error
@ -253,7 +265,7 @@ func readMetaFile(dir string) (*BlockMeta, int64, error) {
return nil, 0, err
}
if m.Version != metaVersion1 {
return nil, 0, errors.Errorf("unexpected meta file version %d", m.Version)
return nil, 0, fmt.Errorf("unexpected meta file version %d", m.Version)
}
return &m, int64(len(b)), nil

View file

@ -504,6 +504,19 @@ func createBlockFromHead(tb testing.TB, dir string, head *Head) string {
return filepath.Join(dir, ulid.String())
}
func createBlockFromOOOHead(tb testing.TB, dir string, head *OOOCompactionHead) string {
compactor, err := NewLeveledCompactor(context.Background(), nil, log.NewNopLogger(), []int64{1000000}, nil, nil, true)
require.NoError(tb, err)
require.NoError(tb, os.MkdirAll(dir, 0o777))
// Add +1 millisecond to block maxt because block intervals are half-open: [b.MinTime, b.MaxTime).
// Because of this, a block's interval always extends 1ms past the last sample it includes.
ulid, err := compactor.Write(dir, head, head.MinTime(), head.MaxTime()+1, nil)
require.NoError(tb, err)
return filepath.Join(dir, ulid.String())
}
func createHead(tb testing.TB, w *wlog.WL, series []storage.Series, chunkDir string) *Head {
opts := DefaultHeadOptions()
opts.ChunkDirRoot = chunkDir

View file

@ -14,11 +14,10 @@
package chunkenc
import (
"fmt"
"math"
"sync"
"github.com/pkg/errors"
"github.com/prometheus/prometheus/model/histogram"
)
@ -68,6 +67,8 @@ const (
// Chunk holds a sequence of sample pairs that can be iterated over and appended to.
type Chunk interface {
Iterable
// Bytes returns the underlying byte slice of the chunk.
Bytes() []byte
@ -77,11 +78,6 @@ type Chunk interface {
// Appender returns an appender to append samples to the chunk.
Appender() (Appender, error)
// The iterator passed as argument is for re-use.
// Depending on implementation, the iterator can
// be re-used or a new iterator can be allocated.
Iterator(Iterator) Iterator
// NumSamples returns the number of samples in the chunk.
NumSamples() int
@ -93,6 +89,13 @@ type Chunk interface {
Compact()
}
type Iterable interface {
// The iterator passed as argument is for re-use.
// Depending on implementation, the iterator can
// be re-used or a new iterator can be allocated.
Iterator(Iterator) Iterator
}
// Appender adds sample pairs to a chunk.
type Appender interface {
Append(int64, float64)
@ -185,6 +188,19 @@ func (v ValueType) ChunkEncoding() Encoding {
}
}
func (v ValueType) NewChunk() (Chunk, error) {
switch v {
case ValFloat:
return NewXORChunk(), nil
case ValHistogram:
return NewHistogramChunk(), nil
case ValFloatHistogram:
return NewFloatHistogramChunk(), nil
default:
return nil, fmt.Errorf("value type %v unsupported", v)
}
}
// MockSeriesIterator returns an iterator for a mock series with custom timeStamps and values.
func MockSeriesIterator(timestamps []int64, values []float64) Iterator {
return &mockSeriesIterator{
@ -293,7 +309,7 @@ func (p *pool) Get(e Encoding, b []byte) (Chunk, error) {
c.b.count = 0
return c, nil
}
return nil, errors.Errorf("invalid chunk encoding %q", e)
return nil, fmt.Errorf("invalid chunk encoding %q", e)
}
func (p *pool) Put(c Chunk) error {
@ -332,7 +348,7 @@ func (p *pool) Put(c Chunk) error {
sh.b.count = 0
p.floatHistogram.Put(c)
default:
return errors.Errorf("invalid chunk encoding %q", c.Encoding())
return fmt.Errorf("invalid chunk encoding %q", c.Encoding())
}
return nil
}
@ -349,7 +365,7 @@ func FromData(e Encoding, d []byte) (Chunk, error) {
case EncFloatHistogram:
return &FloatHistogramChunk{b: bstream{count: 0, stream: d}}, nil
}
return nil, errors.Errorf("invalid chunk encoding %q", e)
return nil, fmt.Errorf("invalid chunk encoding %q", e)
}
// NewEmptyChunk returns an empty chunk for the given encoding.
@ -362,5 +378,5 @@ func NewEmptyChunk(e Encoding) (Chunk, error) {
case EncFloatHistogram:
return NewFloatHistogramChunk(), nil
}
return nil, errors.Errorf("invalid chunk encoding %q", e)
return nil, fmt.Errorf("invalid chunk encoding %q", e)
}
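Splitting Iterable out of Chunk lets readers hand back either a whole chunk or just something that can be iterated, and the new ValueType.NewChunk helper removes one more switch from call sites that need to materialise a chunk for a known sample type. A short usage sketch (error handling trimmed to panics for brevity):

```go
package main

import (
	"fmt"

	"github.com/prometheus/prometheus/tsdb/chunkenc"
)

func main() {
	// Pick the chunk implementation from the value type instead of switching
	// on it at the call site.
	chk, err := chunkenc.ValFloat.NewChunk()
	if err != nil {
		panic(err)
	}
	app, err := chk.Appender()
	if err != nil {
		panic(err)
	}
	app.Append(1000, 42.0) // timestamp (ms), value
	fmt.Println(chk.Encoding(), chk.NumSamples())
}
```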

View file

@ -14,9 +14,8 @@
package chunkenc
import (
"fmt"
"math/bits"
"github.com/pkg/errors"
)
// putVarbitInt writes an int64 using varbit encoding with a bit bucketing
@ -109,7 +108,7 @@ func readVarbitInt(b *bstreamReader) (int64, error) {
val = int64(bits)
default:
return 0, errors.Errorf("invalid bit pattern %b", d)
return 0, fmt.Errorf("invalid bit pattern %b", d)
}
if sz != 0 {
@ -215,7 +214,7 @@ func readVarbitUint(b *bstreamReader) (uint64, error) {
return 0, err
}
default:
return 0, errors.Errorf("invalid bit pattern %b", d)
return 0, fmt.Errorf("invalid bit pattern %b", d)
}
if sz != 0 {

View file

@ -24,8 +24,6 @@ import (
"path/filepath"
"strconv"
"github.com/pkg/errors"
"github.com/prometheus/prometheus/tsdb/chunkenc"
tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
"github.com/prometheus/prometheus/tsdb/fileutil"
@ -119,11 +117,16 @@ func (b BlockChunkRef) Unpack() (int, int) {
return sgmIndex, chkStart
}
// Meta holds information about a chunk of data.
// Meta holds information about one or more chunks.
// For examples of when chunks.Meta could refer to multiple chunks, see
// ChunkReader.ChunkOrIterable().
type Meta struct {
// Ref and Chunk hold either a reference that can be used to retrieve
// chunk data or the data itself.
// If Chunk is nil, call ChunkReader.Chunk(Meta.Ref) to get the chunk and assign it to the Chunk field
// If Chunk is nil, call ChunkReader.ChunkOrIterable(Meta.Ref) to get the
// chunk and assign it to the Chunk field. If an iterable is returned from
// that method, then it may not be possible to set Chunk as the iterable
// might form several chunks.
Ref ChunkRef
Chunk chunkenc.Chunk
@ -285,7 +288,7 @@ func checkCRC32(data, sum []byte) error {
// This combination of shifts is the inverse of digest.Sum() in go/src/hash/crc32.
want := uint32(sum[0])<<24 + uint32(sum[1])<<16 + uint32(sum[2])<<8 + uint32(sum[3])
if got != want {
return errors.Errorf("checksum mismatch expected:%x, actual:%x", want, got)
return fmt.Errorf("checksum mismatch expected:%x, actual:%x", want, got)
}
return nil
}
@ -398,12 +401,12 @@ func (w *Writer) cut() error {
func cutSegmentFile(dirFile *os.File, magicNumber uint32, chunksFormat byte, allocSize int64) (headerSize int, newFile *os.File, seq int, returnErr error) {
p, seq, err := nextSequenceFile(dirFile.Name())
if err != nil {
return 0, nil, 0, errors.Wrap(err, "next sequence file")
return 0, nil, 0, fmt.Errorf("next sequence file: %w", err)
}
ptmp := p + ".tmp"
f, err := os.OpenFile(ptmp, os.O_WRONLY|os.O_CREATE, 0o666)
if err != nil {
return 0, nil, 0, errors.Wrap(err, "open temp file")
return 0, nil, 0, fmt.Errorf("open temp file: %w", err)
}
defer func() {
if returnErr != nil {
@ -418,11 +421,11 @@ func cutSegmentFile(dirFile *os.File, magicNumber uint32, chunksFormat byte, all
}()
if allocSize > 0 {
if err = fileutil.Preallocate(f, allocSize, true); err != nil {
return 0, nil, 0, errors.Wrap(err, "preallocate")
return 0, nil, 0, fmt.Errorf("preallocate: %w", err)
}
}
if err = dirFile.Sync(); err != nil {
return 0, nil, 0, errors.Wrap(err, "sync directory")
return 0, nil, 0, fmt.Errorf("sync directory: %w", err)
}
// Write header metadata for new file.
@ -432,24 +435,24 @@ func cutSegmentFile(dirFile *os.File, magicNumber uint32, chunksFormat byte, all
n, err := f.Write(metab)
if err != nil {
return 0, nil, 0, errors.Wrap(err, "write header")
return 0, nil, 0, fmt.Errorf("write header: %w", err)
}
if err := f.Close(); err != nil {
return 0, nil, 0, errors.Wrap(err, "close temp file")
return 0, nil, 0, fmt.Errorf("close temp file: %w", err)
}
f = nil
if err := fileutil.Rename(ptmp, p); err != nil {
return 0, nil, 0, errors.Wrap(err, "replace file")
return 0, nil, 0, fmt.Errorf("replace file: %w", err)
}
f, err = os.OpenFile(p, os.O_WRONLY, 0o666)
if err != nil {
return 0, nil, 0, errors.Wrap(err, "open final file")
return 0, nil, 0, fmt.Errorf("open final file: %w", err)
}
// Skip header for further writes.
if _, err := f.Seek(int64(n), 0); err != nil {
return 0, nil, 0, errors.Wrap(err, "seek in final file")
return 0, nil, 0, fmt.Errorf("seek in final file: %w", err)
}
return n, f, seq, nil
}
@ -606,16 +609,16 @@ func newReader(bs []ByteSlice, cs []io.Closer, pool chunkenc.Pool) (*Reader, err
cr := Reader{pool: pool, bs: bs, cs: cs}
for i, b := range cr.bs {
if b.Len() < SegmentHeaderSize {
return nil, errors.Wrapf(errInvalidSize, "invalid segment header in segment %d", i)
return nil, fmt.Errorf("invalid segment header in segment %d: %w", i, errInvalidSize)
}
// Verify magic number.
if m := binary.BigEndian.Uint32(b.Range(0, MagicChunksSize)); m != MagicChunks {
return nil, errors.Errorf("invalid magic number %x", m)
return nil, fmt.Errorf("invalid magic number %x", m)
}
// Verify chunk format version.
if v := int(b.Range(MagicChunksSize, MagicChunksSize+ChunksFormatVersionSize)[0]); v != chunksFormatV1 {
return nil, errors.Errorf("invalid chunk format version %d", v)
return nil, fmt.Errorf("invalid chunk format version %d", v)
}
cr.size += int64(b.Len())
}
@ -641,7 +644,7 @@ func NewDirReader(dir string, pool chunkenc.Pool) (*Reader, error) {
f, err := fileutil.OpenMmapFile(fn)
if err != nil {
return nil, tsdb_errors.NewMulti(
errors.Wrap(err, "mmap files"),
fmt.Errorf("mmap files: %w", err),
tsdb_errors.CloseAll(cs),
).Err()
}
@ -669,24 +672,24 @@ func (s *Reader) Size() int64 {
}
// Chunk returns a chunk from a given reference.
func (s *Reader) Chunk(meta Meta) (chunkenc.Chunk, error) {
func (s *Reader) ChunkOrIterable(meta Meta) (chunkenc.Chunk, chunkenc.Iterable, error) {
sgmIndex, chkStart := BlockChunkRef(meta.Ref).Unpack()
if sgmIndex >= len(s.bs) {
return nil, errors.Errorf("segment index %d out of range", sgmIndex)
return nil, nil, fmt.Errorf("segment index %d out of range", sgmIndex)
}
sgmBytes := s.bs[sgmIndex]
if chkStart+MaxChunkLengthFieldSize > sgmBytes.Len() {
return nil, errors.Errorf("segment doesn't include enough bytes to read the chunk size data field - required:%v, available:%v", chkStart+MaxChunkLengthFieldSize, sgmBytes.Len())
return nil, nil, fmt.Errorf("segment doesn't include enough bytes to read the chunk size data field - required:%v, available:%v", chkStart+MaxChunkLengthFieldSize, sgmBytes.Len())
}
// With the minimum chunk length this should never cause us reading
// over the end of the slice.
c := sgmBytes.Range(chkStart, chkStart+MaxChunkLengthFieldSize)
chkDataLen, n := binary.Uvarint(c)
if n <= 0 {
return nil, errors.Errorf("reading chunk length failed with %d", n)
return nil, nil, fmt.Errorf("reading chunk length failed with %d", n)
}
chkEncStart := chkStart + n
@ -695,17 +698,18 @@ func (s *Reader) Chunk(meta Meta) (chunkenc.Chunk, error) {
chkDataEnd := chkEnd - crc32.Size
if chkEnd > sgmBytes.Len() {
return nil, errors.Errorf("segment doesn't include enough bytes to read the chunk - required:%v, available:%v", chkEnd, sgmBytes.Len())
return nil, nil, fmt.Errorf("segment doesn't include enough bytes to read the chunk - required:%v, available:%v", chkEnd, sgmBytes.Len())
}
sum := sgmBytes.Range(chkDataEnd, chkEnd)
if err := checkCRC32(sgmBytes.Range(chkEncStart, chkDataEnd), sum); err != nil {
return nil, err
return nil, nil, err
}
chkData := sgmBytes.Range(chkDataStart, chkDataEnd)
chkEnc := sgmBytes.Range(chkEncStart, chkEncStart+ChunkEncodingSize)[0]
return s.pool.Get(chunkenc.Encoding(chkEnc), chkData)
chk, err := s.pool.Get(chunkenc.Encoding(chkEnc), chkData)
return chk, nil, err
}
func nextSequenceFile(dir string) (string, int, error) {

View file

@ -23,6 +23,6 @@ func TestReaderWithInvalidBuffer(t *testing.T) {
b := realByteSlice([]byte{0x81, 0x81, 0x81, 0x81, 0x81, 0x81})
r := &Reader{bs: []ByteSlice{b}}
_, err := r.Chunk(Meta{Ref: 0})
_, _, err := r.ChunkOrIterable(Meta{Ref: 0})
require.Error(t, err)
}
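The test change above shows the mechanical part of the migration; callers of the new ChunkOrIterable generally branch on which of the two results is non-nil. A hedged sketch of such a caller (readSamples and its error handling are illustrative glue, not code from the diff):

```go
package example

import (
	"github.com/prometheus/prometheus/tsdb"
	"github.com/prometheus/prometheus/tsdb/chunkenc"
	"github.com/prometheus/prometheus/tsdb/chunks"
)

// readSamples takes the fast path when the reader hands back a ready-made
// chunk and otherwise walks the iterable sample by sample.
func readSamples(cr tsdb.ChunkReader, meta chunks.Meta) (int, error) {
	chk, iterable, err := cr.ChunkOrIterable(meta)
	if err != nil {
		return 0, err
	}
	if chk != nil {
		// A stored chunk that needs no modification.
		return chk.NumSamples(), nil
	}
	// Multiple chunks / a subset of a chunk: iterate the samples instead.
	n := 0
	it := iterable.Iterator(nil)
	for vt := it.Next(); vt != chunkenc.ValNone; vt = it.Next() {
		n++ // float, histogram or float histogram sample
	}
	return n, it.Err()
}
```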

View file

@ -17,6 +17,8 @@ import (
"bufio"
"bytes"
"encoding/binary"
"errors"
"fmt"
"hash"
"io"
"os"
@ -25,7 +27,6 @@ import (
"sync"
"github.com/dennwc/varint"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"go.uber.org/atomic"
"golang.org/x/exp/slices"
@ -107,7 +108,7 @@ type CorruptionErr struct {
}
func (e *CorruptionErr) Error() string {
return errors.Wrapf(e.Err, "corruption in head chunk file %s", segmentFile(e.Dir, e.FileIndex)).Error()
return fmt.Errorf("corruption in head chunk file %s: %w", segmentFile(e.Dir, e.FileIndex), e.Err).Error()
}
// chunkPos keeps track of the position in the head chunk files.
@ -240,10 +241,10 @@ type mmappedChunkFile struct {
func NewChunkDiskMapper(reg prometheus.Registerer, dir string, pool chunkenc.Pool, writeBufferSize, writeQueueSize int) (*ChunkDiskMapper, error) {
// Validate write buffer size.
if writeBufferSize < MinWriteBufferSize || writeBufferSize > MaxWriteBufferSize {
return nil, errors.Errorf("ChunkDiskMapper write buffer size should be between %d and %d (actual: %d)", MinWriteBufferSize, MaxWriteBufferSize, writeBufferSize)
return nil, fmt.Errorf("ChunkDiskMapper write buffer size should be between %d and %d (actual: %d)", MinWriteBufferSize, MaxWriteBufferSize, writeBufferSize)
}
if writeBufferSize%1024 != 0 {
return nil, errors.Errorf("ChunkDiskMapper write buffer size should be a multiple of 1024 (actual: %d)", writeBufferSize)
return nil, fmt.Errorf("ChunkDiskMapper write buffer size should be a multiple of 1024 (actual: %d)", writeBufferSize)
}
if err := os.MkdirAll(dir, 0o777); err != nil {
@ -320,7 +321,7 @@ func (cdm *ChunkDiskMapper) openMMapFiles() (returnErr error) {
for seq, fn := range files {
f, err := fileutil.OpenMmapFile(fn)
if err != nil {
return errors.Wrapf(err, "mmap files, file: %s", fn)
return fmt.Errorf("mmap files, file: %s: %w", fn, err)
}
cdm.closers[seq] = f
cdm.mmappedChunkFiles[seq] = &mmappedChunkFile{byteSlice: realByteSlice(f.Bytes())}
@ -335,23 +336,23 @@ func (cdm *ChunkDiskMapper) openMMapFiles() (returnErr error) {
lastSeq := chkFileIndices[0]
for _, seq := range chkFileIndices[1:] {
if seq != lastSeq+1 {
return errors.Errorf("found unsequential head chunk files %s (index: %d) and %s (index: %d)", files[lastSeq], lastSeq, files[seq], seq)
return fmt.Errorf("found unsequential head chunk files %s (index: %d) and %s (index: %d)", files[lastSeq], lastSeq, files[seq], seq)
}
lastSeq = seq
}
for i, b := range cdm.mmappedChunkFiles {
if b.byteSlice.Len() < HeadChunkFileHeaderSize {
return errors.Wrapf(errInvalidSize, "%s: invalid head chunk file header", files[i])
return fmt.Errorf("%s: invalid head chunk file header: %w", files[i], errInvalidSize)
}
// Verify magic number.
if m := binary.BigEndian.Uint32(b.byteSlice.Range(0, MagicChunksSize)); m != MagicHeadChunks {
return errors.Errorf("%s: invalid magic number %x", files[i], m)
return fmt.Errorf("%s: invalid magic number %x", files[i], m)
}
// Verify chunk format version.
if v := int(b.byteSlice.Range(MagicChunksSize, MagicChunksSize+ChunksFormatVersionSize)[0]); v != chunksFormatV1 {
return errors.Errorf("%s: invalid chunk format version %d", files[i], v)
return fmt.Errorf("%s: invalid chunk format version %d", files[i], v)
}
}
@ -394,16 +395,16 @@ func repairLastChunkFile(files map[int]string) (_ map[int]string, returnErr erro
f, err := os.Open(files[lastFile])
if err != nil {
return files, errors.Wrap(err, "open file during last head chunk file repair")
return files, fmt.Errorf("open file during last head chunk file repair: %w", err)
}
buf := make([]byte, MagicChunksSize)
size, err := f.Read(buf)
if err != nil && err != io.EOF {
return files, errors.Wrap(err, "failed to read magic number during last head chunk file repair")
return files, fmt.Errorf("failed to read magic number during last head chunk file repair: %w", err)
}
if err := f.Close(); err != nil {
return files, errors.Wrap(err, "close file during last head chunk file repair")
return files, fmt.Errorf("close file during last head chunk file repair: %w", err)
}
// We either don't have enough bytes for the magic number or the magic number is 0.
@ -413,7 +414,7 @@ func repairLastChunkFile(files map[int]string) (_ map[int]string, returnErr erro
if size < MagicChunksSize || binary.BigEndian.Uint32(buf) == 0 {
// Corrupt file, hence remove it.
if err := os.RemoveAll(files[lastFile]); err != nil {
return files, errors.Wrap(err, "delete corrupted, empty head chunk file during last file repair")
return files, fmt.Errorf("delete corrupted, empty head chunk file during last file repair: %w", err)
}
delete(files, lastFile)
}
@ -560,7 +561,7 @@ func (cdm *ChunkDiskMapper) cutAndExpectRef(chkRef ChunkDiskMapperRef) (err erro
}
if expSeq, expOffset := chkRef.Unpack(); seq != expSeq || offset != expOffset {
return errors.Errorf("expected newly cut file to have sequence:offset %d:%d, got %d:%d", expSeq, expOffset, seq, offset)
return fmt.Errorf("expected newly cut file to have sequence:offset %d:%d, got %d:%d", expSeq, expOffset, seq, offset)
}
return nil
@ -702,13 +703,13 @@ func (cdm *ChunkDiskMapper) Chunk(ref ChunkDiskMapperRef) (chunkenc.Chunk, error
return nil, &CorruptionErr{
Dir: cdm.dir.Name(),
FileIndex: -1,
Err: errors.Errorf("head chunk file index %d more than current open file", sgmIndex),
Err: fmt.Errorf("head chunk file index %d more than current open file", sgmIndex),
}
}
return nil, &CorruptionErr{
Dir: cdm.dir.Name(),
FileIndex: sgmIndex,
Err: errors.Errorf("head chunk file index %d does not exist on disk", sgmIndex),
Err: fmt.Errorf("head chunk file index %d does not exist on disk", sgmIndex),
}
}
@ -716,7 +717,7 @@ func (cdm *ChunkDiskMapper) Chunk(ref ChunkDiskMapperRef) (chunkenc.Chunk, error
return nil, &CorruptionErr{
Dir: cdm.dir.Name(),
FileIndex: sgmIndex,
Err: errors.Errorf("head chunk file doesn't include enough bytes to read the chunk size data field - required:%v, available:%v", chkStart+MaxChunkLengthFieldSize, mmapFile.byteSlice.Len()),
Err: fmt.Errorf("head chunk file doesn't include enough bytes to read the chunk size data field - required:%v, available:%v", chkStart+MaxChunkLengthFieldSize, mmapFile.byteSlice.Len()),
}
}
@ -735,7 +736,7 @@ func (cdm *ChunkDiskMapper) Chunk(ref ChunkDiskMapperRef) (chunkenc.Chunk, error
return nil, &CorruptionErr{
Dir: cdm.dir.Name(),
FileIndex: sgmIndex,
Err: errors.Errorf("reading chunk length failed with %d", n),
Err: fmt.Errorf("reading chunk length failed with %d", n),
}
}
@ -745,7 +746,7 @@ func (cdm *ChunkDiskMapper) Chunk(ref ChunkDiskMapperRef) (chunkenc.Chunk, error
return nil, &CorruptionErr{
Dir: cdm.dir.Name(),
FileIndex: sgmIndex,
Err: errors.Errorf("head chunk file doesn't include enough bytes to read the chunk - required:%v, available:%v", chkDataEnd, mmapFile.byteSlice.Len()),
Err: fmt.Errorf("head chunk file doesn't include enough bytes to read the chunk - required:%v, available:%v", chkDataEnd, mmapFile.byteSlice.Len()),
}
}
@ -762,7 +763,7 @@ func (cdm *ChunkDiskMapper) Chunk(ref ChunkDiskMapperRef) (chunkenc.Chunk, error
return nil, &CorruptionErr{
Dir: cdm.dir.Name(),
FileIndex: sgmIndex,
Err: errors.Errorf("checksum mismatch expected:%x, actual:%x", sum, act),
Err: fmt.Errorf("checksum mismatch expected:%x, actual:%x", sum, act),
}
}
@ -830,7 +831,7 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
return &CorruptionErr{
Dir: cdm.dir.Name(),
FileIndex: segID,
Err: errors.Errorf("head chunk file has some unread data, but doesn't include enough bytes to read the chunk header"+
Err: fmt.Errorf("head chunk file has some unread data, but doesn't include enough bytes to read the chunk header"+
" - required:%v, available:%v, file:%d", idx+MaxHeadChunkMetaSize, fileEnd, segID),
}
}
@ -867,7 +868,7 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
return &CorruptionErr{
Dir: cdm.dir.Name(),
FileIndex: segID,
Err: errors.Errorf("head chunk file doesn't include enough bytes to read the chunk header - required:%v, available:%v, file:%d", idx+CRCSize, fileEnd, segID),
Err: fmt.Errorf("head chunk file doesn't include enough bytes to read the chunk header - required:%v, available:%v, file:%d", idx+CRCSize, fileEnd, segID),
}
}
@ -880,7 +881,7 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
return &CorruptionErr{
Dir: cdm.dir.Name(),
FileIndex: segID,
Err: errors.Errorf("checksum mismatch expected:%x, actual:%x", sum, act),
Err: fmt.Errorf("checksum mismatch expected:%x, actual:%x", sum, act),
}
}
idx += CRCSize
@ -906,7 +907,7 @@ func (cdm *ChunkDiskMapper) IterateAllChunks(f func(seriesRef HeadSeriesRef, chu
return &CorruptionErr{
Dir: cdm.dir.Name(),
FileIndex: segID,
Err: errors.Errorf("head chunk file doesn't include enough bytes to read the last chunk data - required:%v, available:%v, file:%d", idx, fileEnd, segID),
Err: fmt.Errorf("head chunk file doesn't include enough bytes to read the last chunk data - required:%v, available:%v, file:%d", idx, fileEnd, segID),
}
}
}
@ -999,10 +1000,9 @@ func (cdm *ChunkDiskMapper) deleteFiles(removedFiles []int) ([]int, error) {
// DeleteCorrupted deletes all the head chunk files after the one which had the corruption
// (including the corrupt file).
func (cdm *ChunkDiskMapper) DeleteCorrupted(originalErr error) error {
err := errors.Cause(originalErr) // So that we can pick up errors even if wrapped.
cerr, ok := err.(*CorruptionErr)
if !ok {
return errors.Wrap(originalErr, "cannot handle error")
var cerr *CorruptionErr
if !errors.As(originalErr, &cerr) {
return fmt.Errorf("cannot handle error: %w", originalErr)
}
// Delete all the head chunk files following the corrupt head chunk file.

View file

@ -162,7 +162,7 @@ func NewLeveledCompactor(ctx context.Context, r prometheus.Registerer, l log.Log
func NewLeveledCompactorWithChunkSize(ctx context.Context, r prometheus.Registerer, l log.Logger, ranges []int64, pool chunkenc.Pool, maxBlockChunkSegmentSize int64, mergeFunc storage.VerticalChunkSeriesMergeFunc, enableOverlappingCompaction bool) (*LeveledCompactor, error) {
if len(ranges) == 0 {
return nil, errors.Errorf("at least one range must be provided")
return nil, fmt.Errorf("at least one range must be provided")
}
if pool == nil {
pool = chunkenc.NewPool()

View file

@ -1348,6 +1348,46 @@ func BenchmarkCompactionFromHead(b *testing.B) {
}
}
func BenchmarkCompactionFromOOOHead(b *testing.B) {
dir := b.TempDir()
totalSeries := 100000
totalSamples := 100
for labelNames := 1; labelNames < totalSeries; labelNames *= 10 {
labelValues := totalSeries / labelNames
b.Run(fmt.Sprintf("labelnames=%d,labelvalues=%d", labelNames, labelValues), func(b *testing.B) {
chunkDir := b.TempDir()
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = chunkDir
opts.OutOfOrderTimeWindow.Store(int64(totalSamples))
h, err := NewHead(nil, nil, nil, nil, opts, nil)
require.NoError(b, err)
for ln := 0; ln < labelNames; ln++ {
app := h.Appender(context.Background())
for lv := 0; lv < labelValues; lv++ {
lbls := labels.FromStrings(fmt.Sprintf("%d", ln), fmt.Sprintf("%d%s%d", lv, postingsBenchSuffix, ln))
_, err = app.Append(0, lbls, int64(totalSamples), 0)
require.NoError(b, err)
for ts := 0; ts < totalSamples; ts++ {
_, err = app.Append(0, lbls, int64(ts), float64(ts))
require.NoError(b, err)
}
}
require.NoError(b, app.Commit())
}
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
oooHead, err := NewOOOCompactionHead(context.TODO(), h)
require.NoError(b, err)
createBlockFromOOOHead(b, filepath.Join(dir, fmt.Sprintf("%d-%d", i, labelNames)), oooHead)
}
h.Close()
})
}
}
// TestDisableAutoCompactions checks that we can
// disable and enable the auto compaction.
// This is needed for unit tests that rely on

View file

@ -257,10 +257,14 @@ type DB struct {
compactor Compactor
blocksToDelete BlocksToDeleteFunc
// Mutex that must be held when modifying the general block layout.
// Mutex that must be held when modifying the general block layout or lastGarbageCollectedMmapRef.
mtx sync.RWMutex
blocks []*Block
// The last OOO chunk that was compacted and written to disk. New queriers must not read chunks less
// than or equal to this reference, as these chunks could be garbage collected at any time.
lastGarbageCollectedMmapRef chunks.ChunkDiskMapperRef
head *Head
compactc chan struct{}
@ -717,7 +721,7 @@ func (db *DBReadOnly) Block(blockID string) (BlockReader, error) {
_, err := os.Stat(filepath.Join(db.dir, blockID))
if os.IsNotExist(err) {
return nil, errors.Errorf("invalid block ID %s", blockID)
return nil, fmt.Errorf("invalid block ID %s", blockID)
}
block, err := OpenBlock(db.logger, filepath.Join(db.dir, blockID), nil)
@ -1307,6 +1311,20 @@ func (db *DB) compactOOOHead(ctx context.Context) error {
lastWBLFile, minOOOMmapRef := oooHead.LastWBLFile(), oooHead.LastMmapRef()
if lastWBLFile != 0 || minOOOMmapRef != 0 {
if minOOOMmapRef != 0 {
// Ensure that no more queriers are created that will reference chunks we're about to garbage collect.
// truncateOOO waits for any existing queriers that reference chunks we're about to garbage collect to
// complete before running garbage collection, so we don't need to do that here.
//
// We take mtx to ensure that Querier() and ChunkQuerier() don't miss blocks: without this, they could
// capture the list of blocks before the call to reloadBlocks() above runs, but then capture
// lastGarbageCollectedMmapRef after we update it here, and therefore not query either the blocks we've just
// written or the head chunks those blocks were created from.
db.mtx.Lock()
db.lastGarbageCollectedMmapRef = minOOOMmapRef
db.mtx.Unlock()
}
if err := db.head.truncateOOO(lastWBLFile, minOOOMmapRef); err != nil {
return errors.Wrap(err, "truncate ooo wbl")
}
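The locking comment above boils down to an ordering invariant: the freshly compacted block must become visible before the garbage-collection watermark is raised, and queriers must read both under the same lock so they cannot observe the raised watermark without also observing the block that replaces the dropped head chunks. A minimal sketch of that invariant with illustrative names (the real code takes the lock in two steps; this collapses them for clarity):

```go
package main

import (
	"fmt"
	"sync"
)

// store sketches the invariant: blocks and the GC watermark share one mutex,
// and the compactor publishes the new block before raising the watermark.
type store struct {
	mtx         sync.RWMutex
	blocks      []string // stand-in for persisted blocks
	gcWatermark uint64   // stand-in for lastGarbageCollectedMmapRef
}

func (s *store) publishCompaction(block string, watermark uint64) {
	s.mtx.Lock()
	defer s.mtx.Unlock()
	s.blocks = append(s.blocks, block) // new block visible first ...
	s.gcWatermark = watermark          // ... then chunks at or below this ref may go
}

func (s *store) snapshot() ([]string, uint64) {
	s.mtx.RLock()
	defer s.mtx.RUnlock()
	return append([]string(nil), s.blocks...), s.gcWatermark
}

func main() {
	s := &store{}
	s.publishCompaction("block-stand-in", 42)
	blocks, wm := s.snapshot()
	fmt.Println(blocks, wm)
}
```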
@ -1903,10 +1921,10 @@ func (db *DB) ForceHeadMMap() {
// will create a new block containing all data that's currently in the memory buffer/WAL.
func (db *DB) Snapshot(dir string, withHead bool) error {
if dir == db.dir {
return errors.Errorf("cannot snapshot into base directory")
return fmt.Errorf("cannot snapshot into base directory")
}
if _, err := ulid.ParseStrict(dir); err == nil {
return errors.Errorf("dir must not be a valid ULID")
return fmt.Errorf("dir must not be a valid ULID")
}
db.cmtx.Lock()
@ -1938,7 +1956,7 @@ func (db *DB) Snapshot(dir string, withHead bool) error {
}
// Querier returns a new querier over the data partition for the given time range.
func (db *DB) Querier(mint, maxt int64) (storage.Querier, error) {
func (db *DB) Querier(mint, maxt int64) (_ storage.Querier, err error) {
var blocks []BlockReader
db.mtx.RLock()
@ -1949,11 +1967,23 @@ func (db *DB) Querier(mint, maxt int64) (storage.Querier, error) {
blocks = append(blocks, b)
}
}
var inOrderHeadQuerier storage.Querier
blockQueriers := make([]storage.Querier, 0, len(blocks)+2) // +2 to allow for possible in-order and OOO head queriers
defer func() {
if err != nil {
// If we fail, all previously opened queriers must be closed.
for _, q := range blockQueriers {
// TODO(bwplotka): Handle error.
_ = q.Close()
}
}
}()
if maxt >= db.head.MinTime() {
rh := NewRangeHead(db.head, mint, maxt)
var err error
inOrderHeadQuerier, err = NewBlockQuerier(rh, mint, maxt)
inOrderHeadQuerier, err := NewBlockQuerier(rh, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open block querier for head %s", rh)
}
@ -1975,44 +2005,40 @@ func (db *DB) Querier(mint, maxt int64) (storage.Querier, error) {
return nil, errors.Wrapf(err, "open block querier for head while getting new querier %s", rh)
}
}
if inOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, inOrderHeadQuerier)
}
}
var outOfOrderHeadQuerier storage.Querier
if overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime()) {
rh := NewOOORangeHead(db.head, mint, maxt)
rh := NewOOORangeHead(db.head, mint, maxt, db.lastGarbageCollectedMmapRef)
var err error
outOfOrderHeadQuerier, err = NewBlockQuerier(rh, mint, maxt)
outOfOrderHeadQuerier, err := NewBlockQuerier(rh, mint, maxt)
if err != nil {
// If NewBlockQuerier() failed, make sure to clean up the pending read created by NewOOORangeHead.
rh.isoState.Close()
return nil, errors.Wrapf(err, "open block querier for ooo head %s", rh)
}
}
blockQueriers := make([]storage.Querier, 0, len(blocks))
for _, b := range blocks {
q, err := NewBlockQuerier(b, mint, maxt)
if err == nil {
blockQueriers = append(blockQueriers, q)
continue
}
// If we fail, all previously opened queriers must be closed.
for _, q := range blockQueriers {
// TODO(bwplotka): Handle error.
_ = q.Close()
}
return nil, errors.Wrapf(err, "open querier for block %s", b)
}
if inOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, inOrderHeadQuerier)
}
if outOfOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, outOfOrderHeadQuerier)
}
for _, b := range blocks {
q, err := NewBlockQuerier(b, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open querier for block %s", b)
}
blockQueriers = append(blockQueriers, q)
}
return storage.NewMergeQuerier(blockQueriers, nil, storage.ChainedSeriesMerge), nil
}
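Both Querier and blockChunkQuerierForRange now rely on a named error return plus one deferred cleanup that closes whatever was opened before the failure, instead of repeating the close loop at every error site. A generic sketch of that idiom (openAll and the resource functions are illustrative stand-ins):

```go
package main

import (
	"errors"
	"fmt"
	"io"
	"strings"
)

// openAll opens resources in order; if any step fails, the deferred cleanup
// keyed on the named error return closes everything opened so far.
func openAll(opens []func() (io.ReadCloser, error)) (_ []io.ReadCloser, err error) {
	opened := make([]io.ReadCloser, 0, len(opens))
	defer func() {
		if err != nil {
			for _, c := range opened {
				_ = c.Close() // best-effort cleanup on failure
			}
		}
	}()
	for i, open := range opens {
		c, openErr := open()
		if openErr != nil {
			return nil, fmt.Errorf("open resource %d: %w", i, openErr)
		}
		opened = append(opened, c)
	}
	return opened, nil
}

func main() {
	ok := func() (io.ReadCloser, error) { return io.NopCloser(strings.NewReader("ok")), nil }
	bad := func() (io.ReadCloser, error) { return nil, errors.New("boom") }

	_, err := openAll([]func() (io.ReadCloser, error){ok, ok, bad})
	fmt.Println(err) // the two successfully opened readers were closed by the defer
}
```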
// blockChunkQuerierForRange returns individual block chunk queriers from the persistent blocks, in-order head block, and the
// out-of-order head block, overlapping with the given time range.
func (db *DB) blockChunkQuerierForRange(mint, maxt int64) ([]storage.ChunkQuerier, error) {
func (db *DB) blockChunkQuerierForRange(mint, maxt int64) (_ []storage.ChunkQuerier, err error) {
var blocks []BlockReader
db.mtx.RLock()
@ -2023,11 +2049,22 @@ func (db *DB) blockChunkQuerierForRange(mint, maxt int64) ([]storage.ChunkQuerie
blocks = append(blocks, b)
}
}
var inOrderHeadQuerier storage.ChunkQuerier
blockQueriers := make([]storage.ChunkQuerier, 0, len(blocks)+2) // +2 to allow for possible in-order and OOO head queriers
defer func() {
if err != nil {
// If we fail, all previously opened queriers must be closed.
for _, q := range blockQueriers {
// TODO(bwplotka): Handle error.
_ = q.Close()
}
}
}()
if maxt >= db.head.MinTime() {
rh := NewRangeHead(db.head, mint, maxt)
var err error
inOrderHeadQuerier, err = NewBlockChunkQuerier(rh, mint, maxt)
inOrderHeadQuerier, err := NewBlockChunkQuerier(rh, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open querier for head %s", rh)
}
@ -2049,37 +2086,28 @@ func (db *DB) blockChunkQuerierForRange(mint, maxt int64) ([]storage.ChunkQuerie
return nil, errors.Wrapf(err, "open querier for head while getting new querier %s", rh)
}
}
if inOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, inOrderHeadQuerier)
}
}
var outOfOrderHeadQuerier storage.ChunkQuerier
if overlapsClosedInterval(mint, maxt, db.head.MinOOOTime(), db.head.MaxOOOTime()) {
rh := NewOOORangeHead(db.head, mint, maxt)
var err error
outOfOrderHeadQuerier, err = NewBlockChunkQuerier(rh, mint, maxt)
rh := NewOOORangeHead(db.head, mint, maxt, db.lastGarbageCollectedMmapRef)
outOfOrderHeadQuerier, err := NewBlockChunkQuerier(rh, mint, maxt)
if err != nil {
return nil, errors.Wrapf(err, "open block chunk querier for ooo head %s", rh)
}
blockQueriers = append(blockQueriers, outOfOrderHeadQuerier)
}
blockQueriers := make([]storage.ChunkQuerier, 0, len(blocks))
for _, b := range blocks {
q, err := NewBlockChunkQuerier(b, mint, maxt)
if err == nil {
blockQueriers = append(blockQueriers, q)
continue
if err != nil {
return nil, errors.Wrapf(err, "open querier for block %s", b)
}
// If we fail, all previously opened queriers must be closed.
for _, q := range blockQueriers {
// TODO(bwplotka): Handle error.
_ = q.Close()
}
return nil, errors.Wrapf(err, "open querier for block %s", b)
}
if inOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, inOrderHeadQuerier)
}
if outOfOrderHeadQuerier != nil {
blockQueriers = append(blockQueriers, outOfOrderHeadQuerier)
blockQueriers = append(blockQueriers, q)
}
return blockQueriers, nil

View file

@ -38,6 +38,7 @@ import (
"github.com/prometheus/client_golang/prometheus"
prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"
"go.uber.org/atomic"
"go.uber.org/goleak"
"github.com/prometheus/prometheus/config"
@ -515,7 +516,7 @@ func TestAmendHistogramDatapointCausesError(t *testing.T) {
h := histogram.Histogram{
Schema: 3,
Count: 61,
Count: 52,
Sum: 2.7,
ZeroThreshold: 0.1,
ZeroCount: 42,
@ -2915,8 +2916,9 @@ func TestChunkWriter_ReadAfterWrite(t *testing.T) {
for _, chks := range test.chks {
for _, chkExp := range chks {
chkAct, err := r.Chunk(chkExp)
chkAct, iterable, err := r.ChunkOrIterable(chkExp)
require.NoError(t, err)
require.Nil(t, iterable)
require.Equal(t, chkExp.Chunk.Bytes(), chkAct.Bytes())
}
}
@ -2975,8 +2977,9 @@ func TestChunkReader_ConcurrentReads(t *testing.T) {
go func(chunk chunks.Meta) {
defer wg.Done()
chkAct, err := r.Chunk(chunk)
chkAct, iterable, err := r.ChunkOrIterable(chunk)
require.NoError(t, err)
require.Nil(t, iterable)
require.Equal(t, chunk.Chunk.Bytes(), chkAct.Bytes())
}(chk)
}
@ -3089,7 +3092,7 @@ func deleteNonBlocks(dbDir string) error {
}
for _, dir := range dirs {
if ok := isBlockDir(dir); !ok {
return errors.Errorf("root folder:%v still hase non block directory:%v", dbDir, dir.Name())
return fmt.Errorf("root folder:%v still hase non block directory:%v", dbDir, dir.Name())
}
}
return nil
@ -3618,6 +3621,264 @@ func testChunkQuerierShouldNotPanicIfHeadChunkIsTruncatedWhileReadingQueriedChun
}
}
func TestQuerierShouldNotFailIfOOOCompactionOccursAfterRetrievingQuerier(t *testing.T) {
opts := DefaultOptions()
opts.OutOfOrderTimeWindow = 3 * DefaultBlockDuration
db := openTestDB(t, opts, nil)
defer func() {
require.NoError(t, db.Close())
}()
// Disable compactions so we can control it.
db.DisableCompactions()
metric := labels.FromStrings(labels.MetricName, "test_metric")
ctx := context.Background()
interval := int64(15 * time.Second / time.Millisecond)
ts := int64(0)
samplesWritten := 0
// Capture the first timestamp - this will be the timestamp of the OOO sample we'll append below.
oooTS := ts
ts += interval
// Push samples after the OOO sample we'll write below.
for ; ts < 10*interval; ts += interval {
app := db.Appender(ctx)
_, err := app.Append(0, metric, ts, float64(ts))
require.NoError(t, err)
require.NoError(t, app.Commit())
samplesWritten++
}
// Push a single OOO sample.
app := db.Appender(ctx)
_, err := app.Append(0, metric, oooTS, float64(ts))
require.NoError(t, err)
require.NoError(t, app.Commit())
samplesWritten++
// Get a querier.
querierCreatedBeforeCompaction, err := db.ChunkQuerier(0, math.MaxInt64)
require.NoError(t, err)
// Start OOO head compaction.
compactionComplete := atomic.NewBool(false)
go func() {
defer compactionComplete.Store(true)
require.NoError(t, db.CompactOOOHead(ctx))
require.Equal(t, float64(1), prom_testutil.ToFloat64(db.Head().metrics.chunksRemoved))
}()
// Give CompactOOOHead time to start work.
// If it does not wait for querierCreatedBeforeCompaction to be closed, then the query will return incorrect results or fail.
time.Sleep(time.Second)
require.False(t, compactionComplete.Load(), "compaction completed before reading chunks or closing querier created before compaction")
// Get another querier. This one should only use the compacted blocks from disk and ignore the chunks that will be garbage collected.
querierCreatedAfterCompaction, err := db.ChunkQuerier(0, math.MaxInt64)
require.NoError(t, err)
testQuerier := func(q storage.ChunkQuerier) {
// Query back the series.
hints := &storage.SelectHints{Start: 0, End: math.MaxInt64, Step: interval}
seriesSet := q.Select(ctx, true, hints, labels.MustNewMatcher(labels.MatchEqual, labels.MetricName, "test_metric"))
// Collect the iterator for the series.
var iterators []chunks.Iterator
for seriesSet.Next() {
iterators = append(iterators, seriesSet.At().Iterator(nil))
}
require.NoError(t, seriesSet.Err())
require.Len(t, iterators, 1)
iterator := iterators[0]
// Check that we can still successfully read all samples.
samplesRead := 0
for iterator.Next() {
samplesRead += iterator.At().Chunk.NumSamples()
}
require.NoError(t, iterator.Err())
require.Equal(t, samplesWritten, samplesRead)
}
testQuerier(querierCreatedBeforeCompaction)
require.False(t, compactionComplete.Load(), "compaction completed before closing querier created before compaction")
require.NoError(t, querierCreatedBeforeCompaction.Close())
require.Eventually(t, compactionComplete.Load, time.Second, 10*time.Millisecond, "compaction should complete after querier created before compaction was closed, and not wait for querier created after compaction")
// Use the querier created after compaction and confirm it returns the expected results (ie. from the disk block created from OOO head and in-order head) without error.
testQuerier(querierCreatedAfterCompaction)
require.NoError(t, querierCreatedAfterCompaction.Close())
}
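These tests all assert the same ordering property: OOO compaction must not finish while a querier created before it is still open, but must finish promptly once that querier is closed. A condensed sketch of the assertion machinery used above (slowOp and release are stand-ins; the packages match the imports already used in the test file):

```go
package example

import (
	"testing"
	"time"

	"github.com/stretchr/testify/require"
	"go.uber.org/atomic"
)

// assertWaitsForRelease checks that slowOp, run in the background, does not
// complete until release is called, and then completes shortly afterwards.
func assertWaitsForRelease(t *testing.T, slowOp, release func()) {
	done := atomic.NewBool(false)
	go func() {
		defer done.Store(true)
		slowOp()
	}()

	// Give slowOp a moment to start; if it does not wait on the held
	// resource it flips the flag too early and fails the next assertion.
	time.Sleep(100 * time.Millisecond)
	require.False(t, done.Load(), "operation finished before the resource was released")

	release()
	require.Eventually(t, done.Load, time.Second, 10*time.Millisecond)
}
```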
func TestQuerierShouldNotFailIfOOOCompactionOccursAfterSelecting(t *testing.T) {
opts := DefaultOptions()
opts.OutOfOrderTimeWindow = 3 * DefaultBlockDuration
db := openTestDB(t, opts, nil)
defer func() {
require.NoError(t, db.Close())
}()
// Disable compactions so we can control it.
db.DisableCompactions()
metric := labels.FromStrings(labels.MetricName, "test_metric")
ctx := context.Background()
interval := int64(15 * time.Second / time.Millisecond)
ts := int64(0)
samplesWritten := 0
// Capture the first timestamp - this will be the timestamp of the OOO sample we'll append below.
oooTS := ts
ts += interval
// Push samples after the OOO sample we'll write below.
for ; ts < 10*interval; ts += interval {
app := db.Appender(ctx)
_, err := app.Append(0, metric, ts, float64(ts))
require.NoError(t, err)
require.NoError(t, app.Commit())
samplesWritten++
}
// Push a single OOO sample.
app := db.Appender(ctx)
_, err := app.Append(0, metric, oooTS, float64(ts))
require.NoError(t, err)
require.NoError(t, app.Commit())
samplesWritten++
// Get a querier.
querier, err := db.ChunkQuerier(0, math.MaxInt64)
require.NoError(t, err)
// Query back the series.
hints := &storage.SelectHints{Start: 0, End: math.MaxInt64, Step: interval}
seriesSet := querier.Select(ctx, true, hints, labels.MustNewMatcher(labels.MatchEqual, labels.MetricName, "test_metric"))
// Start OOO head compaction.
compactionComplete := atomic.NewBool(false)
go func() {
defer compactionComplete.Store(true)
require.NoError(t, db.CompactOOOHead(ctx))
require.Equal(t, float64(1), prom_testutil.ToFloat64(db.Head().metrics.chunksRemoved))
}()
// Give CompactOOOHead time to start work.
// If it does not wait for the querier to be closed, then the query will return incorrect results or fail.
time.Sleep(time.Second)
require.False(t, compactionComplete.Load(), "compaction completed before reading chunks or closing querier")
// Collect the iterator for the series.
var iterators []chunks.Iterator
for seriesSet.Next() {
iterators = append(iterators, seriesSet.At().Iterator(nil))
}
require.NoError(t, seriesSet.Err())
require.Len(t, iterators, 1)
iterator := iterators[0]
// Check that we can still successfully read all samples.
samplesRead := 0
for iterator.Next() {
samplesRead += iterator.At().Chunk.NumSamples()
}
require.NoError(t, iterator.Err())
require.Equal(t, samplesWritten, samplesRead)
require.False(t, compactionComplete.Load(), "compaction completed before closing querier")
require.NoError(t, querier.Close())
require.Eventually(t, compactionComplete.Load, time.Second, 10*time.Millisecond, "compaction should complete after querier was closed")
}
func TestQuerierShouldNotFailIfOOOCompactionOccursAfterRetrievingIterators(t *testing.T) {
opts := DefaultOptions()
opts.OutOfOrderTimeWindow = 3 * DefaultBlockDuration
db := openTestDB(t, opts, nil)
defer func() {
require.NoError(t, db.Close())
}()
// Disable compactions so we can control it.
db.DisableCompactions()
metric := labels.FromStrings(labels.MetricName, "test_metric")
ctx := context.Background()
interval := int64(15 * time.Second / time.Millisecond)
ts := int64(0)
samplesWritten := 0
// Capture the first timestamp - this will be the timestamp of the OOO sample we'll append below.
oooTS := ts
ts += interval
// Push samples after the OOO sample we'll write below.
for ; ts < 10*interval; ts += interval {
app := db.Appender(ctx)
_, err := app.Append(0, metric, ts, float64(ts))
require.NoError(t, err)
require.NoError(t, app.Commit())
samplesWritten++
}
// Push a single OOO sample.
app := db.Appender(ctx)
_, err := app.Append(0, metric, oooTS, float64(ts))
require.NoError(t, err)
require.NoError(t, app.Commit())
samplesWritten++
// Get a querier.
querier, err := db.ChunkQuerier(0, math.MaxInt64)
require.NoError(t, err)
// Query back the series.
hints := &storage.SelectHints{Start: 0, End: math.MaxInt64, Step: interval}
seriesSet := querier.Select(ctx, true, hints, labels.MustNewMatcher(labels.MatchEqual, labels.MetricName, "test_metric"))
// Collect the iterator for the series.
var iterators []chunks.Iterator
for seriesSet.Next() {
iterators = append(iterators, seriesSet.At().Iterator(nil))
}
require.NoError(t, seriesSet.Err())
require.Len(t, iterators, 1)
iterator := iterators[0]
// Start OOO head compaction.
compactionComplete := atomic.NewBool(false)
go func() {
defer compactionComplete.Store(true)
require.NoError(t, db.CompactOOOHead(ctx))
require.Equal(t, float64(1), prom_testutil.ToFloat64(db.Head().metrics.chunksRemoved))
}()
// Give CompactOOOHead time to start work.
// If it does not wait for the querier to be closed, then the query will return incorrect results or fail.
time.Sleep(time.Second)
require.False(t, compactionComplete.Load(), "compaction completed before reading chunks or closing querier")
// Check that we can still successfully read all samples.
samplesRead := 0
for iterator.Next() {
samplesRead += iterator.At().Chunk.NumSamples()
}
require.NoError(t, iterator.Err())
require.Equal(t, samplesWritten, samplesRead)
require.False(t, compactionComplete.Load(), "compaction completed before closing querier")
require.NoError(t, querier.Close())
require.Eventually(t, compactionComplete.Load, time.Second, 10*time.Millisecond, "compaction should complete after querier was closed")
}
func newTestDB(t *testing.T) *DB {
dir := t.TempDir()
@ -6321,6 +6582,7 @@ func testHistogramAppendAndQueryHelper(t *testing.T, floatHistogram bool) {
t.Run("buckets disappearing", func(t *testing.T) {
h.PositiveSpans[1].Length--
h.PositiveBuckets = h.PositiveBuckets[:len(h.PositiveBuckets)-1]
h.Count -= 3
appendHistogram(series1, 110, h, &exp1, histogram.CounterReset)
testQuery("foo", "bar1", map[string][]chunks.Sample{series1.String(): exp1})
})
@ -6540,7 +6802,7 @@ func TestNativeHistogramFlag(t *testing.T) {
require.NoError(t, db.Close())
})
h := &histogram.Histogram{
Count: 10,
Count: 9,
ZeroCount: 4,
ZeroThreshold: 0.001,
Sum: 35.5,

View file

@ -15,13 +15,14 @@ package encoding
import (
"encoding/binary"
"errors"
"fmt"
"hash"
"hash/crc32"
"math"
"unsafe"
"github.com/dennwc/varint"
"github.com/pkg/errors"
)
var (
@ -153,7 +154,7 @@ func NewDecbufUvarintAt(bs ByteSlice, off int, castagnoliTable *crc32.Table) Dec
l, n := varint.Uvarint(b)
if n <= 0 || n > binary.MaxVarintLen32 {
return Decbuf{E: errors.Errorf("invalid uvarint %d", n)}
return Decbuf{E: fmt.Errorf("invalid uvarint %d", n)}
}
if bs.Len() < off+n+int(l)+4 {

View file

@ -38,7 +38,8 @@ func (es *multiError) Add(errs ...error) {
if err == nil {
continue
}
if merr, ok := err.(nonNilMultiError); ok {
var merr nonNilMultiError
if errors.As(err, &merr) {
*es = append(*es, merr.errs...)
continue
}

View file

@ -245,11 +245,26 @@ func (ce *CircularExemplarStorage) validateExemplar(key []byte, e exemplar.Exemp
// Check for duplicate vs last stored exemplar for this series.
// NB these are expected, and appending them is a no-op.
if ce.exemplars[idx.newest].exemplar.Equals(e) {
// For floats and classic histograms, there is only 1 exemplar per series,
// so this is sufficient. For native histograms with multiple exemplars per series,
// we have another check below.
newestExemplar := ce.exemplars[idx.newest].exemplar
if newestExemplar.Equals(e) {
return storage.ErrDuplicateExemplar
}
if e.Ts <= ce.exemplars[idx.newest].exemplar.Ts {
// Since during the scrape the exemplars are sorted first by timestamp, then value, then labels,
// if any of these conditions are true, we know that the exemplar is either a duplicate
// of a previous one (but not the most recent one as that is checked above) or out of order.
// We now allow exemplars with duplicate timestamps as long as they have different values and/or labels
// since that can happen for different buckets of a native histogram.
// We do not distinguish between duplicates and out-of-order exemplars: iterating through all
// stored exemplars to tell them apart would be expensive (versus just comparing with the most
// recent one), especially since this runs under a lock. It is also not worth it, as we only
// need to return an error so the exemplar is not appended.
if e.Ts < newestExemplar.Ts ||
(e.Ts == newestExemplar.Ts && e.Value < newestExemplar.Value) ||
(e.Ts == newestExemplar.Ts && e.Value == newestExemplar.Value && e.Labels.Hash() < newestExemplar.Labels.Hash()) {
if appended {
ce.metrics.outOfOrderExemplars.Inc()
}
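The ordering rule described in the comment above can be hard to read out of the combined boolean expression. Below is a minimal, standalone sketch of the same rule; the helper name sortsAtOrBefore is hypothetical and only the exemplar and labels packages from this repository are assumed.
package main

import (
	"fmt"

	"github.com/prometheus/prometheus/model/exemplar"
	"github.com/prometheus/prometheus/model/labels"
)

// sortsAtOrBefore mirrors the check above: an incoming exemplar is rejected when it
// sorts at or before the newest stored exemplar by (timestamp, value, labels hash).
// Exact equality is handled separately, as a duplicate.
func sortsAtOrBefore(incoming, newest exemplar.Exemplar) bool {
	return incoming.Ts < newest.Ts ||
		(incoming.Ts == newest.Ts && incoming.Value < newest.Value) ||
		(incoming.Ts == newest.Ts && incoming.Value == newest.Value &&
			incoming.Labels.Hash() < newest.Labels.Hash())
}

func main() {
	newest := exemplar.Exemplar{Ts: 100, Value: 2, Labels: labels.FromStrings("trace_id", "abc")}

	// Same timestamp, larger value (e.g. another native histogram bucket): accepted.
	fmt.Println(sortsAtOrBefore(exemplar.Exemplar{Ts: 100, Value: 3}, newest)) // false

	// Older timestamp: rejected as out of order or duplicate.
	fmt.Println(sortsAtOrBefore(exemplar.Exemplar{Ts: 90, Value: 5}, newest)) // true
}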

View file

@ -14,9 +14,8 @@
package fileutil
import (
"fmt"
"os"
"github.com/pkg/errors"
)
type MmapFile struct {
@ -31,7 +30,7 @@ func OpenMmapFile(path string) (*MmapFile, error) {
func OpenMmapFileWithSize(path string, size int) (mf *MmapFile, retErr error) {
f, err := os.Open(path)
if err != nil {
return nil, errors.Wrap(err, "try lock file")
return nil, fmt.Errorf("try lock file: %w", err)
}
defer func() {
if retErr != nil {
@ -41,14 +40,14 @@ func OpenMmapFileWithSize(path string, size int) (mf *MmapFile, retErr error) {
if size <= 0 {
info, err := f.Stat()
if err != nil {
return nil, errors.Wrap(err, "stat")
return nil, fmt.Errorf("stat: %w", err)
}
size = int(info.Size())
}
b, err := mmap(f, size)
if err != nil {
return nil, errors.Wrapf(err, "mmap, size %d", size)
return nil, fmt.Errorf("mmap, size %d: %w", size, err)
}
return &MmapFile{f: f, b: b}, nil
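For context on the errors.Wrap to fmt.Errorf migration in this file (and in the index and snapshot code further below): wrapping with the %w verb keeps the original error in the chain, so callers can still classify it with errors.Is and errors.As, just as they could with github.com/pkg/errors. A minimal sketch using only the standard library; the path is made up:
package main

import (
	"errors"
	"fmt"
	"os"
)

func main() {
	_, err := os.Open("/definitely/missing/file")

	// Equivalent of the former errors.Wrap(err, "try lock file").
	wrapped := fmt.Errorf("try lock file: %w", err)

	// The cause is still reachable through the wrap chain.
	fmt.Println(errors.Is(wrapped, os.ErrNotExist)) // true
	fmt.Println(wrapped)                            // try lock file: open /definitely/missing/file: ...
}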

View file

@ -15,6 +15,7 @@
package fileutil
import (
"errors"
"os"
"syscall"
)
@ -23,10 +24,10 @@ func preallocExtend(f *os.File, sizeInBytes int64) error {
// use mode = 0 to change size
err := syscall.Fallocate(int(f.Fd()), 0, 0, sizeInBytes)
if err != nil {
errno, ok := err.(syscall.Errno)
var errno syscall.Errno
// not supported; fallback
// fallocate EINTRs frequently in some environments; fallback
if ok && (errno == syscall.ENOTSUP || errno == syscall.EINTR) {
if errors.As(err, &errno) && (errno == syscall.ENOTSUP || errno == syscall.EINTR) {
return preallocExtendTrunc(f, sizeInBytes)
}
}
@ -37,9 +38,9 @@ func preallocFixed(f *os.File, sizeInBytes int64) error {
// use mode = 1 to keep size; see FALLOC_FL_KEEP_SIZE
err := syscall.Fallocate(int(f.Fd()), 1, 0, sizeInBytes)
if err != nil {
errno, ok := err.(syscall.Errno)
var errno syscall.Errno
// treat not supported as nil error
if ok && errno == syscall.ENOTSUP {
if errors.As(err, &errno) && errno == syscall.ENOTSUP {
return nil
}
}
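The switch from a direct type assertion to errors.As in this hunk (and in the multiError change above) matters once errors get wrapped: errors.As walks the wrap chain, while err.(syscall.Errno) only matches the outermost value. A small Linux-oriented sketch of that difference, matching the fallocate code here:
package main

import (
	"errors"
	"fmt"
	"syscall"
)

func main() {
	// A syscall error wrapped the way callers commonly annotate failures.
	err := fmt.Errorf("fallocate: %w", syscall.ENOTSUP)

	if _, ok := err.(syscall.Errno); ok {
		fmt.Println("type assertion matched") // not reached: err is the wrapper, not an Errno
	}

	var errno syscall.Errno
	if errors.As(err, &errno) && errno == syscall.ENOTSUP {
		fmt.Println("errors.As matched ENOTSUP; falling back to preallocExtendTrunc")
	}
}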

View file

@ -120,6 +120,8 @@ type Head struct {
iso *isolation
oooIso *oooIsolation
cardinalityMutex sync.Mutex
cardinalityCache *index.PostingsStats // Posting stats cache which will expire after 30sec.
lastPostingsStatsCall time.Duration // Last posting stats call (PostingsCardinalityStats()) time for caching.
@ -255,11 +257,11 @@ func NewHead(r prometheus.Registerer, l log.Logger, wal, wbl *wlog.WL, opts *Hea
// even if ooo is not enabled yet.
capMax := opts.OutOfOrderCapMax.Load()
if capMax <= 0 || capMax > 255 {
return nil, errors.Errorf("OOOCapMax of %d is invalid. must be > 0 and <= 255", capMax)
return nil, fmt.Errorf("OOOCapMax of %d is invalid. must be > 0 and <= 255", capMax)
}
if opts.ChunkRange < 1 {
return nil, errors.Errorf("invalid chunk range %d", opts.ChunkRange)
return nil, fmt.Errorf("invalid chunk range %d", opts.ChunkRange)
}
if opts.SeriesCallback == nil {
opts.SeriesCallback = &noopSeriesLifecycleCallback{}
@ -340,6 +342,7 @@ func (h *Head) resetInMemoryState() error {
}
h.iso = newIsolation(h.opts.IsolationDisabled)
h.oooIso = newOOOIsolation()
h.exemplarMetrics = em
h.exemplars = es
@ -898,7 +901,7 @@ func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries)
slice := mmappedChunks[seriesRef]
if len(slice) > 0 && slice[len(slice)-1].maxTime >= mint {
h.metrics.mmapChunkCorruptionTotal.Inc()
return errors.Errorf("out of sequence m-mapped chunk for series ref %d, last chunk: [%d, %d], new: [%d, %d]",
return fmt.Errorf("out of sequence m-mapped chunk for series ref %d, last chunk: [%d, %d], new: [%d, %d]",
seriesRef, slice[len(slice)-1].minTime, slice[len(slice)-1].maxTime, mint, maxt)
}
slice = append(slice, &mmappedChunk{
@ -913,7 +916,7 @@ func (h *Head) loadMmappedChunks(refSeries map[chunks.HeadSeriesRef]*memSeries)
if len(ms.mmappedChunks) > 0 && ms.mmappedChunks[len(ms.mmappedChunks)-1].maxTime >= mint {
h.metrics.mmapChunkCorruptionTotal.Inc()
return errors.Errorf("out of sequence m-mapped chunk for series ref %d, last chunk: [%d, %d], new: [%d, %d]",
return fmt.Errorf("out of sequence m-mapped chunk for series ref %d, last chunk: [%d, %d], new: [%d, %d]",
seriesRef, ms.mmappedChunks[len(ms.mmappedChunks)-1].minTime, ms.mmappedChunks[len(ms.mmappedChunks)-1].maxTime,
mint, maxt)
}
@ -1174,6 +1177,14 @@ func (h *Head) WaitForPendingReadersInTimeRange(mint, maxt int64) {
}
}
// WaitForPendingReadersForOOOChunksAtOrBefore is like WaitForPendingReadersInTimeRange, except it waits for
// queries touching OOO chunks with a reference less than or equal to chunk to finish querying.
func (h *Head) WaitForPendingReadersForOOOChunksAtOrBefore(chunk chunks.ChunkDiskMapperRef) {
for h.oooIso.HasOpenReadsAtOrBefore(chunk) {
time.Sleep(500 * time.Millisecond)
}
}
// WaitForAppendersOverlapping waits for appends overlapping maxt to finish.
func (h *Head) WaitForAppendersOverlapping(maxt int64) {
for maxt >= h.iso.lowestAppendTime() {
@ -1312,13 +1323,19 @@ func (h *Head) truncateWAL(mint int64) error {
}
// truncateOOO
// - waits for any pending reads that potentially touch chunks less than or equal to newMinOOOMmapRef
// - truncates the OOO WBL files whose index is strictly less than lastWBLFile.
// - garbage collects all the m-map chunks from the memory that are less than or equal to minOOOMmapRef
// - garbage collects all the m-map chunks from the memory that are less than or equal to newMinOOOMmapRef
// and then deletes the series that do not have any data anymore.
func (h *Head) truncateOOO(lastWBLFile int, minOOOMmapRef chunks.ChunkDiskMapperRef) error {
//
// The caller is responsible for ensuring that no further queriers will be created that reference chunks less
// than or equal to newMinOOOMmapRef before calling truncateOOO.
func (h *Head) truncateOOO(lastWBLFile int, newMinOOOMmapRef chunks.ChunkDiskMapperRef) error {
curMinOOOMmapRef := chunks.ChunkDiskMapperRef(h.minOOOMmapRef.Load())
if minOOOMmapRef.GreaterThan(curMinOOOMmapRef) {
h.minOOOMmapRef.Store(uint64(minOOOMmapRef))
if newMinOOOMmapRef.GreaterThan(curMinOOOMmapRef) {
h.WaitForPendingReadersForOOOChunksAtOrBefore(newMinOOOMmapRef)
h.minOOOMmapRef.Store(uint64(newMinOOOMmapRef))
if err := h.truncateSeriesAndChunkDiskMapper("truncateOOO"); err != nil {
return err
}
@ -1448,11 +1465,13 @@ func (h *RangeHead) NumSeries() uint64 {
return h.head.NumSeries()
}
var rangeHeadULID = ulid.MustParse("0000000000XXXXXXXRANGEHEAD")
func (h *RangeHead) Meta() BlockMeta {
return BlockMeta{
MinTime: h.MinTime(),
MaxTime: h.MaxTime(),
ULID: h.head.Meta().ULID,
ULID: rangeHeadULID,
Stats: BlockStats{
NumSeries: h.NumSeries(),
},
@ -1578,15 +1597,15 @@ func (h *Head) NumSeries() uint64 {
return h.numSeries.Load()
}
var headULID = ulid.MustParse("0000000000XXXXXXXXXXXXHEAD")
// Meta returns meta information about the head.
// The head is dynamic so will return dynamic results.
func (h *Head) Meta() BlockMeta {
var id [16]byte
copy(id[:], "______head______")
return BlockMeta{
MinTime: h.MinTime(),
MaxTime: h.MaxTime(),
ULID: ulid.ULID(id),
ULID: headULID,
Stats: BlockStats{
NumSeries: h.NumSeries(),
},
@ -1634,9 +1653,6 @@ func (h *Head) Close() error {
h.mmapHeadChunks()
errs := tsdb_errors.NewMulti(h.chunkDiskMapper.Close())
if errs.Err() == nil && h.opts.EnableMemorySnapshotOnShutdown {
errs.Add(h.performChunkSnapshot())
}
if h.wal != nil {
errs.Add(h.wal.Close())
}
@ -1708,26 +1724,34 @@ func (h *Head) mmapHeadChunks() {
var count int
for i := 0; i < h.series.size; i++ {
h.series.locks[i].RLock()
for _, all := range h.series.hashes[i] {
for _, series := range all {
series.Lock()
count += series.mmapChunks(h.chunkDiskMapper)
series.Unlock()
}
for _, series := range h.series.series[i] {
series.Lock()
count += series.mmapChunks(h.chunkDiskMapper)
series.Unlock()
}
h.series.locks[i].RUnlock()
}
h.metrics.mmapChunksTotal.Add(float64(count))
}
// seriesHashmap is a simple hashmap for memSeries by their label set. It is built
// on top of a regular hashmap and holds a slice of series to resolve hash collisions.
// seriesHashmap lets TSDB find a memSeries by its label set, via a 64-bit hash.
// There is one map for the common case where the hash value is unique, and a
// second map for the case that two series have the same hash value.
// Each series is in only one of the maps.
// Its methods require the hash to be submitted with it to avoid re-computations throughout
// the code.
type seriesHashmap map[uint64][]*memSeries
type seriesHashmap struct {
unique map[uint64]*memSeries
conflicts map[uint64][]*memSeries
}
func (m seriesHashmap) get(hash uint64, lset labels.Labels) *memSeries {
for _, s := range m[hash] {
func (m *seriesHashmap) get(hash uint64, lset labels.Labels) *memSeries {
if s, found := m.unique[hash]; found {
if labels.Equal(s.lset, lset) {
return s
}
}
for _, s := range m.conflicts[hash] {
if labels.Equal(s.lset, lset) {
return s
}
@ -1735,28 +1759,50 @@ func (m seriesHashmap) get(hash uint64, lset labels.Labels) *memSeries {
return nil
}
func (m seriesHashmap) set(hash uint64, s *memSeries) {
l := m[hash]
func (m *seriesHashmap) set(hash uint64, s *memSeries) {
if existing, found := m.unique[hash]; !found || labels.Equal(existing.lset, s.lset) {
m.unique[hash] = s
return
}
if m.conflicts == nil {
m.conflicts = make(map[uint64][]*memSeries)
}
l := m.conflicts[hash]
for i, prev := range l {
if labels.Equal(prev.lset, s.lset) {
l[i] = s
return
}
}
m[hash] = append(l, s)
m.conflicts[hash] = append(l, s)
}
func (m seriesHashmap) del(hash uint64, lset labels.Labels) {
func (m *seriesHashmap) del(hash uint64, lset labels.Labels) {
var rem []*memSeries
for _, s := range m[hash] {
if !labels.Equal(s.lset, lset) {
rem = append(rem, s)
unique, found := m.unique[hash]
switch {
case !found:
return
case labels.Equal(unique.lset, lset):
conflicts := m.conflicts[hash]
if len(conflicts) == 0 {
delete(m.unique, hash)
return
}
rem = conflicts
default:
rem = append(rem, unique)
for _, s := range m.conflicts[hash] {
if !labels.Equal(s.lset, lset) {
rem = append(rem, s)
}
}
}
if len(rem) == 0 {
delete(m, hash)
m.unique[hash] = rem[0]
if len(rem) == 1 {
delete(m.conflicts, hash)
} else {
m[hash] = rem
m.conflicts[hash] = rem[1:]
}
}
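To make the unique/conflicts split above more concrete, here is a simplified, hypothetical sketch of the same idea using plain strings instead of memSeries; the name collisionMap is made up for illustration, and the real implementation above additionally matches entries by their full label set rather than string equality.
package main

import "fmt"

// collisionMap illustrates the seriesHashmap design: the common case (one entry
// per hash) lives in a plain map, and only genuinely colliding entries pay the
// cost of a slice in the conflicts map.
type collisionMap struct {
	unique    map[uint64]string
	conflicts map[uint64][]string
}

func (m *collisionMap) set(hash uint64, v string) {
	if existing, found := m.unique[hash]; !found || existing == v {
		m.unique[hash] = v
		return
	}
	if m.conflicts == nil {
		m.conflicts = map[uint64][]string{} // initialized on demand, as above
	}
	for i, prev := range m.conflicts[hash] {
		if prev == v {
			m.conflicts[hash][i] = v
			return
		}
	}
	m.conflicts[hash] = append(m.conflicts[hash], v)
}

func (m *collisionMap) get(hash uint64, v string) bool {
	if s, found := m.unique[hash]; found && s == v {
		return true
	}
	for _, s := range m.conflicts[hash] {
		if s == v {
			return true
		}
	}
	return false
}

func main() {
	m := &collisionMap{unique: map[uint64]string{}}
	m.set(42, "seriesA")
	m.set(42, "seriesB") // same hash, different "series": stored in conflicts
	fmt.Println(m.get(42, "seriesA"), m.get(42, "seriesB")) // true true
}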
@ -1798,7 +1844,10 @@ func newStripeSeries(stripeSize int, seriesCallback SeriesLifecycleCallback) *st
s.series[i] = map[chunks.HeadSeriesRef]*memSeries{}
}
for i := range s.hashes {
s.hashes[i] = seriesHashmap{}
s.hashes[i] = seriesHashmap{
unique: map[uint64]*memSeries{},
conflicts: nil, // Initialized on demand in set().
}
}
return s
}
@ -1818,72 +1867,76 @@ func (s *stripeSeries) gc(mint int64, minOOOMmapRef chunks.ChunkDiskMapperRef) (
deletedFromPrevStripe = 0
)
minMmapFile = math.MaxInt32
// Run through all series and truncate old chunks. Mark those with no
// chunks left as deleted and store their ID.
// For one series, truncate old chunks and check whether any chunks are left. If not, mark the series as deleted and collect its ID.
check := func(hashShard int, hash uint64, series *memSeries, deletedForCallback map[chunks.HeadSeriesRef]labels.Labels) {
series.Lock()
defer series.Unlock()
rmChunks += series.truncateChunksBefore(mint, minOOOMmapRef)
if len(series.mmappedChunks) > 0 {
seq, _ := series.mmappedChunks[0].ref.Unpack()
if seq < minMmapFile {
minMmapFile = seq
}
}
if series.ooo != nil && len(series.ooo.oooMmappedChunks) > 0 {
seq, _ := series.ooo.oooMmappedChunks[0].ref.Unpack()
if seq < minMmapFile {
minMmapFile = seq
}
for _, ch := range series.ooo.oooMmappedChunks {
if ch.minTime < minOOOTime {
minOOOTime = ch.minTime
}
}
}
if series.ooo != nil && series.ooo.oooHeadChunk != nil {
if series.ooo.oooHeadChunk.minTime < minOOOTime {
minOOOTime = series.ooo.oooHeadChunk.minTime
}
}
if len(series.mmappedChunks) > 0 || series.headChunks != nil || series.pendingCommit ||
(series.ooo != nil && (len(series.ooo.oooMmappedChunks) > 0 || series.ooo.oooHeadChunk != nil)) {
seriesMint := series.minTime()
if seriesMint < actualMint {
actualMint = seriesMint
}
return
}
// The series is gone entirely. We need to keep the series lock
// and make sure we have acquired the stripe locks for hash and ID of the
// series alike.
// If we don't hold them all, there's a very small chance that a series receives
// samples again while we are half-way into deleting it.
refShard := int(series.ref) & (s.size - 1)
if hashShard != refShard {
s.locks[refShard].Lock()
defer s.locks[refShard].Unlock()
}
deleted[storage.SeriesRef(series.ref)] = struct{}{}
s.hashes[hashShard].del(hash, series.lset)
delete(s.series[refShard], series.ref)
deletedForCallback[series.ref] = series.lset
}
// Run through all series shard by shard, checking which should be deleted.
for i := 0; i < s.size; i++ {
deletedForCallback := make(map[chunks.HeadSeriesRef]labels.Labels, deletedFromPrevStripe)
s.locks[i].Lock()
for hash, all := range s.hashes[i] {
// Delete conflicts first, so that seriesHashmap.del doesn't move them to the `unique` field
// after the corresponding `unique` series has been deleted.
for hash, all := range s.hashes[i].conflicts {
for _, series := range all {
series.Lock()
rmChunks += series.truncateChunksBefore(mint, minOOOMmapRef)
if len(series.mmappedChunks) > 0 {
seq, _ := series.mmappedChunks[0].ref.Unpack()
if seq < minMmapFile {
minMmapFile = seq
}
}
if series.ooo != nil && len(series.ooo.oooMmappedChunks) > 0 {
seq, _ := series.ooo.oooMmappedChunks[0].ref.Unpack()
if seq < minMmapFile {
minMmapFile = seq
}
for _, ch := range series.ooo.oooMmappedChunks {
if ch.minTime < minOOOTime {
minOOOTime = ch.minTime
}
}
}
if series.ooo != nil && series.ooo.oooHeadChunk != nil {
if series.ooo.oooHeadChunk.minTime < minOOOTime {
minOOOTime = series.ooo.oooHeadChunk.minTime
}
}
if len(series.mmappedChunks) > 0 || series.headChunks != nil || series.pendingCommit ||
(series.ooo != nil && (len(series.ooo.oooMmappedChunks) > 0 || series.ooo.oooHeadChunk != nil)) {
seriesMint := series.minTime()
if seriesMint < actualMint {
actualMint = seriesMint
}
series.Unlock()
continue
}
// The series is gone entirely. We need to keep the series lock
// and make sure we have acquired the stripe locks for hash and ID of the
// series alike.
// If we don't hold them all, there's a very small chance that a series receives
// samples again while we are half-way into deleting it.
j := int(series.ref) & (s.size - 1)
if i != j {
s.locks[j].Lock()
}
deleted[storage.SeriesRef(series.ref)] = struct{}{}
s.hashes[i].del(hash, series.lset)
delete(s.series[j], series.ref)
deletedForCallback[series.ref] = series.lset
if i != j {
s.locks[j].Unlock()
}
series.Unlock()
check(i, hash, series, deletedForCallback)
}
}
for hash, series := range s.hashes[i].unique {
check(i, hash, series, deletedForCallback)
}
s.locks[i].Unlock()
@ -2290,7 +2343,11 @@ func (h *Head) ForEachSecondaryHash(fn func(secondaryHash []uint32)) {
buf = buf[:0]
h.series.locks[i].RLock()
for _, all := range h.series.hashes[i] {
for _, s := range h.series.hashes[i].unique {
// No need to lock series lock, as we're only accessing its immutable secondary hash.
buf = append(buf, s.secondaryHash)
}
for _, all := range h.series.hashes[i].conflicts {
for _, s := range all {
// No need to lock series lock, as we're only accessing its immutable secondary hash.
buf = append(buf, s.secondaryHash)

View file

@ -528,13 +528,13 @@ func (a *headAppender) AppendHistogram(ref storage.SeriesRef, lset labels.Labels
}
if h != nil {
if err := ValidateHistogram(h); err != nil {
if err := h.Validate(); err != nil {
return 0, err
}
}
if fh != nil {
if err := ValidateFloatHistogram(fh); err != nil {
if err := fh.Validate(); err != nil {
return 0, err
}
}
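Histogram validation now lives on the histogram types themselves ((*histogram.Histogram).Validate and (*histogram.FloatHistogram).Validate), replacing the package-local ValidateHistogram/ValidateFloatHistogram helpers removed in the next hunk. A minimal usage sketch of the method the appender calls here, mirroring the "too few positive buckets" case from the removed TestHistogramValidation further below:
package main

import (
	"fmt"

	"github.com/prometheus/prometheus/model/histogram"
)

func main() {
	// Spans promise one positive bucket, but none are provided.
	h := &histogram.Histogram{
		PositiveSpans:   []histogram.Span{{Offset: 0, Length: 1}},
		PositiveBuckets: []int64{},
	}
	if err := h.Validate(); err != nil {
		fmt.Println("rejected by the appender:", err)
	}
}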
@ -649,103 +649,6 @@ func (a *headAppender) UpdateMetadata(ref storage.SeriesRef, lset labels.Labels,
return ref, nil
}
func ValidateHistogram(h *histogram.Histogram) error {
if err := checkHistogramSpans(h.NegativeSpans, len(h.NegativeBuckets)); err != nil {
return errors.Wrap(err, "negative side")
}
if err := checkHistogramSpans(h.PositiveSpans, len(h.PositiveBuckets)); err != nil {
return errors.Wrap(err, "positive side")
}
var nCount, pCount uint64
err := checkHistogramBuckets(h.NegativeBuckets, &nCount, true)
if err != nil {
return errors.Wrap(err, "negative side")
}
err = checkHistogramBuckets(h.PositiveBuckets, &pCount, true)
if err != nil {
return errors.Wrap(err, "positive side")
}
if c := nCount + pCount + h.ZeroCount; c > h.Count {
return errors.Wrap(
storage.ErrHistogramCountNotBigEnough,
fmt.Sprintf("%d observations found in buckets, but the Count field is %d", c, h.Count),
)
}
return nil
}
func ValidateFloatHistogram(h *histogram.FloatHistogram) error {
if err := checkHistogramSpans(h.NegativeSpans, len(h.NegativeBuckets)); err != nil {
return errors.Wrap(err, "negative side")
}
if err := checkHistogramSpans(h.PositiveSpans, len(h.PositiveBuckets)); err != nil {
return errors.Wrap(err, "positive side")
}
var nCount, pCount float64
err := checkHistogramBuckets(h.NegativeBuckets, &nCount, false)
if err != nil {
return errors.Wrap(err, "negative side")
}
err = checkHistogramBuckets(h.PositiveBuckets, &pCount, false)
if err != nil {
return errors.Wrap(err, "positive side")
}
// We do not check for h.Count being at least as large as the sum of the
// counts in the buckets because floating point precision issues can
// create false positives here.
return nil
}
func checkHistogramSpans(spans []histogram.Span, numBuckets int) error {
var spanBuckets int
for n, span := range spans {
if n > 0 && span.Offset < 0 {
return errors.Wrap(
storage.ErrHistogramSpanNegativeOffset,
fmt.Sprintf("span number %d with offset %d", n+1, span.Offset),
)
}
spanBuckets += int(span.Length)
}
if spanBuckets != numBuckets {
return errors.Wrap(
storage.ErrHistogramSpansBucketsMismatch,
fmt.Sprintf("spans need %d buckets, have %d buckets", spanBuckets, numBuckets),
)
}
return nil
}
func checkHistogramBuckets[BC histogram.BucketCount, IBC histogram.InternalBucketCount](buckets []IBC, count *BC, deltas bool) error {
if len(buckets) == 0 {
return nil
}
var last IBC
for i := 0; i < len(buckets); i++ {
var c IBC
if deltas {
c = last + buckets[i]
} else {
c = buckets[i]
}
if c < 0 {
return errors.Wrap(
storage.ErrHistogramNegativeBucketCount,
fmt.Sprintf("bucket number %d has observation count of %v", i+1, c),
)
}
last = c
*count += BC(c)
}
return nil
}
var _ storage.GetRef = &headAppender{}
func (a *headAppender) GetRef(lset labels.Labels, hash uint64) (storage.SeriesRef, labels.Labels) {
@ -793,14 +696,6 @@ func (a *headAppender) log() error {
return errors.Wrap(err, "log samples")
}
}
if len(a.exemplars) > 0 {
rec = enc.Exemplars(exemplarsForEncoding(a.exemplars), buf)
buf = rec[:0]
if err := a.head.wal.Log(rec); err != nil {
return errors.Wrap(err, "log exemplars")
}
}
if len(a.histograms) > 0 {
rec = enc.HistogramSamples(a.histograms, buf)
buf = rec[:0]
@ -815,6 +710,18 @@ func (a *headAppender) log() error {
return errors.Wrap(err, "log float histograms")
}
}
// Exemplars should be logged after samples (float/native histogram/etc),
// otherwise it might happen that we send the exemplars in a remote write
// batch before the samples, which in turn means the exemplar is rejected
// for missing series, since series are created due to samples.
if len(a.exemplars) > 0 {
rec = enc.Exemplars(exemplarsForEncoding(a.exemplars), buf)
buf = rec[:0]
if err := a.head.wal.Log(rec); err != nil {
return errors.Wrap(err, "log exemplars")
}
}
return nil
}
@ -851,6 +758,12 @@ func (a *headAppender) Commit() (err error) {
// No errors logging to WAL, so pass the exemplars along to the in memory storage.
for _, e := range a.exemplars {
s := a.head.series.getByID(chunks.HeadSeriesRef(e.ref))
if s == nil {
// This is very unlikely to happen, but we have seen it in the wild.
// It means that the series was truncated between AppendExemplar and Commit.
// See TestHeadCompactionWhileAppendAndCommitExemplar.
continue
}
// We don't instrument exemplar appends here, all is instrumented by storage.
if err := a.head.exemplars.AddExemplar(s.lset, e.exemplar); err != nil {
if err == storage.ErrOutOfOrderExemplar {

View file

@ -318,10 +318,10 @@ func (h *headChunkReader) Close() error {
return nil
}
// Chunk returns the chunk for the reference number.
func (h *headChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
// ChunkOrIterable returns the chunk for the reference number.
func (h *headChunkReader) ChunkOrIterable(meta chunks.Meta) (chunkenc.Chunk, chunkenc.Iterable, error) {
chk, _, err := h.chunk(meta, false)
return chk, err
return chk, nil, err
}
// ChunkWithCopy returns the chunk for the reference number.
@ -445,13 +445,13 @@ func (s *memSeries) chunk(id chunks.HeadChunkID, cdm chunkDiskMapper, memChunkPo
return elem, true, offset == 0, nil
}
// oooMergedChunk returns the requested chunk based on the given chunks.Meta
// reference from memory or by m-mapping it from the disk. The returned chunk
// might be a merge of all the overlapping chunks, if any, amongst all the
// chunks in the OOOHead.
// oooMergedChunks returns an iterable over one or more OOO chunks for the given
// chunks.Meta reference from memory or by m-mapping it from the disk. The
// returned iterable will be a merge of all the overlapping chunks, if any,
// amongst all the chunks in the OOOHead.
// This function is not thread safe unless the caller holds a lock.
// The caller must ensure that s.ooo is not nil.
func (s *memSeries) oooMergedChunk(meta chunks.Meta, cdm chunkDiskMapper, mint, maxt int64) (chunk *mergedOOOChunks, err error) {
func (s *memSeries) oooMergedChunks(meta chunks.Meta, cdm chunkDiskMapper, mint, maxt int64) (*mergedOOOChunks, error) {
_, cid := chunks.HeadChunkRef(meta.Ref).Unpack()
// ix represents the index of chunk in the s.mmappedChunks slice. The chunk meta's are
@ -528,11 +528,13 @@ func (s *memSeries) oooMergedChunk(meta chunks.Meta, cdm chunkDiskMapper, mint,
mc := &mergedOOOChunks{}
absoluteMax := int64(math.MinInt64)
for _, c := range tmpChks {
if c.meta.Ref != meta.Ref && (len(mc.chunks) == 0 || c.meta.MinTime > absoluteMax) {
if c.meta.Ref != meta.Ref && (len(mc.chunkIterables) == 0 || c.meta.MinTime > absoluteMax) {
continue
}
var iterable chunkenc.Iterable
if c.meta.Ref == oooHeadRef {
var xor *chunkenc.XORChunk
var err error
// If head chunk min and max time match the meta OOO markers
// that means that the chunk has not expanded so we can append
// it as it is.
@ -545,7 +547,7 @@ func (s *memSeries) oooMergedChunk(meta chunks.Meta, cdm chunkDiskMapper, mint,
if err != nil {
return nil, errors.Wrap(err, "failed to convert ooo head chunk to xor chunk")
}
c.meta.Chunk = xor
iterable = xor
} else {
chk, err := cdm.Chunk(c.ref)
if err != nil {
@ -560,12 +562,12 @@ func (s *memSeries) oooMergedChunk(meta chunks.Meta, cdm chunkDiskMapper, mint,
// wrap the chunk within a chunk that doesn't allow us to iterate
// through samples outside of the OOOLastMinT and OOOLastMaxT
// markers.
c.meta.Chunk = boundedChunk{chk, meta.OOOLastMinTime, meta.OOOLastMaxTime}
iterable = boundedIterable{chk, meta.OOOLastMinTime, meta.OOOLastMaxTime}
} else {
c.meta.Chunk = chk
iterable = chk
}
}
mc.chunks = append(mc.chunks, c.meta)
mc.chunkIterables = append(mc.chunkIterables, iterable)
if c.meta.MaxTime > absoluteMax {
absoluteMax = c.meta.MaxTime
}
@ -574,77 +576,30 @@ func (s *memSeries) oooMergedChunk(meta chunks.Meta, cdm chunkDiskMapper, mint,
return mc, nil
}
var _ chunkenc.Chunk = &mergedOOOChunks{}
var _ chunkenc.Iterable = &mergedOOOChunks{}
// mergedOOOChunks holds the list of overlapping chunks. This struct satisfies
// chunkenc.Chunk.
// mergedOOOChunks holds the list of iterables for overlapping chunks.
type mergedOOOChunks struct {
chunks []chunks.Meta
}
// Bytes is a very expensive method because it calls the iterator of all the
// chunks in the mergedOOOChunk and builds a new chunk with the samples.
func (o mergedOOOChunks) Bytes() []byte {
xc := chunkenc.NewXORChunk()
app, err := xc.Appender()
if err != nil {
panic(err)
}
it := o.Iterator(nil)
for it.Next() == chunkenc.ValFloat {
t, v := it.At()
app.Append(t, v)
}
return xc.Bytes()
}
func (o mergedOOOChunks) Encoding() chunkenc.Encoding {
return chunkenc.EncXOR
}
func (o mergedOOOChunks) Appender() (chunkenc.Appender, error) {
return nil, errors.New("can't append to mergedOOOChunks")
chunkIterables []chunkenc.Iterable
}
func (o mergedOOOChunks) Iterator(iterator chunkenc.Iterator) chunkenc.Iterator {
return storage.ChainSampleIteratorFromMetas(iterator, o.chunks)
return storage.ChainSampleIteratorFromIterables(iterator, o.chunkIterables)
}
func (o mergedOOOChunks) NumSamples() int {
samples := 0
for _, c := range o.chunks {
samples += c.Chunk.NumSamples()
}
return samples
}
var _ chunkenc.Iterable = &boundedIterable{}
func (o mergedOOOChunks) Compact() {}
var _ chunkenc.Chunk = &boundedChunk{}
// boundedChunk is an implementation of chunkenc.Chunk that uses a
// boundedIterable is an implementation of chunkenc.Iterable that uses a
// boundedIterator that only iterates through samples whose timestamps are
// >= minT and <= maxT.
type boundedChunk struct {
chunkenc.Chunk
minT int64
maxT int64
type boundedIterable struct {
chunk chunkenc.Chunk
minT int64
maxT int64
}
func (b boundedChunk) Bytes() []byte {
xor := chunkenc.NewXORChunk()
a, _ := xor.Appender()
it := b.Iterator(nil)
for it.Next() == chunkenc.ValFloat {
t, v := it.At()
a.Append(t, v)
}
return xor.Bytes()
}
func (b boundedChunk) Iterator(iterator chunkenc.Iterator) chunkenc.Iterator {
it := b.Chunk.Iterator(iterator)
func (b boundedIterable) Iterator(iterator chunkenc.Iterator) chunkenc.Iterator {
it := b.chunk.Iterator(iterator)
if it == nil {
panic("iterator shouldn't be nil")
}
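The ChunkOrIterable, mergedOOOChunks and boundedIterable changes in this file all rely on a chunkenc.Chunk already satisfying chunkenc.Iterable (an Iterator factory), so wrappers only need to expose iteration rather than Bytes/Appender/NumSamples; the actual merging is delegated to storage.ChainSampleIteratorFromIterables and is not shown here. A small sketch of consuming a value through the narrower Iterable interface, assuming the chunkenc package as used in this diff:
package main

import (
	"fmt"

	"github.com/prometheus/prometheus/tsdb/chunkenc"
)

func main() {
	chk := chunkenc.NewXORChunk()
	app, err := chk.Appender()
	if err != nil {
		panic(err)
	}
	for ts := int64(0); ts < 5; ts++ {
		app.Append(ts, float64(ts))
	}

	// A chunk is also an Iterable: callers that only need samples can hold the
	// narrower interface, which is what mergedOOOChunks and boundedIterable do.
	var iterable chunkenc.Iterable = chk
	it := iterable.Iterator(nil)
	for it.Next() == chunkenc.ValFloat {
		t, v := it.At()
		fmt.Println(t, v)
	}
}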

View file

@ -129,21 +129,10 @@ func TestBoundedChunk(t *testing.T) {
}
for _, tc := range tests {
t.Run(fmt.Sprintf("name=%s", tc.name), func(t *testing.T) {
chunk := boundedChunk{tc.inputChunk, tc.inputMinT, tc.inputMaxT}
// Testing Bytes()
expChunk := chunkenc.NewXORChunk()
if tc.inputChunk.NumSamples() > 0 {
app, err := expChunk.Appender()
require.NoError(t, err)
for ts := tc.inputMinT; ts <= tc.inputMaxT; ts++ {
app.Append(ts, float64(ts))
}
}
require.Equal(t, expChunk.Bytes(), chunk.Bytes())
iterable := boundedIterable{tc.inputChunk, tc.inputMinT, tc.inputMaxT}
var samples []sample
it := chunk.Iterator(nil)
it := iterable.Iterator(nil)
if tc.initialSeek != 0 {
// Testing Seek()

View file

@ -22,6 +22,7 @@ import (
"os"
"path"
"path/filepath"
"reflect"
"sort"
"strconv"
"strings"
@ -190,6 +191,10 @@ func readTestWAL(t testing.TB, dir string) (recs []interface{}) {
meta, err := dec.Metadata(rec, nil)
require.NoError(t, err)
recs = append(recs, meta)
case record.Exemplars:
exemplars, err := dec.Exemplars(rec, nil)
require.NoError(t, err)
recs = append(recs, exemplars)
default:
t.Fatalf("unknown record type")
}
@ -1835,16 +1840,16 @@ func TestGCChunkAccess(t *testing.T) {
cr, err := h.chunksRange(0, 1500, nil)
require.NoError(t, err)
_, err = cr.Chunk(chunks[0])
_, _, err = cr.ChunkOrIterable(chunks[0])
require.NoError(t, err)
_, err = cr.Chunk(chunks[1])
_, _, err = cr.ChunkOrIterable(chunks[1])
require.NoError(t, err)
require.NoError(t, h.Truncate(1500)) // Remove a chunk.
_, err = cr.Chunk(chunks[0])
_, _, err = cr.ChunkOrIterable(chunks[0])
require.Equal(t, storage.ErrNotFound, err)
_, err = cr.Chunk(chunks[1])
_, _, err = cr.ChunkOrIterable(chunks[1])
require.NoError(t, err)
}
@ -1894,18 +1899,18 @@ func TestGCSeriesAccess(t *testing.T) {
cr, err := h.chunksRange(0, 2000, nil)
require.NoError(t, err)
_, err = cr.Chunk(chunks[0])
_, _, err = cr.ChunkOrIterable(chunks[0])
require.NoError(t, err)
_, err = cr.Chunk(chunks[1])
_, _, err = cr.ChunkOrIterable(chunks[1])
require.NoError(t, err)
require.NoError(t, h.Truncate(2000)) // Remove the series.
require.Equal(t, (*memSeries)(nil), h.series.getByID(1))
_, err = cr.Chunk(chunks[0])
_, _, err = cr.ChunkOrIterable(chunks[0])
require.Equal(t, storage.ErrNotFound, err)
_, err = cr.Chunk(chunks[1])
_, _, err = cr.ChunkOrIterable(chunks[1])
require.Equal(t, storage.ErrNotFound, err)
}
@ -3488,7 +3493,6 @@ func TestHistogramInWALAndMmapChunk(t *testing.T) {
hists = tsdbutil.GenerateTestHistograms(numHistograms)
}
for _, h := range hists {
h.Count *= 2
h.NegativeSpans = h.PositiveSpans
h.NegativeBuckets = h.PositiveBuckets
_, err := app.AppendHistogram(0, s1, ts, h, nil)
@ -3511,7 +3515,6 @@ func TestHistogramInWALAndMmapChunk(t *testing.T) {
hists = tsdbutil.GenerateTestFloatHistograms(numHistograms)
}
for _, h := range hists {
h.Count *= 2
h.NegativeSpans = h.PositiveSpans
h.NegativeBuckets = h.PositiveBuckets
_, err := app.AppendHistogram(0, s1, ts, nil, h)
@ -3553,7 +3556,6 @@ func TestHistogramInWALAndMmapChunk(t *testing.T) {
}
for _, h := range hists {
ts++
h.Count *= 2
h.NegativeSpans = h.PositiveSpans
h.NegativeBuckets = h.PositiveBuckets
_, err := app.AppendHistogram(0, s2, ts, h, nil)
@ -3590,7 +3592,6 @@ func TestHistogramInWALAndMmapChunk(t *testing.T) {
}
for _, h := range hists {
ts++
h.Count *= 2
h.NegativeSpans = h.PositiveSpans
h.NegativeBuckets = h.PositiveBuckets
_, err := app.AppendHistogram(0, s2, ts, nil, h)
@ -4967,170 +4968,6 @@ func TestReplayAfterMmapReplayError(t *testing.T) {
require.NoError(t, h.Close())
}
func TestHistogramValidation(t *testing.T) {
tests := map[string]struct {
h *histogram.Histogram
errMsg string
skipFloat bool
}{
"valid histogram": {
h: tsdbutil.GenerateTestHistograms(1)[0],
},
"valid histogram that has its Count (4) higher than the actual total of buckets (2 + 1)": {
// This case is possible if NaN values (which do not fall into any bucket) are observed.
h: &histogram.Histogram{
ZeroCount: 2,
Count: 4,
Sum: math.NaN(),
PositiveSpans: []histogram.Span{{Offset: 0, Length: 1}},
PositiveBuckets: []int64{1},
},
},
"rejects histogram that has too few negative buckets": {
h: &histogram.Histogram{
NegativeSpans: []histogram.Span{{Offset: 0, Length: 1}},
NegativeBuckets: []int64{},
},
errMsg: `negative side: spans need 1 buckets, have 0 buckets: histogram spans specify different number of buckets than provided`,
},
"rejects histogram that has too few positive buckets": {
h: &histogram.Histogram{
PositiveSpans: []histogram.Span{{Offset: 0, Length: 1}},
PositiveBuckets: []int64{},
},
errMsg: `positive side: spans need 1 buckets, have 0 buckets: histogram spans specify different number of buckets than provided`,
},
"rejects histogram that has too many negative buckets": {
h: &histogram.Histogram{
NegativeSpans: []histogram.Span{{Offset: 0, Length: 1}},
NegativeBuckets: []int64{1, 2},
},
errMsg: `negative side: spans need 1 buckets, have 2 buckets: histogram spans specify different number of buckets than provided`,
},
"rejects histogram that has too many positive buckets": {
h: &histogram.Histogram{
PositiveSpans: []histogram.Span{{Offset: 0, Length: 1}},
PositiveBuckets: []int64{1, 2},
},
errMsg: `positive side: spans need 1 buckets, have 2 buckets: histogram spans specify different number of buckets than provided`,
},
"rejects a histogram that has a negative span with a negative offset": {
h: &histogram.Histogram{
NegativeSpans: []histogram.Span{{Offset: -1, Length: 1}, {Offset: -1, Length: 1}},
NegativeBuckets: []int64{1, 2},
},
errMsg: `negative side: span number 2 with offset -1: histogram has a span whose offset is negative`,
},
"rejects a histogram which has a positive span with a negative offset": {
h: &histogram.Histogram{
PositiveSpans: []histogram.Span{{Offset: -1, Length: 1}, {Offset: -1, Length: 1}},
PositiveBuckets: []int64{1, 2},
},
errMsg: `positive side: span number 2 with offset -1: histogram has a span whose offset is negative`,
},
"rejects a histogram that has a negative bucket with a negative count": {
h: &histogram.Histogram{
NegativeSpans: []histogram.Span{{Offset: -1, Length: 1}},
NegativeBuckets: []int64{-1},
},
errMsg: `negative side: bucket number 1 has observation count of -1: histogram has a bucket whose observation count is negative`,
},
"rejects a histogram that has a positive bucket with a negative count": {
h: &histogram.Histogram{
PositiveSpans: []histogram.Span{{Offset: -1, Length: 1}},
PositiveBuckets: []int64{-1},
},
errMsg: `positive side: bucket number 1 has observation count of -1: histogram has a bucket whose observation count is negative`,
},
"rejects a histogram that has a lower count than count in buckets": {
h: &histogram.Histogram{
Count: 0,
NegativeSpans: []histogram.Span{{Offset: -1, Length: 1}},
PositiveSpans: []histogram.Span{{Offset: -1, Length: 1}},
NegativeBuckets: []int64{1},
PositiveBuckets: []int64{1},
},
errMsg: `2 observations found in buckets, but the Count field is 0: histogram's observation count should be at least the number of observations found in the buckets`,
skipFloat: true,
},
"rejects a histogram that doesn't count the zero bucket in its count": {
h: &histogram.Histogram{
Count: 2,
ZeroCount: 1,
NegativeSpans: []histogram.Span{{Offset: -1, Length: 1}},
PositiveSpans: []histogram.Span{{Offset: -1, Length: 1}},
NegativeBuckets: []int64{1},
PositiveBuckets: []int64{1},
},
errMsg: `3 observations found in buckets, but the Count field is 2: histogram's observation count should be at least the number of observations found in the buckets`,
skipFloat: true,
},
}
for testName, tc := range tests {
t.Run(testName, func(t *testing.T) {
if err := ValidateHistogram(tc.h); tc.errMsg != "" {
require.EqualError(t, err, tc.errMsg)
} else {
require.NoError(t, err)
}
if tc.skipFloat {
return
}
if err := ValidateFloatHistogram(tc.h.ToFloat()); tc.errMsg != "" {
require.EqualError(t, err, tc.errMsg)
} else {
require.NoError(t, err)
}
})
}
}
func BenchmarkHistogramValidation(b *testing.B) {
histograms := generateBigTestHistograms(b.N, 500)
b.ResetTimer()
for _, h := range histograms {
require.NoError(b, ValidateHistogram(h))
}
}
func generateBigTestHistograms(numHistograms, numBuckets int) []*histogram.Histogram {
numSpans := numBuckets / 10
bucketsPerSide := numBuckets / 2
spanLength := uint32(bucketsPerSide / numSpans)
// Given all bucket deltas are 1, sum numHistograms + 1.
observationCount := numBuckets / 2 * (1 + numBuckets)
var histograms []*histogram.Histogram
for i := 0; i < numHistograms; i++ {
h := &histogram.Histogram{
Count: uint64(i + observationCount),
ZeroCount: uint64(i),
ZeroThreshold: 1e-128,
Sum: 18.4 * float64(i+1),
Schema: 2,
NegativeSpans: make([]histogram.Span, numSpans),
PositiveSpans: make([]histogram.Span, numSpans),
NegativeBuckets: make([]int64, bucketsPerSide),
PositiveBuckets: make([]int64, bucketsPerSide),
}
for j := 0; j < numSpans; j++ {
s := histogram.Span{Offset: 1, Length: spanLength}
h.NegativeSpans[j] = s
h.PositiveSpans[j] = s
}
for j := 0; j < bucketsPerSide; j++ {
h.NegativeBuckets[j] = 1
h.PositiveBuckets[j] = 1
}
histograms = append(histograms, h)
}
return histograms
}
func TestOOOAppendWithNoSeries(t *testing.T) {
dir := t.TempDir()
wal, err := wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, wlog.CompressionSnappy)
@ -5471,7 +5308,7 @@ func BenchmarkCuttingHeadHistogramChunks(b *testing.B) {
numSamples = 50000
numBuckets = 100
)
samples := generateBigTestHistograms(numSamples, numBuckets)
samples := histogram.GenerateBigTestHistograms(numSamples, numBuckets)
h, _ := newTestHead(b, DefaultBlockDuration, wlog.CompressionNone, false)
defer func() {
@ -5535,7 +5372,7 @@ func TestCuttingNewHeadChunks(t *testing.T) {
"small histograms": {
numTotalSamples: 240,
histValFunc: func() func(i int) *histogram.Histogram {
hists := generateBigTestHistograms(240, 10)
hists := histogram.GenerateBigTestHistograms(240, 10)
return func(i int) *histogram.Histogram {
return hists[i]
}
@ -5551,7 +5388,7 @@ func TestCuttingNewHeadChunks(t *testing.T) {
"large histograms": {
numTotalSamples: 240,
histValFunc: func() func(i int) *histogram.Histogram {
hists := generateBigTestHistograms(240, 100)
hists := histogram.GenerateBigTestHistograms(240, 100)
return func(i int) *histogram.Histogram {
return hists[i]
}
@ -5560,14 +5397,13 @@ func TestCuttingNewHeadChunks(t *testing.T) {
numSamples int
numBytes int
}{
{30, 696},
{30, 700},
{30, 708},
{30, 693},
{40, 896},
{40, 899},
{40, 896},
{30, 690},
{30, 691},
{30, 692},
{30, 695},
{30, 694},
{30, 693},
},
},
"really large histograms": {
@ -5575,7 +5411,7 @@ func TestCuttingNewHeadChunks(t *testing.T) {
// per chunk.
numTotalSamples: 11,
histValFunc: func() func(i int) *histogram.Histogram {
hists := generateBigTestHistograms(11, 100000)
hists := histogram.GenerateBigTestHistograms(11, 100000)
return func(i int) *histogram.Histogram {
return hists[i]
}
@ -5639,8 +5475,9 @@ func TestCuttingNewHeadChunks(t *testing.T) {
require.Len(t, chkMetas, len(tc.expectedChks))
for i, expected := range tc.expectedChks {
chk, err := chkReader.Chunk(chkMetas[i])
chk, iterable, err := chkReader.ChunkOrIterable(chkMetas[i])
require.NoError(t, err)
require.Nil(t, iterable)
require.Equal(t, expected.numSamples, chk.NumSamples())
require.Len(t, chk.Bytes(), expected.numBytes)
@ -5688,67 +5525,254 @@ func TestHeadDetectsDuplicateSampleAtSizeLimit(t *testing.T) {
storedSampleCount := 0
for _, chunkMeta := range chunks {
chunk, err := chunkReader.Chunk(chunkMeta)
chunk, iterable, err := chunkReader.ChunkOrIterable(chunkMeta)
require.NoError(t, err)
require.Nil(t, iterable)
storedSampleCount += chunk.NumSamples()
}
require.Equal(t, numSamples/2, storedSampleCount)
}
func TestSecondaryHashFunction(t *testing.T) {
dir := t.TempDir()
wal, err := wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, wlog.CompressionNone)
require.NoError(t, err)
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = dir
opts.EnableExemplarStorage = true
opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars)
opts.EnableNativeHistograms.Store(true)
opts.SecondaryHashFunction = func(l labels.Labels) uint32 {
return uint32(l.Len())
func TestWALSampleAndExemplarOrder(t *testing.T) {
lbls := labels.FromStrings("foo", "bar")
testcases := map[string]struct {
appendF func(app storage.Appender, ts int64) (storage.SeriesRef, error)
expectedType reflect.Type
}{
"float sample": {
appendF: func(app storage.Appender, ts int64) (storage.SeriesRef, error) {
return app.Append(0, lbls, ts, 1.0)
},
expectedType: reflect.TypeOf([]record.RefSample{}),
},
"histogram sample": {
appendF: func(app storage.Appender, ts int64) (storage.SeriesRef, error) {
return app.AppendHistogram(0, lbls, ts, tsdbutil.GenerateTestHistogram(1), nil)
},
expectedType: reflect.TypeOf([]record.RefHistogramSample{}),
},
"float histogram sample": {
appendF: func(app storage.Appender, ts int64) (storage.SeriesRef, error) {
return app.AppendHistogram(0, lbls, ts, nil, tsdbutil.GenerateTestFloatHistogram(1))
},
expectedType: reflect.TypeOf([]record.RefFloatHistogramSample{}),
},
}
h, err := NewHead(nil, nil, wal, nil, opts, nil)
require.NoError(t, err)
for testName, tc := range testcases {
t.Run(testName, func(t *testing.T) {
h, w := newTestHead(t, 1000, wlog.CompressionNone, false)
defer func() {
require.NoError(t, h.Close())
}()
t.Cleanup(func() {
require.NoError(t, h.Close())
})
app := h.Appender(context.Background())
ref, err := tc.appendF(app, 10)
require.NoError(t, err)
app.AppendExemplar(ref, lbls, exemplar.Exemplar{Value: 1.0, Ts: 5})
const seriesCount = 100
const labelsCount = 10
app.Commit()
recs := readTestWAL(t, w.Dir())
require.Len(t, recs, 3)
_, ok := recs[0].([]record.RefSeries)
require.True(t, ok, "expected first record to be a RefSeries")
actualType := reflect.TypeOf(recs[1])
require.Equal(t, tc.expectedType, actualType, "expected second record to be a %s", tc.expectedType)
_, ok = recs[2].([]record.RefExemplar)
require.True(t, ok, "expected third record to be a RefExemplar")
})
}
}
// TestHeadCompactionWhileAppendAndCommitExemplar simulates a use case where
// a series is removed from the head while an exemplar is being appended to it.
// In theory this can happen if the head is compacted at just the right time
// while the series is idle.
// The test cheats a little bit by not appending a sample with the exemplar.
// If you also add a sample and run Truncate in a concurrent goroutine and run
// the test around a million(!) times, you can get
// `unknown HeadSeriesRef when trying to add exemplar: 1` error on push.
// It is likely that running the test for much longer and with more time variations
// would trigger the
// `signal SIGSEGV: segmentation violation code=0x1 addr=0x20 pc=0xbb03d1`
// panic that we have seen in the wild once.
func TestHeadCompactionWhileAppendAndCommitExemplar(t *testing.T) {
h, _ := newTestHead(t, DefaultBlockDuration, wlog.CompressionNone, false)
app := h.Appender(context.Background())
for ix, s := range genSeries(seriesCount, labelsCount, 0, 0) {
_, err := app.Append(0, s.Labels(), int64(100*ix), float64(ix))
require.NoError(t, err)
}
require.NoError(t, app.Commit())
lbls := labels.FromStrings("foo", "bar")
ref, err := app.Append(0, lbls, 1, 1)
require.NoError(t, err)
app.Commit()
// Not adding a sample here to trigger the fault.
app = h.Appender(context.Background())
_, err = app.AppendExemplar(ref, lbls, exemplar.Exemplar{Value: 1, Ts: 20})
require.NoError(t, err)
h.Truncate(10)
app.Commit()
h.Close()
}
checkSecondaryHashes := func(expected int) {
func labelsWithHashCollision() (labels.Labels, labels.Labels) {
// These two series have the same XXHash; thanks to https://github.com/pstibrany/labels_hash_collisions
ls1 := labels.FromStrings("__name__", "metric", "lbl1", "value", "lbl2", "l6CQ5y")
ls2 := labels.FromStrings("__name__", "metric", "lbl1", "value", "lbl2", "v7uDlF")
if ls1.Hash() != ls2.Hash() {
// These ones are the same when using -tags stringlabels
ls1 = labels.FromStrings("__name__", "metric", "lbl", "HFnEaGl")
ls2 = labels.FromStrings("__name__", "metric", "lbl", "RqcXatm")
}
if ls1.Hash() != ls2.Hash() {
panic("This code needs to be updated: find new labels with colliding hash values.")
}
return ls1, ls2
}
// stripeSeriesWithCollidingSeries returns a stripeSeries with two memSeries having the same, colliding, hash.
func stripeSeriesWithCollidingSeries(t *testing.T) (*stripeSeries, *memSeries, *memSeries) {
t.Helper()
lbls1, lbls2 := labelsWithHashCollision()
ms1 := memSeries{
lset: lbls1,
}
ms2 := memSeries{
lset: lbls2,
}
hash := lbls1.Hash()
s := newStripeSeries(1, noopSeriesLifecycleCallback{})
got, created, err := s.getOrSet(hash, lbls1, func() *memSeries {
return &ms1
})
require.NoError(t, err)
require.True(t, created)
require.Same(t, &ms1, got)
// Add a conflicting series
got, created, err = s.getOrSet(hash, lbls2, func() *memSeries {
return &ms2
})
require.NoError(t, err)
require.True(t, created)
require.Same(t, &ms2, got)
return s, &ms1, &ms2
}
func TestStripeSeries_getOrSet(t *testing.T) {
s, ms1, ms2 := stripeSeriesWithCollidingSeries(t)
hash := ms1.lset.Hash()
// Verify that we can get both of the series despite the hash collision
got := s.getByHash(hash, ms1.lset)
require.Same(t, ms1, got)
got = s.getByHash(hash, ms2.lset)
require.Same(t, ms2, got)
}
func TestStripeSeries_gc(t *testing.T) {
s, ms1, ms2 := stripeSeriesWithCollidingSeries(t)
hash := ms1.lset.Hash()
s.gc(0, 0)
// Verify that we can get neither ms1 nor ms2 after gc-ing corresponding series
got := s.getByHash(hash, ms1.lset)
require.Nil(t, got)
got = s.getByHash(hash, ms2.lset)
require.Nil(t, got)
}
func TestSecondaryHashFunction(t *testing.T) {
checkSecondaryHashes := func(t *testing.T, h *Head, labelsCount, expected int) {
reportedHashes := 0
h.ForEachSecondaryHash(func(secondaryHashes []uint32) {
reportedHashes += len(secondaryHashes)
for _, h := range secondaryHashes {
require.Equal(t, uint32(labelsCount), h)
require.Equal(t, labelsCount, int(h))
}
})
require.Equal(t, expected, reportedHashes)
}
checkSecondaryHashes(seriesCount)
testCases := []struct {
name string
series func(*testing.T) []storage.Series
}{
{
name: "without collisions",
series: func(_ *testing.T) []storage.Series {
return genSeries(100, 10, 0, 0)
},
},
{
name: "with collisions",
series: func(t *testing.T) []storage.Series {
// Make a couple of series with colliding label sets
lbls1, lbls2 := labelsWithHashCollision()
series := []storage.Series{
storage.NewListSeries(
lbls1, []chunks.Sample{sample{t: 0, f: rand.Float64()}},
),
storage.NewListSeries(
lbls2, []chunks.Sample{sample{t: 0, f: rand.Float64()}},
),
}
require.Equal(t, series[len(series)-2].Labels().Hash(), series[len(series)-1].Labels().Hash(),
"The two series should have the same label set hash")
return series
},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
dir := t.TempDir()
wal, err := wlog.NewSize(nil, nil, filepath.Join(dir, "wal"), 32768, wlog.CompressionNone)
require.NoError(t, err)
// Truncate head, remove half of the series (because their timestamp is before supplied truncation MinT)
require.NoError(t, h.Truncate(100*(seriesCount/2)))
opts := DefaultHeadOptions()
opts.ChunkRange = 1000
opts.ChunkDirRoot = dir
opts.EnableExemplarStorage = true
opts.MaxExemplars.Store(config.DefaultExemplarsConfig.MaxExemplars)
opts.EnableNativeHistograms.Store(true)
opts.SecondaryHashFunction = func(l labels.Labels) uint32 {
return uint32(l.Len())
}
// There should be 50 reported series now.
checkSecondaryHashes(50)
h, err := NewHead(nil, nil, wal, nil, opts, nil)
require.NoError(t, err)
// Truncate head again, remove all series (because their timestamp is before supplied truncation MinT)
require.NoError(t, h.Truncate(100*seriesCount))
checkSecondaryHashes(0)
t.Cleanup(func() {
require.NoError(t, h.Close())
})
app := h.Appender(context.Background())
series := tc.series(t)
for ix, s := range series {
_, err := app.Append(0, s.Labels(), int64(100*ix), float64(ix))
require.NoError(t, err)
}
require.NoError(t, app.Commit())
labelsCount := series[0].Labels().Len()
checkSecondaryHashes(t, h, labelsCount, len(series))
// Truncate head, remove half of the series (because their timestamp is before supplied truncation MinT)
require.NoError(t, h.Truncate(100*int64(len(series)/2)))
checkSecondaryHashes(t, h, labelsCount, len(series)/2)
// Truncate head again, remove all series (because their timestamp is before supplied truncation MinT)
require.NoError(t, h.Truncate(100*int64(len(series))))
checkSecondaryHashes(t, h, labelsCount, 0)
})
}
}

View file

@ -972,7 +972,7 @@ func decodeSeriesFromChunkSnapshot(d *record.Decoder, b []byte) (csr chunkSnapsh
dec := encoding.Decbuf{B: b}
if flag := dec.Byte(); flag != chunkSnapshotRecordTypeSeries {
return csr, errors.Errorf("invalid record type %x", flag)
return csr, fmt.Errorf("invalid record type %x", flag)
}
csr.ref = chunks.HeadSeriesRef(dec.Be64())
@ -1020,7 +1020,7 @@ func decodeSeriesFromChunkSnapshot(d *record.Decoder, b []byte) (csr chunkSnapsh
err = dec.Err()
if err != nil && len(dec.B) > 0 {
err = errors.Errorf("unexpected %d bytes left in entry", len(dec.B))
err = fmt.Errorf("unexpected %d bytes left in entry", len(dec.B))
}
return
@ -1043,7 +1043,7 @@ func decodeTombstonesSnapshotRecord(b []byte) (tombstones.Reader, error) {
dec := encoding.Decbuf{B: b}
if flag := dec.Byte(); flag != chunkSnapshotRecordTypeTombstones {
return nil, errors.Errorf("invalid record type %x", flag)
return nil, fmt.Errorf("invalid record type %x", flag)
}
tr, err := tombstones.Decode(dec.UvarintBytes())
@ -1256,7 +1256,7 @@ func LastChunkSnapshot(dir string) (string, int, int, error) {
continue
}
if !fi.IsDir() {
return "", 0, 0, errors.Errorf("chunk snapshot %s is not a directory", fi.Name())
return "", 0, 0, fmt.Errorf("chunk snapshot %s is not a directory", fi.Name())
}
splits := strings.Split(fi.Name()[len(chunkSnapshotPrefix):], ".")
@ -1494,7 +1494,7 @@ Outer:
default:
// This is a record type we don't understand. It is either an old format from earlier versions,
// or a new format and the code was rolled back to an old version.
loopErr = errors.Errorf("unsupported snapshot record type 0b%b", rec[0])
loopErr = fmt.Errorf("unsupported snapshot record type 0b%b", rec[0])
break Outer
}
}

View file

@ -28,7 +28,6 @@ import (
"sort"
"unsafe"
"github.com/pkg/errors"
"golang.org/x/exp/slices"
"github.com/prometheus/prometheus/model/labels"
@ -108,8 +107,8 @@ func newCRC32() hash.Hash32 {
type symbolCacheEntry struct {
index uint32
lastValue string
lastValueIndex uint32
lastValue string
}
// Writer implements the IndexWriter interface for the standard
@ -173,7 +172,7 @@ func NewTOCFromByteSlice(bs ByteSlice) (*TOC, error) {
d := encoding.Decbuf{B: b[:len(b)-4]}
if d.Crc32(castagnoliTable) != expCRC {
return nil, errors.Wrap(encoding.ErrInvalidChecksum, "read TOC")
return nil, fmt.Errorf("read TOC: %w", encoding.ErrInvalidChecksum)
}
toc := &TOC{
@ -198,7 +197,7 @@ func NewWriter(ctx context.Context, fn string) (*Writer, error) {
defer df.Close() // Close for platform windows.
if err := os.RemoveAll(fn); err != nil {
return nil, errors.Wrap(err, "remove any existing index at path")
return nil, fmt.Errorf("remove any existing index at path: %w", err)
}
// Main index file we are building.
@ -217,7 +216,7 @@ func NewWriter(ctx context.Context, fn string) (*Writer, error) {
return nil, err
}
if err := df.Sync(); err != nil {
return nil, errors.Wrap(err, "sync dir")
return nil, fmt.Errorf("sync dir: %w", err)
}
iw := &Writer{
@ -289,7 +288,7 @@ func (fw *FileWriter) Write(bufs ...[]byte) error {
// Once we move to compressed/varint representations in those areas, this limitation
// can be lifted.
if fw.pos > 16*math.MaxUint32 {
return errors.Errorf("%q exceeding max size of 64GiB", fw.name)
return fmt.Errorf("%q exceeding max size of 64GiB", fw.name)
}
}
return nil
@ -316,7 +315,7 @@ func (fw *FileWriter) AddPadding(size int) error {
p = uint64(size) - p
if err := fw.Write(make([]byte, p)); err != nil {
return errors.Wrap(err, "add padding")
return fmt.Errorf("add padding: %w", err)
}
return nil
}
@ -354,7 +353,7 @@ func (w *Writer) ensureStage(s indexWriterStage) error {
}
}
if w.stage > s {
return errors.Errorf("invalid stage %q, currently at %q", s, w.stage)
return fmt.Errorf("invalid stage %q, currently at %q", s, w.stage)
}
// Mark start of sections in table of contents.
@ -418,20 +417,20 @@ func (w *Writer) AddSeries(ref storage.SeriesRef, lset labels.Labels, chunks ...
return err
}
if labels.Compare(lset, w.lastSeries) <= 0 {
return errors.Errorf("out-of-order series added with label set %q", lset)
return fmt.Errorf("out-of-order series added with label set %q", lset)
}
if ref < w.lastRef && !w.lastSeries.IsEmpty() {
return errors.Errorf("series with reference greater than %d already added", ref)
return fmt.Errorf("series with reference greater than %d already added", ref)
}
// We add padding to 16 bytes to increase the addressable space we get through 4 byte
// series references.
if err := w.addPadding(16); err != nil {
return errors.Errorf("failed to write padding bytes: %v", err)
return fmt.Errorf("failed to write padding bytes: %v", err)
}
if w.f.pos%16 != 0 {
return errors.Errorf("series write not 16-byte aligned at %d", w.f.pos)
return fmt.Errorf("series write not 16-byte aligned at %d", w.f.pos)
}
w.buf2.Reset()
@ -444,7 +443,7 @@ func (w *Writer) AddSeries(ref storage.SeriesRef, lset labels.Labels, chunks ...
if !ok {
nameIndex, err = w.symbols.ReverseLookup(l.Name)
if err != nil {
return errors.Errorf("symbol entry for %q does not exist, %v", l.Name, err)
return fmt.Errorf("symbol entry for %q does not exist, %v", l.Name, err)
}
}
w.labelNames[l.Name]++
@ -454,12 +453,12 @@ func (w *Writer) AddSeries(ref storage.SeriesRef, lset labels.Labels, chunks ...
if !ok || cacheEntry.lastValue != l.Value {
valueIndex, err = w.symbols.ReverseLookup(l.Value)
if err != nil {
return errors.Errorf("symbol entry for %q does not exist, %v", l.Value, err)
return fmt.Errorf("symbol entry for %q does not exist, %v", l.Value, err)
}
w.symbolCache[l.Name] = symbolCacheEntry{
index: nameIndex,
lastValue: l.Value,
lastValueIndex: valueIndex,
lastValue: l.Value,
}
}
w.buf2.PutUvarint32(valueIndex)
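
For context on the cache fields being reordered here: symbolCacheEntry is keyed by label name and remembers the name's symbol index together with the most recently written value and that value's index, so consecutive series repeating a label value skip ReverseLookup entirely. A simplified sketch of that lookup path, with a toy cache type and reverseLookup standing in for Symbols.ReverseLookup (both are illustrative, not the Writer's actual internals):

```go
type cacheEntry struct {
	nameIndex      uint32
	lastValueIndex uint32
	lastValue      string
}

// symbolIndexes resolves a label name/value pair to symbol table indexes,
// consulting the per-name cache first; reverseLookup is only hit on misses.
func symbolIndexes(cache map[string]cacheEntry, name, value string,
	reverseLookup func(string) (uint32, error)) (nameIdx, valueIdx uint32, err error) {

	e, ok := cache[name]
	nameIdx = e.nameIndex
	if !ok {
		if nameIdx, err = reverseLookup(name); err != nil {
			return 0, 0, err
		}
	}
	valueIdx = e.lastValueIndex
	if !ok || e.lastValue != value {
		if valueIdx, err = reverseLookup(value); err != nil {
			return 0, 0, err
		}
		cache[name] = cacheEntry{nameIndex: nameIdx, lastValueIndex: valueIdx, lastValue: value}
	}
	return nameIdx, valueIdx, nil
}
```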
@ -494,7 +493,7 @@ func (w *Writer) AddSeries(ref storage.SeriesRef, lset labels.Labels, chunks ...
w.buf2.PutHash(w.crc32)
if err := w.write(w.buf1.Get(), w.buf2.Get()); err != nil {
return errors.Wrap(err, "write series data")
return fmt.Errorf("write series data: %w", err)
}
w.lastSeries.CopyFrom(lset)
@ -515,7 +514,7 @@ func (w *Writer) AddSymbol(sym string) error {
return err
}
if w.numSymbols != 0 && sym <= w.lastSymbol {
return errors.Errorf("symbol %q out-of-order", sym)
return fmt.Errorf("symbol %q out-of-order", sym)
}
w.lastSymbol = sym
w.numSymbols++
@ -528,7 +527,7 @@ func (w *Writer) finishSymbols() error {
symbolTableSize := w.f.pos - w.toc.Symbols - 4
// The symbol table's <len> part is 4 bytes. So the total symbol table size must be less than or equal to 2^32-1
if symbolTableSize > math.MaxUint32 {
return errors.Errorf("symbol table size exceeds %d bytes: %d", uint32(math.MaxUint32), symbolTableSize)
return fmt.Errorf("symbol table size exceeds %d bytes: %d", uint32(math.MaxUint32), symbolTableSize)
}
// Write out the length and symbol count.
@ -564,7 +563,7 @@ func (w *Writer) finishSymbols() error {
// Load in the symbol table efficiently for the rest of the index writing.
w.symbols, err = NewSymbols(realByteSlice(w.symbolFile.Bytes()), FormatV2, int(w.toc.Symbols))
if err != nil {
return errors.Wrap(err, "read symbols")
return fmt.Errorf("read symbols: %w", err)
}
return nil
}
@ -661,7 +660,7 @@ func (w *Writer) writeLabelIndex(name string, values []uint32) error {
w.buf1.Reset()
l := w.f.pos - startPos - 4
if l > math.MaxUint32 {
return errors.Errorf("label index size exceeds 4 bytes: %d", l)
return fmt.Errorf("label index size exceeds 4 bytes: %d", l)
}
w.buf1.PutBE32int(int(l))
if err := w.writeAt(w.buf1.Get(), startPos); err != nil {
@ -705,7 +704,7 @@ func (w *Writer) writeLabelIndexesOffsetTable() error {
w.buf1.Reset()
l := w.f.pos - startPos - 4
if l > math.MaxUint32 {
return errors.Errorf("label indexes offset table size exceeds 4 bytes: %d", l)
return fmt.Errorf("label indexes offset table size exceeds 4 bytes: %d", l)
}
w.buf1.PutBE32int(int(l))
if err := w.writeAt(w.buf1.Get(), startPos); err != nil {
@ -786,7 +785,7 @@ func (w *Writer) writePostingsOffsetTable() error {
w.buf1.Reset()
l := w.f.pos - startPos - 4
if l > math.MaxUint32 {
return errors.Errorf("postings offset table size exceeds 4 bytes: %d", l)
return fmt.Errorf("postings offset table size exceeds 4 bytes: %d", l)
}
w.buf1.PutBE32int(int(l))
if err := w.writeAt(w.buf1.Get(), startPos); err != nil {
@ -840,7 +839,7 @@ func (w *Writer) writePostingsToTmpFiles() error {
d.ConsumePadding()
startPos := w.toc.LabelIndices - uint64(d.Len())
if startPos%16 != 0 {
return errors.Errorf("series not 16-byte aligned at %d", startPos)
return fmt.Errorf("series not 16-byte aligned at %d", startPos)
}
offsets = append(offsets, uint32(startPos/16))
// Skip to next series.
@ -924,7 +923,7 @@ func (w *Writer) writePostingsToTmpFiles() error {
// Symbol numbers are in order, so the strings will also be in order.
slices.Sort(values)
for _, v := range values {
value, err := w.symbols.Lookup(w.ctx, v)
value, err := w.symbols.Lookup(v)
if err != nil {
return err
}
@ -965,7 +964,7 @@ func (w *Writer) writePosting(name, value string, offs []uint32) error {
for _, off := range offs {
if off > (1<<32)-1 {
return errors.Errorf("series offset %d exceeds 4 bytes", off)
return fmt.Errorf("series offset %d exceeds 4 bytes", off)
}
w.buf1.PutBE32(off)
}
@ -974,7 +973,7 @@ func (w *Writer) writePosting(name, value string, offs []uint32) error {
l := w.buf1.Len()
// We convert to uint to make code compile on 32-bit systems, as math.MaxUint32 doesn't fit into int there.
if uint(l) > math.MaxUint32 {
return errors.Errorf("posting size exceeds 4 bytes: %d", l)
return fmt.Errorf("posting size exceeds 4 bytes: %d", l)
}
w.buf2.PutBE32int(l)
w.buf1.PutHash(w.crc32)
@ -1001,7 +1000,7 @@ func (w *Writer) writePostings() error {
return err
}
if uint64(n) != w.fP.pos {
return errors.Errorf("wrote %d bytes to posting temporary file, but only read back %d", w.fP.pos, n)
return fmt.Errorf("wrote %d bytes to posting temporary file, but only read back %d", w.fP.pos, n)
}
w.f.pos += uint64(n)
@ -1154,26 +1153,26 @@ func newReader(b ByteSlice, c io.Closer, cacheProvider ReaderCacheProvider) (*Re
// Verify header.
if r.b.Len() < HeaderLen {
return nil, errors.Wrap(encoding.ErrInvalidSize, "index header")
return nil, fmt.Errorf("index header: %w", encoding.ErrInvalidSize)
}
if m := binary.BigEndian.Uint32(r.b.Range(0, 4)); m != MagicIndex {
return nil, errors.Errorf("invalid magic number %x", m)
return nil, fmt.Errorf("invalid magic number %x", m)
}
r.version = int(r.b.Range(4, 5)[0])
if r.version != FormatV1 && r.version != FormatV2 {
return nil, errors.Errorf("unknown index file version %d", r.version)
return nil, fmt.Errorf("unknown index file version %d", r.version)
}
var err error
r.toc, err = NewTOCFromByteSlice(b)
if err != nil {
return nil, errors.Wrap(err, "read TOC")
return nil, fmt.Errorf("read TOC: %w", err)
}
r.symbols, err = NewSymbols(r.b, r.version, int(r.toc.Symbols))
if err != nil {
return nil, errors.Wrap(err, "read symbols")
return nil, fmt.Errorf("read symbols: %w", err)
}
if r.version == FormatV1 {
@ -1188,7 +1187,7 @@ func newReader(b ByteSlice, c io.Closer, cacheProvider ReaderCacheProvider) (*Re
r.postingsV1[string(name)][string(value)] = off
return nil
}); err != nil {
return nil, errors.Wrap(err, "read postings table")
return nil, fmt.Errorf("read postings table: %w", err)
}
} else {
var lastName, lastValue []byte
@ -1216,7 +1215,7 @@ func newReader(b ByteSlice, c io.Closer, cacheProvider ReaderCacheProvider) (*Re
valueCount++
return nil
}); err != nil {
return nil, errors.Wrap(err, "read postings table")
return nil, fmt.Errorf("read postings table: %w", err)
}
if lastName != nil {
r.postings[string(lastName)] = append(r.postings[string(lastName)], postingOffset{value: string(lastValue), off: lastOff})
@ -1236,7 +1235,7 @@ func newReader(b ByteSlice, c io.Closer, cacheProvider ReaderCacheProvider) (*Re
}
off, err := r.symbols.ReverseLookup(k)
if err != nil {
return nil, errors.Wrap(err, "reverse symbol lookup")
return nil, fmt.Errorf("reverse symbol lookup: %w", err)
}
r.nameSymbols[off] = k
}
@ -1271,7 +1270,7 @@ func (r *Reader) PostingsRanges() (map[labels.Label]Range, error) {
}
return nil
}); err != nil {
return nil, errors.Wrap(err, "read postings table")
return nil, fmt.Errorf("read postings table: %w", err)
}
return m, nil
}
@ -1314,21 +1313,18 @@ func NewSymbols(bs ByteSlice, version, off int) (*Symbols, error) {
return s, nil
}
func (s Symbols) Lookup(ctx context.Context, o uint32) (string, error) {
func (s Symbols) Lookup(o uint32) (string, error) {
d := encoding.Decbuf{
B: s.bs.Range(0, s.bs.Len()),
}
if s.version == FormatV2 {
if int(o) >= s.seen {
return "", errors.Errorf("unknown symbol offset %d", o)
return "", fmt.Errorf("unknown symbol offset %d", o)
}
d.Skip(s.offsets[int(o/symbolFactor)])
// Walk until we find the one we want.
for i := o - (o / symbolFactor * symbolFactor); i > 0; i-- {
if ctx.Err() != nil {
return "", ctx.Err()
}
d.UvarintBytes()
}
} else {
@ -1343,7 +1339,7 @@ func (s Symbols) Lookup(ctx context.Context, o uint32) (string, error) {
func (s Symbols) ReverseLookup(sym string) (uint32, error) {
if len(s.offsets) == 0 {
return 0, errors.Errorf("unknown symbol %q - no symbols", sym)
return 0, fmt.Errorf("unknown symbol %q - no symbols", sym)
}
i := sort.Search(len(s.offsets), func(i int) bool {
// Any decoding errors here will be lost, however
@ -1376,7 +1372,7 @@ func (s Symbols) ReverseLookup(sym string) (uint32, error) {
return 0, d.Err()
}
if lastSymbol != sym {
return 0, errors.Errorf("unknown symbol %q", sym)
return 0, fmt.Errorf("unknown symbol %q", sym)
}
if s.version == FormatV2 {
return uint32(res), nil
@ -1435,7 +1431,7 @@ func ReadPostingsOffsetTable(bs ByteSlice, off uint64, f func(name, value []byte
offsetPos := startLen - d.Len()
if keyCount := d.Uvarint(); keyCount != 2 {
return errors.Errorf("unexpected number of keys for postings offset table %d", keyCount)
return fmt.Errorf("unexpected number of keys for postings offset table %d", keyCount)
}
name := d.UvarintBytes()
value := d.UvarintBytes()
@ -1460,7 +1456,7 @@ func (r *Reader) lookupSymbol(ctx context.Context, o uint32) (string, error) {
if s, ok := r.nameSymbols[o]; ok {
return s, nil
}
return r.symbols.Lookup(ctx, o)
return r.symbols.Lookup(o)
}
// Symbols returns an iterator over the symbols that exist within the index.
@ -1490,7 +1486,7 @@ func (r *Reader) SortedLabelValues(ctx context.Context, name string, matchers ..
// TODO(replay): Support filtering by matchers.
func (r *Reader) LabelValues(ctx context.Context, name string, matchers ...*labels.Matcher) ([]string, error) {
if len(matchers) > 0 {
return nil, errors.Errorf("matchers parameter is not implemented: %+v", matchers)
return nil, fmt.Errorf("matchers parameter is not implemented: %+v", matchers)
}
if r.version == FormatV1 {
@ -1538,7 +1534,7 @@ func (r *Reader) LabelValues(ctx context.Context, name string, matchers ...*labe
d.Uvarint64() // Offset.
}
if d.Err() != nil {
return nil, errors.Wrap(d.Err(), "get postings offset entry")
return nil, fmt.Errorf("get postings offset entry: %w", d.Err())
}
return values, ctx.Err()
@ -1564,12 +1560,12 @@ func (r *Reader) LabelNamesFor(ctx context.Context, ids ...storage.SeriesRef) ([
d := encoding.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable)
buf := d.Get()
if d.Err() != nil {
return nil, errors.Wrap(d.Err(), "get buffer for series")
return nil, fmt.Errorf("get buffer for series: %w", d.Err())
}
offsets, err := r.dec.LabelNamesOffsetsFor(buf)
if err != nil {
return nil, errors.Wrap(err, "get label name offsets")
return nil, fmt.Errorf("get label name offsets: %w", err)
}
for _, off := range offsets {
offsetsMap[off] = struct{}{}
@ -1581,7 +1577,7 @@ func (r *Reader) LabelNamesFor(ctx context.Context, ids ...storage.SeriesRef) ([
for off := range offsetsMap {
name, err := r.lookupSymbol(ctx, off)
if err != nil {
return nil, errors.Wrap(err, "lookup symbol in LabelNamesFor")
return nil, fmt.Errorf("lookup symbol in LabelNamesFor: %w", err)
}
names = append(names, name)
}
@ -1602,7 +1598,7 @@ func (r *Reader) LabelValueFor(ctx context.Context, id storage.SeriesRef, label
d := encoding.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable)
buf := d.Get()
if d.Err() != nil {
return "", errors.Wrap(d.Err(), "label values for")
return "", fmt.Errorf("label values for: %w", d.Err())
}
value, err := r.dec.LabelValueFor(ctx, buf, label)
@ -1629,7 +1625,11 @@ func (r *Reader) Series(id storage.SeriesRef, builder *labels.ScratchBuilder, ch
if d.Err() != nil {
return d.Err()
}
return errors.Wrap(r.dec.Series(d.Get(), builder, chks), "read series")
err := r.dec.Series(d.Get(), builder, chks)
if err != nil {
return fmt.Errorf("read series: %w", err)
}
return nil
}
func (r *Reader) Postings(ctx context.Context, name string, values ...string) (Postings, error) {
@ -1648,7 +1648,7 @@ func (r *Reader) Postings(ctx context.Context, name string, values ...string) (P
d := encoding.NewDecbufAt(r.b, int(postingsOff), castagnoliTable)
_, p, err := r.dec.Postings(d.Get())
if err != nil {
return nil, errors.Wrap(err, "decode postings")
return nil, fmt.Errorf("decode postings: %w", err)
}
res = append(res, p)
}
@ -1710,7 +1710,7 @@ func (r *Reader) Postings(ctx context.Context, name string, values ...string) (P
d2 := encoding.NewDecbufAt(r.b, int(postingsOff), castagnoliTable)
_, p, err := r.dec.Postings(d2.Get())
if err != nil {
return nil, errors.Wrap(err, "decode postings")
return nil, fmt.Errorf("decode postings: %w", err)
}
res = append(res, p)
}
@ -1726,10 +1726,10 @@ func (r *Reader) Postings(ctx context.Context, name string, values ...string) (P
}
}
if d.Err() != nil {
return nil, errors.Wrap(d.Err(), "get postings offset entry")
return nil, fmt.Errorf("get postings offset entry: %w", d.Err())
}
if ctx.Err() != nil {
return nil, errors.Wrap(ctx.Err(), "get postings offset entry")
return nil, fmt.Errorf("get postings offset entry: %w", ctx.Err())
}
}
@ -1773,7 +1773,7 @@ func (r *Reader) ShardedPostings(p Postings, shardIndex, shardCount uint64) Post
// Get the series labels (no chunks).
err := r.Series(id, &bufLbls, nil)
if err != nil {
return ErrPostings(errors.Errorf("series %d not found", id))
return ErrPostings(fmt.Errorf("series %d not found", id))
}
hash = labels.StableHash(bufLbls.Labels())
@ -1802,7 +1802,7 @@ func (r *Reader) Size() int64 {
// TODO(twilkie) implement support for matchers.
func (r *Reader) LabelNames(_ context.Context, matchers ...*labels.Matcher) ([]string, error) {
if len(matchers) > 0 {
return nil, errors.Errorf("matchers parameter is not implemented: %+v", matchers)
return nil, fmt.Errorf("matchers parameter is not implemented: %+v", matchers)
}
labelNames := make([]string, 0, len(r.postings))
@ -1873,7 +1873,7 @@ func (dec *Decoder) LabelNamesOffsetsFor(b []byte) ([]uint32, error) {
_ = d.Uvarint() // skip the label value
if d.Err() != nil {
return nil, errors.Wrap(d.Err(), "read series label offsets")
return nil, fmt.Errorf("read series label offsets: %w", d.Err())
}
}
@ -1890,18 +1890,18 @@ func (dec *Decoder) LabelValueFor(ctx context.Context, b []byte, label string) (
lvo := uint32(d.Uvarint())
if d.Err() != nil {
return "", errors.Wrap(d.Err(), "read series label offsets")
return "", fmt.Errorf("read series label offsets: %w", d.Err())
}
ln, err := dec.LookupSymbol(ctx, lno)
if err != nil {
return "", errors.Wrap(err, "lookup label name")
return "", fmt.Errorf("lookup label name: %w", err)
}
if ln == label {
lv, err := dec.LookupSymbol(ctx, lvo)
if err != nil {
return "", errors.Wrap(err, "lookup label value")
return "", fmt.Errorf("lookup label value: %w", err)
}
return lv, nil
@ -1928,16 +1928,16 @@ func (dec *Decoder) Series(b []byte, builder *labels.ScratchBuilder, chks *[]chu
lvo := uint32(d.Uvarint())
if d.Err() != nil {
return errors.Wrap(d.Err(), "read series label offsets")
return fmt.Errorf("read series label offsets: %w", d.Err())
}
ln, err := dec.LookupSymbol(context.TODO(), lno)
if err != nil {
return errors.Wrap(err, "lookup label name")
return fmt.Errorf("lookup label name: %w", err)
}
lv, err := dec.LookupSymbol(context.TODO(), lvo)
if err != nil {
return errors.Wrap(err, "lookup label value")
return fmt.Errorf("lookup label value: %w", err)
}
builder.Add(ln, lv)
@ -1974,7 +1974,7 @@ func (dec *Decoder) Series(b []byte, builder *labels.ScratchBuilder, chks *[]chu
t0 = maxt
if d.Err() != nil {
return errors.Wrapf(d.Err(), "read meta for chunk %d", i)
return fmt.Errorf("read meta for chunk %d: %w", i, d.Err())
}
*chks = append(*chks, chunks.Meta{

View file

@ -15,6 +15,7 @@ package index
import (
"context"
"errors"
"fmt"
"hash/crc32"
"math/rand"
@ -23,7 +24,6 @@ import (
"sort"
"testing"
"github.com/pkg/errors"
"github.com/stretchr/testify/require"
"github.com/prometheus/prometheus/model/labels"
@ -66,7 +66,7 @@ func (m mockIndex) Symbols() (map[string]struct{}, error) {
func (m mockIndex) AddSeries(ref storage.SeriesRef, l labels.Labels, chunks ...chunks.Meta) error {
if _, ok := m.series[ref]; ok {
return errors.Errorf("series with reference %d already added", ref)
return fmt.Errorf("series with reference %d already added", ref)
}
l.Range(func(lbl labels.Label) {
m.symbols[lbl.Name] = struct{}{}
@ -115,7 +115,7 @@ func (m mockIndex) Postings(ctx context.Context, name string, values ...string)
func (m mockIndex) SortedPostings(p Postings) Postings {
ep, err := ExpandPostings(p)
if err != nil {
return ErrPostings(errors.Wrap(err, "expand postings"))
return ErrPostings(fmt.Errorf("expand postings: %w", err))
}
sort.Slice(ep, func(i, j int) bool {
@ -576,7 +576,6 @@ func TestNewFileReaderErrorNoOpenFiles(t *testing.T) {
}
func TestSymbols(t *testing.T) {
ctx := context.Background()
buf := encoding.Encbuf{}
// Add prefix to the buffer to simulate symbols as part of larger buffer.
@ -599,11 +598,11 @@ func TestSymbols(t *testing.T) {
require.Equal(t, 32, s.Size())
for i := 99; i >= 0; i-- {
s, err := s.Lookup(ctx, uint32(i))
s, err := s.Lookup(uint32(i))
require.NoError(t, err)
require.Equal(t, string(rune(i)), s)
}
_, err = s.Lookup(ctx, 100)
_, err = s.Lookup(100)
require.Error(t, err)
for i := 99; i >= 0; i-- {

View file

@ -17,12 +17,12 @@ import (
"container/heap"
"context"
"encoding/binary"
"fmt"
"runtime"
"sort"
"strings"
"sync"
"github.com/pkg/errors"
"golang.org/x/exp/slices"
"github.com/prometheus/prometheus/model/labels"
@ -949,7 +949,7 @@ func (h *postingsWithIndexHeap) next() error {
}
if err := pi.p.Err(); err != nil {
return errors.Wrapf(err, "postings %d", pi.index)
return fmt.Errorf("postings %d: %w", pi.index, err)
}
h.popIndex()
return nil

View file

@ -17,13 +17,13 @@ import (
"container/heap"
"context"
"encoding/binary"
"errors"
"fmt"
"math/rand"
"sort"
"strconv"
"testing"
"github.com/pkg/errors"
"github.com/stretchr/testify/require"
"github.com/prometheus/prometheus/model/labels"

View file

@ -17,7 +17,10 @@ import (
"fmt"
"sort"
"github.com/oklog/ulid"
"github.com/prometheus/prometheus/tsdb/chunkenc"
"github.com/prometheus/prometheus/tsdb/chunks"
"github.com/prometheus/prometheus/tsdb/tombstones"
)
@ -111,22 +114,27 @@ type OOORangeHead struct {
// the timerange of the query and having preexisting pointers to the first
// and last timestamp help with that.
mint, maxt int64
isoState *oooIsolationState
}
func NewOOORangeHead(head *Head, mint, maxt int64) *OOORangeHead {
func NewOOORangeHead(head *Head, mint, maxt int64, minRef chunks.ChunkDiskMapperRef) *OOORangeHead {
isoState := head.oooIso.TrackReadAfter(minRef)
return &OOORangeHead{
head: head,
mint: mint,
maxt: maxt,
head: head,
mint: mint,
maxt: maxt,
isoState: isoState,
}
}
func (oh *OOORangeHead) Index() (IndexReader, error) {
return NewOOOHeadIndexReader(oh.head, oh.mint, oh.maxt), nil
return NewOOOHeadIndexReader(oh.head, oh.mint, oh.maxt, oh.isoState.minRef), nil
}
func (oh *OOORangeHead) Chunks() (ChunkReader, error) {
return NewOOOHeadChunkReader(oh.head, oh.mint, oh.maxt), nil
return NewOOOHeadChunkReader(oh.head, oh.mint, oh.maxt, oh.isoState), nil
}
func (oh *OOORangeHead) Tombstones() (tombstones.Reader, error) {
@ -135,13 +143,13 @@ func (oh *OOORangeHead) Tombstones() (tombstones.Reader, error) {
return tombstones.NewMemTombstones(), nil
}
var oooRangeHeadULID = ulid.MustParse("0000000000XXXX000RANGEHEAD")
func (oh *OOORangeHead) Meta() BlockMeta {
var id [16]byte
copy(id[:], "____ooo_head____")
return BlockMeta{
MinTime: oh.mint,
MaxTime: oh.maxt,
ULID: id,
ULID: oooRangeHeadULID,
Stats: BlockStats{
NumSeries: oh.head.NumSeries(),
},
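
A side note on the Meta() change above: rather than copying ASCII bytes into a [16]byte, the head readers now use fixed, parseable ULIDs. The sketch below only shows what the constant gives you, using the github.com/oklog/ulid package this file already imports; the motivation (a stable, readable block ID when printed) is an assumption, not spelled out in the diff.

```go
package main

import (
	"fmt"

	"github.com/oklog/ulid"
)

func main() {
	// MustParse panics on an invalid string, so a malformed constant fails
	// fast, and String() round-trips to the same readable form.
	id := ulid.MustParse("0000000000XXXX000RANGEHEAD")
	fmt.Println(id.String()) // 0000000000XXXX000RANGEHEAD
}
```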

View file

@ -18,6 +18,7 @@ import (
"errors"
"math"
"github.com/oklog/ulid"
"golang.org/x/exp/slices"
"github.com/prometheus/prometheus/model/labels"
@ -37,26 +38,29 @@ var _ IndexReader = &OOOHeadIndexReader{}
// decided to do this to avoid code duplication.
// The only methods that change are the ones about getting Series and Postings.
type OOOHeadIndexReader struct {
*headIndexReader // A reference to the headIndexReader so we can reuse as many interface implementation as possible.
*headIndexReader // A reference to the headIndexReader so we can reuse as many interface implementation as possible.
lastGarbageCollectedMmapRef chunks.ChunkDiskMapperRef
}
func NewOOOHeadIndexReader(head *Head, mint, maxt int64) *OOOHeadIndexReader {
func NewOOOHeadIndexReader(head *Head, mint, maxt int64, lastGarbageCollectedMmapRef chunks.ChunkDiskMapperRef) *OOOHeadIndexReader {
hr := &headIndexReader{
head: head,
mint: mint,
maxt: maxt,
}
return &OOOHeadIndexReader{hr}
return &OOOHeadIndexReader{hr, lastGarbageCollectedMmapRef}
}
func (oh *OOOHeadIndexReader) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error {
return oh.series(ref, builder, chks, 0)
return oh.series(ref, builder, chks, oh.lastGarbageCollectedMmapRef, 0)
}
// The passed lastMmapRef tells upto what max m-map chunk that we can consider.
// If it is 0, it means all chunks need to be considered.
// If it is non-0, then the oooHeadChunk must not be considered.
func (oh *OOOHeadIndexReader) series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta, lastMmapRef chunks.ChunkDiskMapperRef) error {
// lastGarbageCollectedMmapRef gives the last mmap chunk that may be being garbage collected and so
// any chunk at or before this ref will not be considered. 0 disables this check.
//
// maxMmapRef tells up to which m-map chunk we can consider. If it is non-0, then
// the oooHeadChunk will not be considered.
func (oh *OOOHeadIndexReader) series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta, lastGarbageCollectedMmapRef, maxMmapRef chunks.ChunkDiskMapperRef) error {
s := oh.head.series.getByID(chunks.HeadSeriesRef(ref))
if s == nil {
@ -111,14 +115,14 @@ func (oh *OOOHeadIndexReader) series(ref storage.SeriesRef, builder *labels.Scra
// so we can set the correct markers.
if s.ooo.oooHeadChunk != nil {
c := s.ooo.oooHeadChunk
if c.OverlapsClosedInterval(oh.mint, oh.maxt) && lastMmapRef == 0 {
if c.OverlapsClosedInterval(oh.mint, oh.maxt) && maxMmapRef == 0 {
ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(len(s.ooo.oooMmappedChunks))))
addChunk(c.minTime, c.maxTime, ref)
}
}
for i := len(s.ooo.oooMmappedChunks) - 1; i >= 0; i-- {
c := s.ooo.oooMmappedChunks[i]
if c.OverlapsClosedInterval(oh.mint, oh.maxt) && (lastMmapRef == 0 || lastMmapRef.GreaterThanOrEqualTo(c.ref)) {
if c.OverlapsClosedInterval(oh.mint, oh.maxt) && (maxMmapRef == 0 || maxMmapRef.GreaterThanOrEqualTo(c.ref)) && (lastGarbageCollectedMmapRef == 0 || c.ref.GreaterThan(lastGarbageCollectedMmapRef)) {
ref := chunks.ChunkRef(chunks.NewHeadChunkRef(s.ref, s.oooHeadChunkID(i)))
addChunk(c.minTime, c.maxTime, ref)
}
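
The inclusion test for m-mapped OOO chunks now has two independent guards: maxMmapRef caps how far forward to look (used by the compaction head), while lastGarbageCollectedMmapRef cuts off chunks that may already be garbage collected. A toy restatement of the predicate, with plain uint64s standing in for chunks.ChunkDiskMapperRef:

```go
// includeMmappedChunk mirrors the condition above; a zero value disables
// either bound, matching the "0 disables this check" comments.
func includeMmappedChunk(ref, lastGCedRef, maxRef uint64, overlapsQueryRange bool) bool {
	return overlapsQueryRange &&
		(maxRef == 0 || ref <= maxRef) && // never read past maxMmapRef
		(lastGCedRef == 0 || ref > lastGCedRef) // skip refs at or below the GC watermark
}
```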
@ -237,46 +241,51 @@ func (oh *OOOHeadIndexReader) Postings(ctx context.Context, name string, values
type OOOHeadChunkReader struct {
head *Head
mint, maxt int64
isoState *oooIsolationState
}
func NewOOOHeadChunkReader(head *Head, mint, maxt int64) *OOOHeadChunkReader {
func NewOOOHeadChunkReader(head *Head, mint, maxt int64, isoState *oooIsolationState) *OOOHeadChunkReader {
return &OOOHeadChunkReader{
head: head,
mint: mint,
maxt: maxt,
head: head,
mint: mint,
maxt: maxt,
isoState: isoState,
}
}
func (cr OOOHeadChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
func (cr OOOHeadChunkReader) ChunkOrIterable(meta chunks.Meta) (chunkenc.Chunk, chunkenc.Iterable, error) {
sid, _ := chunks.HeadChunkRef(meta.Ref).Unpack()
s := cr.head.series.getByID(sid)
// This means that the series has been garbage collected.
if s == nil {
return nil, storage.ErrNotFound
return nil, nil, storage.ErrNotFound
}
s.Lock()
if s.ooo == nil {
// There is no OOO data for this series.
s.Unlock()
return nil, storage.ErrNotFound
return nil, nil, storage.ErrNotFound
}
c, err := s.oooMergedChunk(meta, cr.head.chunkDiskMapper, cr.mint, cr.maxt)
mc, err := s.oooMergedChunks(meta, cr.head.chunkDiskMapper, cr.mint, cr.maxt)
s.Unlock()
if err != nil {
return nil, err
return nil, nil, err
}
// This means that the query range did not overlap with the requested chunk.
if len(c.chunks) == 0 {
return nil, storage.ErrNotFound
if len(mc.chunkIterables) == 0 {
return nil, nil, storage.ErrNotFound
}
return c, nil
return nil, mc, nil
}
func (cr OOOHeadChunkReader) Close() error {
if cr.isoState != nil {
cr.isoState.Close()
}
return nil
}
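
ChunkOrIterable replaces Chunk on the chunk readers; on success exactly one of the two return values is expected to be non-nil. Below is a hedged caller-side sketch (countSamples is illustrative, not part of the diff; the real consumer is the populate iterator in querier.go later in this commit), assuming the post-change ChunkReader interface:

```go
package example

import (
	"github.com/prometheus/prometheus/tsdb"
	"github.com/prometheus/prometheus/tsdb/chunkenc"
	"github.com/prometheus/prometheus/tsdb/chunks"
)

// countSamples shows the new contract: use the chunk if it is non-nil,
// otherwise fall back to the iterable.
func countSamples(cr tsdb.ChunkReader, meta chunks.Meta) (int, error) {
	chk, iterable, err := cr.ChunkOrIterable(meta)
	if err != nil {
		return 0, err
	}
	var it chunkenc.Iterator
	if chk != nil {
		it = chk.Iterator(nil)
	} else {
		it = iterable.Iterator(nil)
	}
	n := 0
	for it.Next() != chunkenc.ValNone {
		n++
	}
	return n, it.Err()
}
```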
@ -311,7 +320,7 @@ func NewOOOCompactionHead(ctx context.Context, head *Head) (*OOOCompactionHead,
ch.lastWBLFile = lastWBLFile
}
ch.oooIR = NewOOOHeadIndexReader(head, math.MinInt64, math.MaxInt64)
ch.oooIR = NewOOOHeadIndexReader(head, math.MinInt64, math.MaxInt64, 0)
n, v := index.AllPostingsKey()
// TODO: verify this gets only ooo samples.
@ -370,20 +379,20 @@ func (ch *OOOCompactionHead) Index() (IndexReader, error) {
}
func (ch *OOOCompactionHead) Chunks() (ChunkReader, error) {
return NewOOOHeadChunkReader(ch.oooIR.head, ch.oooIR.mint, ch.oooIR.maxt), nil
return NewOOOHeadChunkReader(ch.oooIR.head, ch.oooIR.mint, ch.oooIR.maxt, nil), nil
}
func (ch *OOOCompactionHead) Tombstones() (tombstones.Reader, error) {
return tombstones.NewMemTombstones(), nil
}
var oooCompactionHeadULID = ulid.MustParse("0000000000XX000COMPACTHEAD")
func (ch *OOOCompactionHead) Meta() BlockMeta {
var id [16]byte
copy(id[:], "copy(id[:], \"ooo_compact_head\")")
return BlockMeta{
MinTime: ch.mint,
MaxTime: ch.maxt,
ULID: id,
ULID: oooCompactionHeadULID,
Stats: BlockStats{
NumSeries: uint64(len(ch.postings)),
},
@ -396,7 +405,7 @@ func (ch *OOOCompactionHead) Meta() BlockMeta {
// Only the methods of the BlockReader interface are valid for the cloned OOOCompactionHead.
func (ch *OOOCompactionHead) CloneForTimeRange(mint, maxt int64) *OOOCompactionHead {
return &OOOCompactionHead{
oooIR: NewOOOHeadIndexReader(ch.oooIR.head, mint, maxt),
oooIR: NewOOOHeadIndexReader(ch.oooIR.head, mint, maxt, 0),
lastMmapRef: ch.lastMmapRef,
postings: ch.postings,
chunkRange: ch.chunkRange,
@ -442,7 +451,7 @@ func (ir *OOOCompactionHeadIndexReader) ShardedPostings(p index.Postings, shardI
}
func (ir *OOOCompactionHeadIndexReader) Series(ref storage.SeriesRef, builder *labels.ScratchBuilder, chks *[]chunks.Meta) error {
return ir.ch.oooIR.series(ref, builder, chks, ir.ch.lastMmapRef)
return ir.ch.oooIR.series(ref, builder, chks, 0, ir.ch.lastMmapRef)
}
func (ir *OOOCompactionHeadIndexReader) SortedLabelValues(_ context.Context, name string, matchers ...*labels.Matcher) ([]string, error) {

View file

@ -356,7 +356,7 @@ func TestOOOHeadIndexReader_Series(t *testing.T) {
})
}
ir := NewOOOHeadIndexReader(h, tc.queryMinT, tc.queryMaxT)
ir := NewOOOHeadIndexReader(h, tc.queryMinT, tc.queryMaxT, 0)
var chks []chunks.Meta
var b labels.ScratchBuilder
@ -437,7 +437,7 @@ func TestOOOHeadChunkReader_LabelValues(t *testing.T) {
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
// We first want to test using a head index reader that covers the biggest query interval
oh := NewOOOHeadIndexReader(head, tc.queryMinT, tc.queryMaxT)
oh := NewOOOHeadIndexReader(head, tc.queryMinT, tc.queryMaxT, 0)
matchers := []*labels.Matcher{labels.MustNewMatcher(labels.MatchEqual, "foo", "bar1")}
values, err := oh.LabelValues(ctx, "foo", matchers...)
sort.Strings(values)
@ -484,10 +484,12 @@ func TestOOOHeadChunkReader_Chunk(t *testing.T) {
t.Run("Getting a non existing chunk fails with not found error", func(t *testing.T) {
db := newTestDBWithOpts(t, opts)
cr := NewOOOHeadChunkReader(db.head, 0, 1000)
c, err := cr.Chunk(chunks.Meta{
cr := NewOOOHeadChunkReader(db.head, 0, 1000, nil)
defer cr.Close()
c, iterable, err := cr.ChunkOrIterable(chunks.Meta{
Ref: 0x1000000, Chunk: chunkenc.Chunk(nil), MinTime: 100, MaxTime: 300,
})
require.Nil(t, iterable)
require.Equal(t, err, fmt.Errorf("not found"))
require.Equal(t, c, nil)
})
@ -842,20 +844,22 @@ func TestOOOHeadChunkReader_Chunk(t *testing.T) {
// The Series method is the one that populates the chunk meta OOO
// markers like OOOLastRef. These are then used by the ChunkReader.
ir := NewOOOHeadIndexReader(db.head, tc.queryMinT, tc.queryMaxT)
ir := NewOOOHeadIndexReader(db.head, tc.queryMinT, tc.queryMaxT, 0)
var chks []chunks.Meta
var b labels.ScratchBuilder
err := ir.Series(s1Ref, &b, &chks)
require.NoError(t, err)
require.Equal(t, len(tc.expChunksSamples), len(chks))
cr := NewOOOHeadChunkReader(db.head, tc.queryMinT, tc.queryMaxT)
cr := NewOOOHeadChunkReader(db.head, tc.queryMinT, tc.queryMaxT, nil)
defer cr.Close()
for i := 0; i < len(chks); i++ {
c, err := cr.Chunk(chks[i])
c, iterable, err := cr.ChunkOrIterable(chks[i])
require.NoError(t, err)
require.Nil(t, c)
var resultSamples chunks.SampleSlice
it := c.Iterator(nil)
it := iterable.Iterator(nil)
for it.Next() == chunkenc.ValFloat {
t, v := it.At()
resultSamples = append(resultSamples, sample{t: t, f: v})
@ -1005,7 +1009,7 @@ func TestOOOHeadChunkReader_Chunk_ConsistentQueryResponseDespiteOfHeadExpanding(
// The Series method is the one that populates the chunk meta OOO
// markers like OOOLastRef. These are then used by the ChunkReader.
ir := NewOOOHeadIndexReader(db.head, tc.queryMinT, tc.queryMaxT)
ir := NewOOOHeadIndexReader(db.head, tc.queryMinT, tc.queryMaxT, 0)
var chks []chunks.Meta
var b labels.ScratchBuilder
err := ir.Series(s1Ref, &b, &chks)
@ -1020,13 +1024,15 @@ func TestOOOHeadChunkReader_Chunk_ConsistentQueryResponseDespiteOfHeadExpanding(
}
require.NoError(t, app.Commit())
cr := NewOOOHeadChunkReader(db.head, tc.queryMinT, tc.queryMaxT)
cr := NewOOOHeadChunkReader(db.head, tc.queryMinT, tc.queryMaxT, nil)
defer cr.Close()
for i := 0; i < len(chks); i++ {
c, err := cr.Chunk(chks[i])
c, iterable, err := cr.ChunkOrIterable(chks[i])
require.NoError(t, err)
require.Nil(t, c)
var resultSamples chunks.SampleSlice
it := c.Iterator(nil)
it := iterable.Iterator(nil)
for it.Next() == chunkenc.ValFloat {
ts, v := it.At()
resultSamples = append(resultSamples, sample{t: ts, f: v})

tsdb/ooo_isolation.go (new file, 79 lines)
View file

@ -0,0 +1,79 @@
// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"container/list"
"sync"
"github.com/prometheus/prometheus/tsdb/chunks"
)
type oooIsolation struct {
mtx sync.RWMutex
openReads *list.List
}
type oooIsolationState struct {
i *oooIsolation
e *list.Element
minRef chunks.ChunkDiskMapperRef
}
func newOOOIsolation() *oooIsolation {
return &oooIsolation{
openReads: list.New(),
}
}
// HasOpenReadsAtOrBefore returns true if this oooIsolation is aware of any reads that use
// chunks with reference at or before ref.
func (i *oooIsolation) HasOpenReadsAtOrBefore(ref chunks.ChunkDiskMapperRef) bool {
i.mtx.RLock()
defer i.mtx.RUnlock()
for e := i.openReads.Front(); e != nil; e = e.Next() {
s := e.Value.(*oooIsolationState)
if ref.GreaterThan(s.minRef) {
return true
}
}
return false
}
// TrackReadAfter records a read that uses chunks with reference after minRef.
//
// The caller must ensure that the returned oooIsolationState is eventually closed when
// the read is complete.
func (i *oooIsolation) TrackReadAfter(minRef chunks.ChunkDiskMapperRef) *oooIsolationState {
s := &oooIsolationState{
i: i,
minRef: minRef,
}
i.mtx.Lock()
s.e = i.openReads.PushBack(s)
i.mtx.Unlock()
return s
}
func (s oooIsolationState) Close() {
s.i.mtx.Lock()
s.i.openReads.Remove(s.e)
s.i.mtx.Unlock()
}
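
Taken together, oooIsolation is a small registry of in-flight OOO reads: each read records the lowest chunk ref it might still need, and truncation-side code can ask whether anything depends on refs at or below a candidate watermark before garbage-collecting them. A minimal usage sketch using only the identifiers defined in this file (it would live in package tsdb, since they are unexported; the ref values are arbitrary):

```go
func exampleOOOIsolationUsage() {
	iso := newOOOIsolation()

	// A read that may touch chunks with refs strictly greater than 41.
	state := iso.TrackReadAfter(41)

	_ = iso.HasOpenReadsAtOrBefore(41) // false: the read only needs refs > 41
	_ = iso.HasOpenReadsAtOrBefore(42) // true: ref 42 may still be in use

	// Closing the state unregisters the read so truncation can move on.
	state.Close()

	_ = iso.HasOpenReadsAtOrBefore(42) // false again
}
```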

View file

@ -0,0 +1,60 @@
// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tsdb
import (
"testing"
"github.com/stretchr/testify/require"
)
func TestOOOIsolation(t *testing.T) {
i := newOOOIsolation()
// Empty state shouldn't have any open reads.
require.False(t, i.HasOpenReadsAtOrBefore(0))
require.False(t, i.HasOpenReadsAtOrBefore(1))
require.False(t, i.HasOpenReadsAtOrBefore(2))
require.False(t, i.HasOpenReadsAtOrBefore(3))
// Add a read.
read1 := i.TrackReadAfter(1)
require.False(t, i.HasOpenReadsAtOrBefore(0))
require.False(t, i.HasOpenReadsAtOrBefore(1))
require.True(t, i.HasOpenReadsAtOrBefore(2))
// Add another overlapping read.
read2 := i.TrackReadAfter(0)
require.False(t, i.HasOpenReadsAtOrBefore(0))
require.True(t, i.HasOpenReadsAtOrBefore(1))
require.True(t, i.HasOpenReadsAtOrBefore(2))
// Close the second read, should now only report open reads for the first read's ref.
read2.Close()
require.False(t, i.HasOpenReadsAtOrBefore(0))
require.False(t, i.HasOpenReadsAtOrBefore(1))
require.True(t, i.HasOpenReadsAtOrBefore(2))
// Close the second read again: this should do nothing and ensures we can safely call Close() multiple times.
read2.Close()
require.False(t, i.HasOpenReadsAtOrBefore(0))
require.False(t, i.HasOpenReadsAtOrBefore(1))
require.True(t, i.HasOpenReadsAtOrBefore(2))
// Closing the first read should indicate no further open reads.
read1.Close()
require.False(t, i.HasOpenReadsAtOrBefore(0))
require.False(t, i.HasOpenReadsAtOrBefore(1))
require.False(t, i.HasOpenReadsAtOrBefore(2))
}

View file

@ -685,36 +685,42 @@ func (b *blockBaseSeriesSet) Warnings() annotations.Annotations { return nil }
// populateWithDelGenericSeriesIterator assumes that chunks that would be fully
// removed by intervals are filtered out in previous phase.
//
// On each iteration currChkMeta is available. If currDelIter is not nil, it
// means that the chunk iterator in currChkMeta is invalid and a chunk rewrite
// is needed, for which currDelIter should be used.
// On each iteration currMeta is available. If currDelIter is not nil, it
// means that the chunk in currMeta is invalid and a chunk rewrite is needed,
// for which currDelIter should be used.
type populateWithDelGenericSeriesIterator struct {
blockID ulid.ULID
chunks ChunkReader
// chks are expected to be sorted by minTime and should be related to
cr ChunkReader
// metas are expected to be sorted by minTime and should be related to
// the same, single series.
chks []chunks.Meta
// It's possible for a single chunks.Meta to refer to multiple chunks.
// cr.ChunkOrIterable() would return an iterable and a nil chunk in this
// case.
metas []chunks.Meta
i int // Index into chks; -1 if not started yet.
i int // Index into metas; -1 if not started yet.
err error
bufIter DeletedIterator // Retained for memory re-use. currDelIter may point here.
intervals tombstones.Intervals
currDelIter chunkenc.Iterator
currChkMeta chunks.Meta
// currMeta is the current chunks.Meta from metas. currMeta.Chunk is set to
// the chunk returned from cr.ChunkOrIterable(). As that can return a nil
// chunk, currMeta.Chunk is not always guaranteed to be set.
currMeta chunks.Meta
}
func (p *populateWithDelGenericSeriesIterator) reset(blockID ulid.ULID, cr ChunkReader, chks []chunks.Meta, intervals tombstones.Intervals) {
p.blockID = blockID
p.chunks = cr
p.chks = chks
p.cr = cr
p.metas = chks
p.i = -1
p.err = nil
// Note we don't touch p.bufIter.Iter; it is holding on to an iterator we might reuse in next().
p.bufIter.Intervals = p.bufIter.Intervals[:0]
p.intervals = intervals
p.currDelIter = nil
p.currChkMeta = chunks.Meta{}
p.currMeta = chunks.Meta{}
}
// If copyHeadChunk is true, then the head chunk (i.e. the in-memory chunk of the TSDB)
@ -722,43 +728,54 @@ func (p *populateWithDelGenericSeriesIterator) reset(blockID ulid.ULID, cr Chunk
// However, if the deletion intervals overlaps with the head chunk, then the head chunk is
// not copied irrespective of copyHeadChunk because it will be re-encoded later anyway.
func (p *populateWithDelGenericSeriesIterator) next(copyHeadChunk bool) bool {
if p.err != nil || p.i >= len(p.chks)-1 {
if p.err != nil || p.i >= len(p.metas)-1 {
return false
}
p.i++
p.currChkMeta = p.chks[p.i]
p.currMeta = p.metas[p.i]
p.bufIter.Intervals = p.bufIter.Intervals[:0]
for _, interval := range p.intervals {
if p.currChkMeta.OverlapsClosedInterval(interval.Mint, interval.Maxt) {
if p.currMeta.OverlapsClosedInterval(interval.Mint, interval.Maxt) {
p.bufIter.Intervals = p.bufIter.Intervals.Add(interval)
}
}
hcr, ok := p.chunks.(*headChunkReader)
hcr, ok := p.cr.(*headChunkReader)
var iterable chunkenc.Iterable
if ok && copyHeadChunk && len(p.bufIter.Intervals) == 0 {
// ChunkWithCopy will copy the head chunk.
var maxt int64
p.currChkMeta.Chunk, maxt, p.err = hcr.ChunkWithCopy(p.currChkMeta)
p.currMeta.Chunk, maxt, p.err = hcr.ChunkWithCopy(p.currMeta)
// For the in-memory head chunk the index reader sets maxt as MaxInt64. We fix it here.
p.currChkMeta.MaxTime = maxt
p.currMeta.MaxTime = maxt
} else {
p.currChkMeta.Chunk, p.err = p.chunks.Chunk(p.currChkMeta)
p.currMeta.Chunk, iterable, p.err = p.cr.ChunkOrIterable(p.currMeta)
}
if p.err != nil {
p.err = errors.Wrapf(p.err, "cannot populate chunk %d from block %s", p.currChkMeta.Ref, p.blockID.String())
p.err = errors.Wrapf(p.err, "cannot populate chunk %d from block %s", p.currMeta.Ref, p.blockID.String())
return false
}
if len(p.bufIter.Intervals) == 0 {
// If there is no overlap with deletion intervals, we can take chunk as it is.
p.currDelIter = nil
// Use the single chunk if possible.
if p.currMeta.Chunk != nil {
if len(p.bufIter.Intervals) == 0 {
// If there is no overlap with deletion intervals and a single chunk is
// returned, we can take chunk as it is.
p.currDelIter = nil
return true
}
// Otherwise we need to iterate over the samples in the single chunk
// and create new chunks.
p.bufIter.Iter = p.currMeta.Chunk.Iterator(p.bufIter.Iter)
p.currDelIter = &p.bufIter
return true
}
// We don't want the full chunk, take just a part of it.
p.bufIter.Iter = p.currChkMeta.Chunk.Iterator(p.bufIter.Iter)
// Otherwise, use the iterable to create an iterator.
p.bufIter.Iter = iterable.Iterator(p.bufIter.Iter)
p.currDelIter = &p.bufIter
return true
}
@ -822,7 +839,7 @@ func (p *populateWithDelSeriesIterator) Next() chunkenc.ValueType {
if p.currDelIter != nil {
p.curr = p.currDelIter
} else {
p.curr = p.currChkMeta.Chunk.Iterator(p.curr)
p.curr = p.currMeta.Chunk.Iterator(p.curr)
}
if valueType := p.curr.Next(); valueType != chunkenc.ValNone {
return valueType
@ -874,22 +891,61 @@ func (p *populateWithDelSeriesIterator) Err() error {
type populateWithDelChunkSeriesIterator struct {
populateWithDelGenericSeriesIterator
curr chunks.Meta
// currMetaWithChunk is current meta with its chunk field set. This meta
// is guaranteed to map to a single chunk. This differs from
// populateWithDelGenericSeriesIterator.currMeta as that
// could refer to multiple chunks.
currMetaWithChunk chunks.Meta
// chunksFromIterable stores the chunks created from iterating through
// the iterable returned by cr.ChunkOrIterable() (with deleted samples
// removed).
chunksFromIterable []chunks.Meta
chunksFromIterableIdx int
}
func (p *populateWithDelChunkSeriesIterator) reset(blockID ulid.ULID, cr ChunkReader, chks []chunks.Meta, intervals tombstones.Intervals) {
p.populateWithDelGenericSeriesIterator.reset(blockID, cr, chks, intervals)
p.curr = chunks.Meta{}
p.currMetaWithChunk = chunks.Meta{}
p.chunksFromIterable = p.chunksFromIterable[:0]
p.chunksFromIterableIdx = -1
}
func (p *populateWithDelChunkSeriesIterator) Next() bool {
if p.currMeta.Chunk == nil {
// If we've been creating chunks from the iterable, check if there are
// any more chunks to iterate through.
if p.chunksFromIterableIdx < len(p.chunksFromIterable)-1 {
p.chunksFromIterableIdx++
p.currMetaWithChunk = p.chunksFromIterable[p.chunksFromIterableIdx]
return true
}
}
// Move to the next chunk/deletion iterator.
if !p.next(true) {
return false
}
p.curr = p.currChkMeta
if p.currDelIter == nil {
return true
if p.currMeta.Chunk != nil {
if p.currDelIter == nil {
p.currMetaWithChunk = p.currMeta
return true
}
// If ChunkOrIterable() returned a non-nil chunk, the samples in
// p.currDelIter will only form one chunk, as the only change
// p.currDelIter might make is deleting some samples.
return p.populateCurrForSingleChunk()
}
// If ChunkOrIterable() returned an iterable, multiple chunks may be
// created from the samples in p.currDelIter.
return p.populateChunksFromIterable()
}
// populateCurrForSingleChunk sets the fields within p.currMetaWithChunk. This
// should be called if the samples in p.currDelIter only form one chunk.
func (p *populateWithDelChunkSeriesIterator) populateCurrForSingleChunk() bool {
valueType := p.currDelIter.Next()
if valueType == chunkenc.ValNone {
if err := p.currDelIter.Err(); err != nil {
@ -897,9 +953,9 @@ func (p *populateWithDelChunkSeriesIterator) Next() bool {
}
return false
}
p.curr.MinTime = p.currDelIter.AtT()
p.currMetaWithChunk.MinTime = p.currDelIter.AtT()
// Re-encode the chunk if iterator is provider. This means that it has
// Re-encode the chunk if iterator is provided. This means that it has
// some samples to be deleted or chunk is opened.
var (
newChunk chunkenc.Chunk
@ -957,7 +1013,7 @@ func (p *populateWithDelChunkSeriesIterator) Next() bool {
}
}
default:
err = fmt.Errorf("populateWithDelChunkSeriesIterator: value type %v unsupported", valueType)
err = fmt.Errorf("populateCurrForSingleChunk: value type %v unsupported", valueType)
}
if err != nil {
@ -969,12 +1025,127 @@ func (p *populateWithDelChunkSeriesIterator) Next() bool {
return false
}
p.curr.Chunk = newChunk
p.curr.MaxTime = t
p.currMetaWithChunk.Chunk = newChunk
p.currMetaWithChunk.MaxTime = t
return true
}
func (p *populateWithDelChunkSeriesIterator) At() chunks.Meta { return p.curr }
// populateChunksFromIterable reads the samples from currDelIter to create
// chunks for chunksFromIterable. It also sets p.currMetaWithChunk to the first
// chunk.
func (p *populateWithDelChunkSeriesIterator) populateChunksFromIterable() bool {
p.chunksFromIterable = p.chunksFromIterable[:0]
p.chunksFromIterableIdx = -1
firstValueType := p.currDelIter.Next()
if firstValueType == chunkenc.ValNone {
if err := p.currDelIter.Err(); err != nil {
p.err = errors.Wrap(err, "populateChunksFromIterable: no samples could be read")
return false
}
return false
}
var (
// t is the timestamp for the current sample.
t int64
cmint int64
cmaxt int64
currentChunk chunkenc.Chunk
app chunkenc.Appender
newChunk chunkenc.Chunk
recoded bool
err error
)
prevValueType := chunkenc.ValNone
for currentValueType := firstValueType; currentValueType != chunkenc.ValNone; currentValueType = p.currDelIter.Next() {
// Check if the encoding has changed (i.e. we need to create a new
// chunk as chunks can't have multiple encoding types).
// For the first sample, the following condition will always be true as
// ValNone != ValFloat | ValHistogram | ValFloatHistogram.
if currentValueType != prevValueType {
if prevValueType != chunkenc.ValNone {
p.chunksFromIterable = append(p.chunksFromIterable, chunks.Meta{Chunk: currentChunk, MinTime: cmint, MaxTime: cmaxt})
}
cmint = p.currDelIter.AtT()
if currentChunk, err = currentValueType.NewChunk(); err != nil {
break
}
if app, err = currentChunk.Appender(); err != nil {
break
}
}
switch currentValueType {
case chunkenc.ValFloat:
{
var v float64
t, v = p.currDelIter.At()
app.Append(t, v)
}
case chunkenc.ValHistogram:
{
var v *histogram.Histogram
t, v = p.currDelIter.AtHistogram()
// No need to set prevApp as AppendHistogram will set the
// counter reset header for the appender that's returned.
newChunk, recoded, app, err = app.AppendHistogram(nil, t, v, false)
}
case chunkenc.ValFloatHistogram:
{
var v *histogram.FloatHistogram
t, v = p.currDelIter.AtFloatHistogram()
// No need to set prevApp as AppendHistogram will set the
// counter reset header for the appender that's returned.
newChunk, recoded, app, err = app.AppendFloatHistogram(nil, t, v, false)
}
}
if err != nil {
break
}
if newChunk != nil {
if !recoded {
p.chunksFromIterable = append(p.chunksFromIterable, chunks.Meta{Chunk: currentChunk, MinTime: cmint, MaxTime: cmaxt})
}
currentChunk = newChunk
cmint = t
}
cmaxt = t
prevValueType = currentValueType
}
if err != nil {
p.err = errors.Wrap(err, "populateChunksFromIterable: error when writing new chunks")
return false
}
if err = p.currDelIter.Err(); err != nil {
p.err = errors.Wrap(err, "populateChunksFromIterable: currDelIter error when writing new chunks")
return false
}
if prevValueType != chunkenc.ValNone {
p.chunksFromIterable = append(p.chunksFromIterable, chunks.Meta{Chunk: currentChunk, MinTime: cmint, MaxTime: cmaxt})
}
if len(p.chunksFromIterable) == 0 {
return false
}
p.currMetaWithChunk = p.chunksFromIterable[0]
p.chunksFromIterableIdx = 0
return true
}
func (p *populateWithDelChunkSeriesIterator) At() chunks.Meta { return p.currMetaWithChunk }
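
The loop in populateChunksFromIterable is essentially a group-by over the sample stream: whenever the value type changes, the running chunk is sealed and a new one started, so an iterable mixing floats and histograms comes back as several single-encoding chunks (it can split further when AppendHistogram signals a counter reset, which this sketch ignores). A toy restatement of the boundary logic with plain types instead of chunkenc, purely for illustration:

```go
type sampleKind int

const (
	kindFloat sampleKind = iota
	kindHistogram
	kindFloatHistogram
)

type toySample struct {
	t    int64
	kind sampleKind
}

type toyChunk struct {
	kind       sampleKind
	mint, maxt int64
	count      int
}

// groupByEncoding seals the running chunk each time the sample kind changes,
// mirroring how populateChunksFromIterable starts a new chunkenc chunk when
// the value type differs from the previous sample's.
func groupByEncoding(samples []toySample) []toyChunk {
	var out []toyChunk
	for _, s := range samples {
		if len(out) == 0 || out[len(out)-1].kind != s.kind {
			out = append(out, toyChunk{kind: s.kind, mint: s.t})
		}
		cur := &out[len(out)-1]
		cur.maxt = s.t
		cur.count++
	}
	return out
}
```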
// blockSeriesSet allows to iterate over sorted, populated series with applied tombstones.
// Series with all deleted chunks are still present as Series with no samples.
@ -1174,8 +1345,8 @@ func newNopChunkReader() ChunkReader {
}
}
func (cr nopChunkReader) Chunk(chunks.Meta) (chunkenc.Chunk, error) {
return cr.emptyChunk, nil
func (cr nopChunkReader) ChunkOrIterable(chunks.Meta) (chunkenc.Chunk, chunkenc.Iterable, error) {
return cr.emptyChunk, nil, nil
}
func (cr nopChunkReader) Close() error { return nil }

View file

@ -725,12 +725,14 @@ func TestBlockQuerierDelete(t *testing.T) {
type fakeChunksReader struct {
ChunkReader
chks map[chunks.ChunkRef]chunkenc.Chunk
chks map[chunks.ChunkRef]chunkenc.Chunk
iterables map[chunks.ChunkRef]chunkenc.Iterable
}
func createFakeReaderAndNotPopulatedChunks(s ...[]chunks.Sample) (*fakeChunksReader, []chunks.Meta) {
f := &fakeChunksReader{
chks: map[chunks.ChunkRef]chunkenc.Chunk{},
chks: map[chunks.ChunkRef]chunkenc.Chunk{},
iterables: map[chunks.ChunkRef]chunkenc.Iterable{},
}
chks := make([]chunks.Meta, 0, len(s))
@ -747,21 +749,102 @@ func createFakeReaderAndNotPopulatedChunks(s ...[]chunks.Sample) (*fakeChunksRea
return f, chks
}
func (r *fakeChunksReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
chk, ok := r.chks[meta.Ref]
if !ok {
return nil, errors.Errorf("chunk not found at ref %v", meta.Ref)
// Samples in each slice are assumed to be sorted.
func createFakeReaderAndIterables(s ...[]chunks.Sample) (*fakeChunksReader, []chunks.Meta) {
f := &fakeChunksReader{
chks: map[chunks.ChunkRef]chunkenc.Chunk{},
iterables: map[chunks.ChunkRef]chunkenc.Iterable{},
}
return chk, nil
chks := make([]chunks.Meta, 0, len(s))
for ref, samples := range s {
f.iterables[chunks.ChunkRef(ref)] = &mockIterable{s: samples}
var minTime, maxTime int64
if len(samples) > 0 {
minTime = samples[0].T()
maxTime = samples[len(samples)-1].T()
}
chks = append(chks, chunks.Meta{
Ref: chunks.ChunkRef(ref),
MinTime: minTime,
MaxTime: maxTime,
})
}
return f, chks
}
func (r *fakeChunksReader) ChunkOrIterable(meta chunks.Meta) (chunkenc.Chunk, chunkenc.Iterable, error) {
if chk, ok := r.chks[meta.Ref]; ok {
return chk, nil, nil
}
if it, ok := r.iterables[meta.Ref]; ok {
return nil, it, nil
}
return nil, nil, fmt.Errorf("chunk or iterable not found at ref %v", meta.Ref)
}
type mockIterable struct {
s []chunks.Sample
}
func (it *mockIterable) Iterator(chunkenc.Iterator) chunkenc.Iterator {
return &mockSampleIterator{
s: it.s,
idx: -1,
}
}
type mockSampleIterator struct {
s []chunks.Sample
idx int
}
func (it *mockSampleIterator) Seek(t int64) chunkenc.ValueType {
for ; it.idx < len(it.s); it.idx++ {
if it.idx != -1 && it.s[it.idx].T() >= t {
return it.s[it.idx].Type()
}
}
return chunkenc.ValNone
}
func (it *mockSampleIterator) At() (int64, float64) {
return it.s[it.idx].T(), it.s[it.idx].F()
}
func (it *mockSampleIterator) AtHistogram() (int64, *histogram.Histogram) {
return it.s[it.idx].T(), it.s[it.idx].H()
}
func (it *mockSampleIterator) AtFloatHistogram() (int64, *histogram.FloatHistogram) {
return it.s[it.idx].T(), it.s[it.idx].FH()
}
func (it *mockSampleIterator) AtT() int64 {
return it.s[it.idx].T()
}
func (it *mockSampleIterator) Next() chunkenc.ValueType {
if it.idx < len(it.s)-1 {
it.idx++
return it.s[it.idx].Type()
}
return chunkenc.ValNone
}
func (it *mockSampleIterator) Err() error { return nil }
func TestPopulateWithTombSeriesIterators(t *testing.T) {
type minMaxTimes struct {
minTime, maxTime int64
}
cases := []struct {
name string
chks [][]chunks.Sample
name string
samples [][]chunks.Sample
expected []chunks.Sample
expectedChks []chunks.Meta
@ -772,23 +855,38 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
// Seek being zero means do not test seek.
seek int64
seekSuccess bool
// Set this to true if a sample slice will form multiple chunks.
skipChunkTest bool
skipIterableTest bool
}{
{
name: "no chunk",
chks: [][]chunks.Sample{},
name: "no chunk",
samples: [][]chunks.Sample{},
},
{
name: "one empty chunk", // This should never happen.
chks: [][]chunks.Sample{{}},
name: "one empty chunk", // This should never happen.
samples: [][]chunks.Sample{{}},
expectedChks: []chunks.Meta{
assureChunkFromSamples(t, []chunks.Sample{}),
},
expectedMinMaxTimes: []minMaxTimes{{0, 0}},
// iterables with no samples will return no chunks instead of empty chunks
skipIterableTest: true,
},
{
name: "three empty chunks", // This should never happen.
chks: [][]chunks.Sample{{}, {}, {}},
name: "one empty iterable",
samples: [][]chunks.Sample{{}},
// iterables with no samples will return no chunks
expectedChks: nil,
skipChunkTest: true,
},
{
name: "three empty chunks", // This should never happen.
samples: [][]chunks.Sample{{}, {}, {}},
expectedChks: []chunks.Meta{
assureChunkFromSamples(t, []chunks.Sample{}),
@ -796,10 +894,20 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
assureChunkFromSamples(t, []chunks.Sample{}),
},
expectedMinMaxTimes: []minMaxTimes{{0, 0}, {0, 0}, {0, 0}},
// iterables with no samples will return no chunks instead of empty chunks
skipIterableTest: true,
},
{
name: "three empty iterables",
samples: [][]chunks.Sample{{}, {}, {}},
// iterables with no samples will return no chunks
expectedChks: nil,
skipChunkTest: true,
},
{
name: "one chunk",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{sample{1, 2, nil, nil}, sample{2, 3, nil, nil}, sample{3, 5, nil, nil}, sample{6, 1, nil, nil}},
},
@ -815,7 +923,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "two full chunks",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{sample{1, 2, nil, nil}, sample{2, 3, nil, nil}, sample{3, 5, nil, nil}, sample{6, 1, nil, nil}},
{sample{7, 89, nil, nil}, sample{9, 8, nil, nil}},
},
@ -835,7 +943,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "three full chunks",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{sample{1, 2, nil, nil}, sample{2, 3, nil, nil}, sample{3, 5, nil, nil}, sample{6, 1, nil, nil}},
{sample{7, 89, nil, nil}, sample{9, 8, nil, nil}},
{sample{10, 22, nil, nil}, sample{203, 3493, nil, nil}},
@ -859,15 +967,15 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
// Seek cases.
{
name: "three empty chunks and seek", // This should never happen.
chks: [][]chunks.Sample{{}, {}, {}},
seek: 1,
name: "three empty chunks and seek", // This should never happen.
samples: [][]chunks.Sample{{}, {}, {}},
seek: 1,
seekSuccess: false,
},
{
name: "two chunks and seek beyond chunks",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{sample{1, 2, nil, nil}, sample{3, 5, nil, nil}, sample{6, 1, nil, nil}},
{sample{7, 89, nil, nil}, sample{9, 8, nil, nil}},
},
@ -877,7 +985,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "two chunks and seek on middle of first chunk",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{sample{1, 2, nil, nil}, sample{3, 5, nil, nil}, sample{6, 1, nil, nil}},
{sample{7, 89, nil, nil}, sample{9, 8, nil, nil}},
},
@ -890,7 +998,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "two chunks and seek before first chunk",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{sample{1, 2, nil, nil}, sample{3, 5, nil, nil}, sample{6, 1, nil, nil}},
{sample{7, 89, nil, nil}, sample{9, 8, nil, nil}},
},
@ -904,12 +1012,12 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
// Deletion / Trim cases.
{
name: "no chunk with deletion interval",
chks: [][]chunks.Sample{},
samples: [][]chunks.Sample{},
intervals: tombstones.Intervals{{Mint: 20, Maxt: 21}},
},
{
name: "two chunks with trimmed first and last samples from edge chunks",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{sample{1, 2, nil, nil}, sample{2, 3, nil, nil}, sample{3, 5, nil, nil}, sample{6, 1, nil, nil}},
{sample{7, 89, nil, nil}, sample{9, 8, nil, nil}},
},
@ -930,7 +1038,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "two chunks with trimmed middle sample of first chunk",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{sample{1, 2, nil, nil}, sample{2, 3, nil, nil}, sample{3, 5, nil, nil}, sample{6, 1, nil, nil}},
{sample{7, 89, nil, nil}, sample{9, 8, nil, nil}},
},
@ -951,7 +1059,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "two chunks with deletion across two chunks",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{sample{1, 2, nil, nil}, sample{2, 3, nil, nil}, sample{3, 5, nil, nil}, sample{6, 1, nil, nil}},
{sample{7, 89, nil, nil}, sample{9, 8, nil, nil}},
},
@ -973,7 +1081,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
// Deletion with seek.
{
name: "two chunks with trimmed first and last samples from edge chunks, seek from middle of first chunk",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{sample{1, 2, nil, nil}, sample{2, 3, nil, nil}, sample{3, 5, nil, nil}, sample{6, 1, nil, nil}},
{sample{7, 89, nil, nil}, sample{9, 8, nil, nil}},
},
@ -985,9 +1093,20 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
sample{3, 5, nil, nil}, sample{6, 1, nil, nil}, sample{7, 89, nil, nil},
},
},
{
name: "one chunk where all samples are trimmed",
samples: [][]chunks.Sample{
{sample{2, 3, nil, nil}, sample{3, 5, nil, nil}, sample{6, 1, nil, nil}},
{sample{7, 89, nil, nil}, sample{9, 8, nil, nil}},
},
intervals: tombstones.Intervals{{Mint: math.MinInt64, Maxt: 3}}.Add(tombstones.Interval{Mint: 4, Maxt: math.MaxInt64}),
expected: nil,
expectedChks: nil,
},
{
name: "one histogram chunk",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{
sample{1, 0, tsdbutil.GenerateTestHistogram(1), nil},
sample{2, 0, tsdbutil.GenerateTestHistogram(2), nil},
@ -1013,7 +1132,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "one histogram chunk intersect with earlier deletion interval",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{
sample{1, 0, tsdbutil.GenerateTestHistogram(1), nil},
sample{2, 0, tsdbutil.GenerateTestHistogram(2), nil},
@ -1036,7 +1155,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "one histogram chunk intersect with later deletion interval",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{
sample{1, 0, tsdbutil.GenerateTestHistogram(1), nil},
sample{2, 0, tsdbutil.GenerateTestHistogram(2), nil},
@ -1061,7 +1180,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "one float histogram chunk",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{
sample{1, 0, nil, tsdbutil.GenerateTestFloatHistogram(1)},
sample{2, 0, nil, tsdbutil.GenerateTestFloatHistogram(2)},
@ -1087,7 +1206,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "one float histogram chunk intersect with earlier deletion interval",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{
sample{1, 0, nil, tsdbutil.GenerateTestFloatHistogram(1)},
sample{2, 0, nil, tsdbutil.GenerateTestFloatHistogram(2)},
@ -1110,7 +1229,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "one float histogram chunk intersect with later deletion interval",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{
sample{1, 0, nil, tsdbutil.GenerateTestFloatHistogram(1)},
sample{2, 0, nil, tsdbutil.GenerateTestFloatHistogram(2)},
@ -1135,7 +1254,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "one gauge histogram chunk",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{
sample{1, 0, tsdbutil.GenerateTestGaugeHistogram(1), nil},
sample{2, 0, tsdbutil.GenerateTestGaugeHistogram(2), nil},
@ -1161,7 +1280,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "one gauge histogram chunk intersect with earlier deletion interval",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{
sample{1, 0, tsdbutil.GenerateTestGaugeHistogram(1), nil},
sample{2, 0, tsdbutil.GenerateTestGaugeHistogram(2), nil},
@ -1184,7 +1303,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "one gauge histogram chunk intersect with later deletion interval",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{
sample{1, 0, tsdbutil.GenerateTestGaugeHistogram(1), nil},
sample{2, 0, tsdbutil.GenerateTestGaugeHistogram(2), nil},
@ -1209,7 +1328,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "one gauge float histogram",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{
sample{1, 0, nil, tsdbutil.GenerateTestGaugeFloatHistogram(1)},
sample{2, 0, nil, tsdbutil.GenerateTestGaugeFloatHistogram(2)},
@ -1235,7 +1354,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "one gauge float histogram chunk intersect with earlier deletion interval",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{
sample{1, 0, nil, tsdbutil.GenerateTestGaugeFloatHistogram(1)},
sample{2, 0, nil, tsdbutil.GenerateTestGaugeFloatHistogram(2)},
@ -1258,7 +1377,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "one gauge float histogram chunk intersect with later deletion interval",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{
sample{1, 0, nil, tsdbutil.GenerateTestGaugeFloatHistogram(1)},
sample{2, 0, nil, tsdbutil.GenerateTestGaugeFloatHistogram(2)},
@ -1283,7 +1402,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "three full mixed chunks",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{sample{1, 2, nil, nil}, sample{2, 3, nil, nil}, sample{3, 5, nil, nil}, sample{6, 1, nil, nil}},
{
sample{7, 0, tsdbutil.GenerateTestGaugeHistogram(89), nil},
@ -1315,7 +1434,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "three full mixed chunks in different order",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{
sample{7, 0, tsdbutil.GenerateTestGaugeHistogram(89), nil},
sample{9, 0, tsdbutil.GenerateTestGaugeHistogram(8), nil},
@ -1347,7 +1466,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "three full mixed chunks in different order intersect with deletion interval",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{
sample{7, 0, tsdbutil.GenerateTestGaugeHistogram(89), nil},
sample{9, 0, tsdbutil.GenerateTestGaugeHistogram(8), nil},
@ -1378,7 +1497,7 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
{
name: "three full mixed chunks overlapping",
chks: [][]chunks.Sample{
samples: [][]chunks.Sample{
{
sample{7, 0, tsdbutil.GenerateTestGaugeHistogram(89), nil},
sample{12, 0, tsdbutil.GenerateTestGaugeHistogram(8), nil},
@ -1408,11 +1527,237 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
},
expectedMinMaxTimes: []minMaxTimes{{7, 12}, {11, 16}, {10, 203}},
},
{
// This case won't actually happen until OOO native histograms are implemented.
// Issue: https://github.com/prometheus/prometheus/issues/11220.
name: "int histogram iterables with counter resets",
samples: [][]chunks.Sample{
{
sample{7, 0, tsdbutil.GenerateTestHistogram(8), nil},
sample{8, 0, tsdbutil.GenerateTestHistogram(9), nil},
// Counter reset should be detected when chunks are created from the iterable.
sample{12, 0, tsdbutil.GenerateTestHistogram(5), nil},
sample{15, 0, tsdbutil.GenerateTestHistogram(6), nil},
sample{16, 0, tsdbutil.GenerateTestHistogram(7), nil},
// Counter reset should be detected when chunks are created from the iterable.
sample{17, 0, tsdbutil.GenerateTestHistogram(5), nil},
},
{
sample{18, 0, tsdbutil.GenerateTestHistogram(6), nil},
sample{19, 0, tsdbutil.GenerateTestHistogram(7), nil},
// Counter reset should be detected when chunks are created from the iterable.
sample{20, 0, tsdbutil.GenerateTestHistogram(5), nil},
sample{21, 0, tsdbutil.GenerateTestHistogram(6), nil},
},
},
expected: []chunks.Sample{
sample{7, 0, tsdbutil.GenerateTestHistogram(8), nil},
sample{8, 0, tsdbutil.GenerateTestHistogram(9), nil},
sample{12, 0, tsdbutil.GenerateTestHistogram(5), nil},
sample{15, 0, tsdbutil.GenerateTestHistogram(6), nil},
sample{16, 0, tsdbutil.GenerateTestHistogram(7), nil},
sample{17, 0, tsdbutil.GenerateTestHistogram(5), nil},
sample{18, 0, tsdbutil.GenerateTestHistogram(6), nil},
sample{19, 0, tsdbutil.GenerateTestHistogram(7), nil},
sample{20, 0, tsdbutil.GenerateTestHistogram(5), nil},
sample{21, 0, tsdbutil.GenerateTestHistogram(6), nil},
},
expectedChks: []chunks.Meta{
assureChunkFromSamples(t, []chunks.Sample{
sample{7, 0, tsdbutil.GenerateTestHistogram(8), nil},
sample{8, 0, tsdbutil.SetHistogramNotCounterReset(tsdbutil.GenerateTestHistogram(9)), nil},
}),
assureChunkFromSamples(t, []chunks.Sample{
sample{12, 0, tsdbutil.SetHistogramCounterReset(tsdbutil.GenerateTestHistogram(5)), nil},
sample{15, 0, tsdbutil.SetHistogramNotCounterReset(tsdbutil.GenerateTestHistogram(6)), nil},
sample{16, 0, tsdbutil.SetHistogramNotCounterReset(tsdbutil.GenerateTestHistogram(7)), nil},
}),
assureChunkFromSamples(t, []chunks.Sample{
sample{17, 0, tsdbutil.SetHistogramCounterReset(tsdbutil.GenerateTestHistogram(5)), nil},
}),
assureChunkFromSamples(t, []chunks.Sample{
sample{18, 0, tsdbutil.GenerateTestHistogram(6), nil},
sample{19, 0, tsdbutil.SetHistogramNotCounterReset(tsdbutil.GenerateTestHistogram(7)), nil},
}),
assureChunkFromSamples(t, []chunks.Sample{
sample{20, 0, tsdbutil.SetHistogramCounterReset(tsdbutil.GenerateTestHistogram(5)), nil},
sample{21, 0, tsdbutil.SetHistogramNotCounterReset(tsdbutil.GenerateTestHistogram(6)), nil},
}),
},
expectedMinMaxTimes: []minMaxTimes{
{7, 8},
{12, 16},
{17, 17},
{18, 19},
{20, 21},
},
// Skipping chunk test - can't create a single chunk for each
// sample slice since there are counter resets in the middle of
// the slices.
skipChunkTest: true,
},
{
// This case won't actually happen until OOO native histograms are implemented.
// Issue: https://github.com/prometheus/prometheus/issues/11220.
name: "float histogram iterables with counter resets",
samples: [][]chunks.Sample{
{
sample{7, 0, nil, tsdbutil.GenerateTestFloatHistogram(8)},
sample{8, 0, nil, tsdbutil.GenerateTestFloatHistogram(9)},
// Counter reset should be detected when chunks are created from the iterable.
sample{12, 0, nil, tsdbutil.GenerateTestFloatHistogram(5)},
sample{15, 0, nil, tsdbutil.GenerateTestFloatHistogram(6)},
sample{16, 0, nil, tsdbutil.GenerateTestFloatHistogram(7)},
// Counter reset should be detected when chunks are created from the iterable.
sample{17, 0, nil, tsdbutil.GenerateTestFloatHistogram(5)},
},
{
sample{18, 0, nil, tsdbutil.GenerateTestFloatHistogram(6)},
sample{19, 0, nil, tsdbutil.GenerateTestFloatHistogram(7)},
// Counter reset should be detected when chunks are created from the iterable.
sample{20, 0, nil, tsdbutil.GenerateTestFloatHistogram(5)},
sample{21, 0, nil, tsdbutil.GenerateTestFloatHistogram(6)},
},
},
expected: []chunks.Sample{
sample{7, 0, nil, tsdbutil.GenerateTestFloatHistogram(8)},
sample{8, 0, nil, tsdbutil.GenerateTestFloatHistogram(9)},
sample{12, 0, nil, tsdbutil.GenerateTestFloatHistogram(5)},
sample{15, 0, nil, tsdbutil.GenerateTestFloatHistogram(6)},
sample{16, 0, nil, tsdbutil.GenerateTestFloatHistogram(7)},
sample{17, 0, nil, tsdbutil.GenerateTestFloatHistogram(5)},
sample{18, 0, nil, tsdbutil.GenerateTestFloatHistogram(6)},
sample{19, 0, nil, tsdbutil.GenerateTestFloatHistogram(7)},
sample{20, 0, nil, tsdbutil.GenerateTestFloatHistogram(5)},
sample{21, 0, nil, tsdbutil.GenerateTestFloatHistogram(6)},
},
expectedChks: []chunks.Meta{
assureChunkFromSamples(t, []chunks.Sample{
sample{7, 0, nil, tsdbutil.GenerateTestFloatHistogram(8)},
sample{8, 0, nil, tsdbutil.SetFloatHistogramNotCounterReset(tsdbutil.GenerateTestFloatHistogram(9))},
}),
assureChunkFromSamples(t, []chunks.Sample{
sample{12, 0, nil, tsdbutil.SetFloatHistogramCounterReset(tsdbutil.GenerateTestFloatHistogram(5))},
sample{15, 0, nil, tsdbutil.SetFloatHistogramNotCounterReset(tsdbutil.GenerateTestFloatHistogram(6))},
sample{16, 0, nil, tsdbutil.SetFloatHistogramNotCounterReset(tsdbutil.GenerateTestFloatHistogram(7))},
}),
assureChunkFromSamples(t, []chunks.Sample{
sample{17, 0, nil, tsdbutil.SetFloatHistogramCounterReset(tsdbutil.GenerateTestFloatHistogram(5))},
}),
assureChunkFromSamples(t, []chunks.Sample{
sample{18, 0, nil, tsdbutil.GenerateTestFloatHistogram(6)},
sample{19, 0, nil, tsdbutil.SetFloatHistogramNotCounterReset(tsdbutil.GenerateTestFloatHistogram(7))},
}),
assureChunkFromSamples(t, []chunks.Sample{
sample{20, 0, nil, tsdbutil.SetFloatHistogramCounterReset(tsdbutil.GenerateTestFloatHistogram(5))},
sample{21, 0, nil, tsdbutil.SetFloatHistogramNotCounterReset(tsdbutil.GenerateTestFloatHistogram(6))},
}),
},
expectedMinMaxTimes: []minMaxTimes{
{7, 8},
{12, 16},
{17, 17},
{18, 19},
{20, 21},
},
// Skipping chunk test - can't create a single chunk for each
// sample slice since there are counter resets in the middle of
// the slices.
skipChunkTest: true,
},
{
// This case won't actually happen until OOO native histograms are implemented.
// Issue: https://github.com/prometheus/prometheus/issues/11220.
name: "iterables with mixed encodings and counter resets",
samples: [][]chunks.Sample{
{
sample{7, 0, tsdbutil.GenerateTestHistogram(8), nil},
sample{8, 0, tsdbutil.GenerateTestHistogram(9), nil},
sample{9, 0, nil, tsdbutil.GenerateTestFloatHistogram(10)},
sample{10, 0, nil, tsdbutil.GenerateTestFloatHistogram(11)},
sample{11, 0, nil, tsdbutil.GenerateTestFloatHistogram(12)},
sample{12, 13, nil, nil},
sample{13, 14, nil, nil},
sample{14, 0, tsdbutil.GenerateTestHistogram(8), nil},
// Counter reset should be detected when chunks are created from the iterable.
sample{15, 0, tsdbutil.GenerateTestHistogram(7), nil},
},
{
sample{18, 0, tsdbutil.GenerateTestHistogram(6), nil},
sample{19, 45, nil, nil},
},
},
expected: []chunks.Sample{
sample{7, 0, tsdbutil.GenerateTestHistogram(8), nil},
sample{8, 0, tsdbutil.GenerateTestHistogram(9), nil},
sample{9, 0, nil, tsdbutil.GenerateTestFloatHistogram(10)},
sample{10, 0, nil, tsdbutil.GenerateTestFloatHistogram(11)},
sample{11, 0, nil, tsdbutil.GenerateTestFloatHistogram(12)},
sample{12, 13, nil, nil},
sample{13, 14, nil, nil},
sample{14, 0, tsdbutil.GenerateTestHistogram(8), nil},
sample{15, 0, tsdbutil.GenerateTestHistogram(7), nil},
sample{18, 0, tsdbutil.GenerateTestHistogram(6), nil},
sample{19, 45, nil, nil},
},
expectedChks: []chunks.Meta{
assureChunkFromSamples(t, []chunks.Sample{
sample{7, 0, tsdbutil.GenerateTestHistogram(8), nil},
sample{8, 0, tsdbutil.GenerateTestHistogram(9), nil},
}),
assureChunkFromSamples(t, []chunks.Sample{
sample{9, 0, nil, tsdbutil.GenerateTestFloatHistogram(10)},
sample{10, 0, nil, tsdbutil.GenerateTestFloatHistogram(11)},
sample{11, 0, nil, tsdbutil.GenerateTestFloatHistogram(12)},
}),
assureChunkFromSamples(t, []chunks.Sample{
sample{12, 13, nil, nil},
sample{13, 14, nil, nil},
}),
assureChunkFromSamples(t, []chunks.Sample{
sample{14, 0, tsdbutil.GenerateTestHistogram(8), nil},
}),
assureChunkFromSamples(t, []chunks.Sample{
sample{15, 0, tsdbutil.SetHistogramCounterReset(tsdbutil.GenerateTestHistogram(7)), nil},
}),
assureChunkFromSamples(t, []chunks.Sample{
sample{18, 0, tsdbutil.GenerateTestHistogram(6), nil},
}),
assureChunkFromSamples(t, []chunks.Sample{
sample{19, 45, nil, nil},
}),
},
expectedMinMaxTimes: []minMaxTimes{
{7, 8},
{9, 11},
{12, 13},
{14, 14},
{15, 15},
{18, 18},
{19, 19},
},
skipChunkTest: true,
},
}
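
An editorial aside on the counter-reset cases above, not part of the diff: the expected chunks split wherever a histogram's cumulative values drop between consecutive samples (for example from GenerateTestHistogram(9) at t=8 to GenerateTestHistogram(5) at t=12). A minimal, hypothetical sketch of that idea (not the TSDB's actual detection logic, which also inspects schema and individual bucket counts):

package countersketch

// looksLikeCounterReset is a deliberately simplified stand-in for the real
// detection: it flags a reset when the cumulative observation count or sum
// of a histogram decreases from one sample to the next.
func looksLikeCounterReset(prevCount, currCount uint64, prevSum, currSum float64) bool {
	return currCount < prevCount || currSum < prevSum
}

Under that reading, the drops at t=12, t=17 and t=20 in the test data are what force new chunks, which is why expectedChks and expectedMinMaxTimes split at exactly those timestamps.
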
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
t.Run("sample", func(t *testing.T) {
f, chkMetas := createFakeReaderAndNotPopulatedChunks(tc.chks...)
var f *fakeChunksReader
var chkMetas []chunks.Meta
// If the test case wants to skip the chunk test, it probably means valid
// chunks can't be created from the sample slices, so create iterables over
// the samples instead.
if tc.skipChunkTest {
f, chkMetas = createFakeReaderAndIterables(tc.samples...)
} else {
f, chkMetas = createFakeReaderAndNotPopulatedChunks(tc.samples...)
}
it := &populateWithDelSeriesIterator{}
it.reset(ulid.ULID{}, f, chkMetas, tc.intervals)
@ -1433,7 +1778,35 @@ func TestPopulateWithTombSeriesIterators(t *testing.T) {
require.Equal(t, tc.expected, r)
})
t.Run("chunk", func(t *testing.T) {
f, chkMetas := createFakeReaderAndNotPopulatedChunks(tc.chks...)
if tc.skipChunkTest {
t.Skip()
}
f, chkMetas := createFakeReaderAndNotPopulatedChunks(tc.samples...)
it := &populateWithDelChunkSeriesIterator{}
it.reset(ulid.ULID{}, f, chkMetas, tc.intervals)
if tc.seek != 0 {
// The chunk iterator does not have a Seek method.
return
}
expandedResult, err := storage.ExpandChunks(it)
require.NoError(t, err)
// We don't care about ref IDs for the comparison; only the chunks' samples matter.
rmChunkRefs(expandedResult)
rmChunkRefs(tc.expectedChks)
require.Equal(t, tc.expectedChks, expandedResult)
for i, meta := range expandedResult {
require.Equal(t, tc.expectedMinMaxTimes[i].minTime, meta.MinTime)
require.Equal(t, tc.expectedMinMaxTimes[i].maxTime, meta.MaxTime)
}
})
t.Run("iterables", func(t *testing.T) {
if tc.skipIterableTest {
t.Skip()
}
f, chkMetas := createFakeReaderAndIterables(tc.samples...)
it := &populateWithDelChunkSeriesIterator{}
it.reset(ulid.ULID{}, f, chkMetas, tc.intervals)
@ -1726,13 +2099,13 @@ func BenchmarkMergedSeriesSet(b *testing.B) {
type mockChunkReader map[chunks.ChunkRef]chunkenc.Chunk
func (cr mockChunkReader) Chunk(meta chunks.Meta) (chunkenc.Chunk, error) {
func (cr mockChunkReader) ChunkOrIterable(meta chunks.Meta) (chunkenc.Chunk, chunkenc.Iterable, error) {
chk, ok := cr[meta.Ref]
if ok {
return chk, nil
return chk, nil, nil
}
return nil, errors.New("Chunk with ref not found")
return nil, nil, errors.New("Chunk with ref not found")
}
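
For orientation while reading this hunk: the mock now returns either an encoded chunk or an iterable (never both), plus an error. Below is a hedged, illustrative caller for that contract; the interface and function names are made up and are not part of the diff:

package chunkorsketch

import (
	"github.com/prometheus/prometheus/tsdb/chunkenc"
	"github.com/prometheus/prometheus/tsdb/chunks"
)

// chunkOrIterableReader mirrors the shape of the reader method above.
type chunkOrIterableReader interface {
	ChunkOrIterable(meta chunks.Meta) (chunkenc.Chunk, chunkenc.Iterable, error)
}

// iteratorFor shows how a caller can branch on the result: use the encoded
// chunk when present, otherwise fall back to iterating the iterable.
func iteratorFor(cr chunkOrIterableReader, meta chunks.Meta) (chunkenc.Iterator, error) {
	chk, iterable, err := cr.ChunkOrIterable(meta)
	if err != nil {
		return nil, err
	}
	if chk != nil {
		return chk.Iterator(nil), nil
	}
	return iterable.Iterator(nil), nil
}
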
func (cr mockChunkReader) Close() error {
@ -1871,7 +2244,7 @@ func (m mockIndex) Symbols() index.StringIter {
func (m *mockIndex) AddSeries(ref storage.SeriesRef, l labels.Labels, chunks ...chunks.Meta) error {
if _, ok := m.series[ref]; ok {
return errors.Errorf("series with reference %d already added", ref)
return fmt.Errorf("series with reference %d already added", ref)
}
l.Range(func(lbl labels.Label) {
m.symbols[lbl.Name] = struct{}{}
@ -1892,7 +2265,7 @@ func (m *mockIndex) AddSeries(ref storage.SeriesRef, l labels.Labels, chunks ...
func (m mockIndex) WritePostings(name, value string, it index.Postings) error {
l := labels.Label{Name: name, Value: value}
if _, ok := m.postings[l]; ok {
return errors.Errorf("postings for %s already added", l)
return fmt.Errorf("postings for %s already added", l)
}
ep, err := index.ExpandPostings(it)
if err != nil {
@ -2656,6 +3029,7 @@ func TestQuerierIndexQueriesRace(t *testing.T) {
for _, c := range testCases {
c := c
t.Run(fmt.Sprintf("%v", c.matchers), func(t *testing.T) {
t.Parallel()
db := openTestDB(t, DefaultOptions(), nil)
h := db.Head()
t.Cleanup(func() {
@ -2675,6 +3049,9 @@ func TestQuerierIndexQueriesRace(t *testing.T) {
values, _, err := q.LabelValues(ctx, "seq", c.matchers...)
require.NoError(t, err)
require.Emptyf(t, values, `label values for label "seq" should be empty`)
// Sleep to give the appends some chance to run.
time.Sleep(time.Millisecond)
}
})
}
@ -2691,6 +3068,7 @@ func appendSeries(t *testing.T, ctx context.Context, wg *sync.WaitGroup, h *Head
require.NoError(t, err)
// Throttle down the appends to keep the test somewhat nimble.
// Otherwise, we end up appending thousands or millions of samples.
time.Sleep(time.Millisecond)
}
}
@ -2817,7 +3195,7 @@ func BenchmarkQueries(b *testing.B) {
qHead, err := NewBlockQuerier(NewRangeHead(head, 1, nSamples), 1, nSamples)
require.NoError(b, err)
qOOOHead, err := NewBlockQuerier(NewOOORangeHead(head, 1, nSamples), 1, nSamples)
qOOOHead, err := NewBlockQuerier(NewOOORangeHead(head, 1, nSamples, 0), 1, nSamples)
require.NoError(b, err)
queryTypes = append(queryTypes, qt{
@ -3042,7 +3420,7 @@ func TestBlockBaseSeriesSet(t *testing.T) {
idx := tc.expIdxs[i]
require.Equal(t, tc.series[idx].lset, bcs.curr.labels)
require.Equal(t, tc.series[idx].chunks, si.chks)
require.Equal(t, tc.series[idx].chunks, si.metas)
i++
}
