From 5752050b42195d87d7dd0760f7ec3b088f115cd3 Mon Sep 17 00:00:00 2001 From: Paulin Todev Date: Fri, 22 Sep 2023 17:47:44 +0100 Subject: [PATCH 01/17] Scrape metrics can now be registered with a non-default registry. * A registerer is passed to the scrape Manager, and all scrape metrics register with it. * For now the registry which we pass to the scrape Manager is still the global one. Signed-off-by: Paulin Todev --- cmd/prometheus/main.go | 12 +- scrape/manager.go | 86 +++--------- scrape/manager_test.go | 23 ++- scrape/metrics.go | 307 +++++++++++++++++++++++++++++++++++++++++ scrape/scrape.go | 245 ++++++-------------------------- scrape/scrape_test.go | 86 +++++++++--- 6 files changed, 464 insertions(+), 295 deletions(-) create mode 100644 scrape/metrics.go diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index cdfb42b185..9e85f571a8 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -620,8 +620,18 @@ func main() { discoveryManagerNotify = legacymanager.NewManager(ctxNotify, log.With(logger, "component", "discovery manager notify"), legacymanager.Name("notify")) } + scrapeManager, err := scrape.NewManager( + &cfg.scrape, + log.With(logger, "component", "scrape manager"), + fanoutStorage, + prometheus.DefaultRegisterer, + ) + if err != nil { + level.Error(logger).Log("msg", "failed to create a scrape manager", "err", err) + os.Exit(1) + } + var ( - scrapeManager = scrape.NewManager(&cfg.scrape, log.With(logger, "component", "scrape manager"), fanoutStorage) tracingManager = tracing.NewManager(logger) queryEngine *promql.Engine diff --git a/scrape/manager.go b/scrape/manager.go index bd73dd2962..14dd610618 100644 --- a/scrape/manager.go +++ b/scrape/manager.go @@ -34,80 +34,20 @@ import ( "github.com/prometheus/prometheus/util/osutil" ) -var targetMetadataCache = newMetadataMetricsCollector() - -// MetadataMetricsCollector is a Custom Collector for the metadata cache metrics. -type MetadataMetricsCollector struct { - CacheEntries *prometheus.Desc - CacheBytes *prometheus.Desc - - scrapeManager *Manager -} - -func newMetadataMetricsCollector() *MetadataMetricsCollector { - return &MetadataMetricsCollector{ - CacheEntries: prometheus.NewDesc( - "prometheus_target_metadata_cache_entries", - "Total number of metric metadata entries in the cache", - []string{"scrape_job"}, - nil, - ), - CacheBytes: prometheus.NewDesc( - "prometheus_target_metadata_cache_bytes", - "The number of bytes that are currently used for storing metric metadata in the cache", - []string{"scrape_job"}, - nil, - ), - } -} - -func (mc *MetadataMetricsCollector) registerManager(m *Manager) { - mc.scrapeManager = m -} - -// Describe sends the metrics descriptions to the channel. -func (mc *MetadataMetricsCollector) Describe(ch chan<- *prometheus.Desc) { - ch <- mc.CacheEntries - ch <- mc.CacheBytes -} - -// Collect creates and sends the metrics for the metadata cache. 
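The cmd/prometheus/main.go hunk above is the only change outside the scrape package: NewManager now takes a prometheus.Registerer and returns an error, instead of registering everything on the global registry as a side effect. As a minimal sketch of the same dependency-injection pattern in isolation, using only client_golang; the component and metric names below are illustrative and not part of this patch:

package main

import (
	"fmt"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// componentMetrics mirrors the scrapeMetrics idea: metrics are struct fields,
// not package-level globals, and registration failures surface as errors.
type componentMetrics struct {
	reloads prometheus.Counter
}

func newComponentMetrics(reg prometheus.Registerer) (*componentMetrics, error) {
	m := &componentMetrics{
		reloads: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "example_component_reloads_total",
			Help: "Total number of reloads.",
		}),
	}
	if err := reg.Register(m.reloads); err != nil {
		return nil, fmt.Errorf("failed to register component metrics: %w", err)
	}
	return m, nil
}

func main() {
	reg := prometheus.NewRegistry() // a non-default registry chosen by the caller
	m, err := newComponentMetrics(reg)
	if err != nil {
		panic(err)
	}
	m.reloads.Inc()

	// Expose only this registry, not the global one.
	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	_ = http.ListenAndServe(":8080", nil)
}

Callers that want the previous behaviour can simply pass prometheus.DefaultRegisterer, which is exactly what main.go does above.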
-func (mc *MetadataMetricsCollector) Collect(ch chan<- prometheus.Metric) { - if mc.scrapeManager == nil { - return - } - - for tset, targets := range mc.scrapeManager.TargetsActive() { - var size, length int - for _, t := range targets { - size += t.MetadataSize() - length += t.MetadataLength() - } - - ch <- prometheus.MustNewConstMetric( - mc.CacheEntries, - prometheus.GaugeValue, - float64(length), - tset, - ) - - ch <- prometheus.MustNewConstMetric( - mc.CacheBytes, - prometheus.GaugeValue, - float64(size), - tset, - ) - } -} - // NewManager is the Manager constructor -func NewManager(o *Options, logger log.Logger, app storage.Appendable) *Manager { +func NewManager(o *Options, logger log.Logger, app storage.Appendable, registerer prometheus.Registerer) (*Manager, error) { if o == nil { o = &Options{} } if logger == nil { logger = log.NewNopLogger() } + + sm, err := newScrapeMetrics(registerer) + if err != nil { + return nil, fmt.Errorf("failed to create scrape manager due to error: %w", err) + } + m := &Manager{ append: app, opts: o, @@ -116,10 +56,12 @@ func NewManager(o *Options, logger log.Logger, app storage.Appendable) *Manager scrapePools: make(map[string]*scrapePool), graceShut: make(chan struct{}), triggerReload: make(chan struct{}, 1), + metrics: sm, } - targetMetadataCache.registerManager(m) - return m + m.metrics.setTargetMetadataCacheGatherer(m) + + return m, nil } // Options are the configuration parameters to the scrape manager. @@ -154,6 +96,8 @@ type Manager struct { targetSets map[string][]*targetgroup.Group triggerReload chan struct{} + + metrics *scrapeMetrics } // Run receives and saves target set updates and triggers the scraping loops reloading. @@ -211,8 +155,10 @@ func (m *Manager) reload() { level.Error(m.logger).Log("msg", "error reloading target set", "err", "invalid config id:"+setName) continue } - sp, err := newScrapePool(scrapeConfig, m.append, m.offsetSeed, log.With(m.logger, "scrape_pool", setName), m.opts) + m.metrics.targetScrapePools.Inc() + sp, err := newScrapePool(scrapeConfig, m.append, m.offsetSeed, log.With(m.logger, "scrape_pool", setName), m.opts, m.metrics) if err != nil { + m.metrics.targetScrapePoolsFailed.Inc() level.Error(m.logger).Log("msg", "error creating new scrape pool", "err", err, "scrape_pool", setName) continue } diff --git a/scrape/manager_test.go b/scrape/manager_test.go index 50f6320137..a689c469d4 100644 --- a/scrape/manager_test.go +++ b/scrape/manager_test.go @@ -20,6 +20,7 @@ import ( "testing" "time" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" "github.com/stretchr/testify/require" "gopkg.in/yaml.v2" @@ -492,10 +493,13 @@ scrape_configs: cfg3 = loadConfiguration(t, cfgText3) ch = make(chan struct{}, 1) + + testRegistry = prometheus.NewRegistry() ) opts := Options{} - scrapeManager := NewManager(&opts, nil, nil) + scrapeManager, err := NewManager(&opts, nil, nil, testRegistry) + require.NoError(t, err) newLoop := func(scrapeLoopOptions) loop { ch <- struct{}{} return noopLoop() @@ -512,6 +516,7 @@ scrape_configs: logger: nil, config: cfg1.ScrapeConfigs[0], client: http.DefaultClient, + metrics: scrapeManager.metrics, } scrapeManager.scrapePools = map[string]*scrapePool{ "job1": sp, @@ -560,7 +565,9 @@ scrape_configs: func TestManagerTargetsUpdates(t *testing.T) { opts := Options{} - m := NewManager(&opts, nil, nil) + testRegistry := prometheus.NewRegistry() + m, err := NewManager(&opts, nil, nil, testRegistry) + require.NoError(t, err) ts := make(chan 
map[string][]*targetgroup.Group) go m.Run(ts) @@ -613,7 +620,9 @@ global: } opts := Options{} - scrapeManager := NewManager(&opts, nil, nil) + testRegistry := prometheus.NewRegistry() + scrapeManager, err := NewManager(&opts, nil, nil, testRegistry) + require.NoError(t, err) // Load the first config. cfg1 := getConfig("ha1") @@ -658,8 +667,9 @@ scrape_configs: - targets: ["foo:9093"] ` var ( - cfg1 = loadConfiguration(t, cfgText1) - cfg2 = loadConfiguration(t, cfgText2) + cfg1 = loadConfiguration(t, cfgText1) + cfg2 = loadConfiguration(t, cfgText2) + testRegistry = prometheus.NewRegistry() ) reload := func(scrapeManager *Manager, cfg *config.Config) { @@ -695,7 +705,8 @@ scrape_configs: } opts := Options{} - scrapeManager := NewManager(&opts, nil, nil) + scrapeManager, err := NewManager(&opts, nil, nil, testRegistry) + require.NoError(t, err) reload(scrapeManager, cfg1) require.ElementsMatch(t, []string{"job1", "job2"}, scrapeManager.ScrapePools()) diff --git a/scrape/metrics.go b/scrape/metrics.go new file mode 100644 index 0000000000..d74143185b --- /dev/null +++ b/scrape/metrics.go @@ -0,0 +1,307 @@ +// Copyright 2016 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package scrape + +import ( + "fmt" + + "github.com/prometheus/client_golang/prometheus" +) + +type scrapeMetrics struct { + // Used by Manager. + targetMetadataCache *MetadataMetricsCollector + targetScrapePools prometheus.Counter + targetScrapePoolsFailed prometheus.Counter + + // Used by scrapePool. + targetReloadIntervalLength *prometheus.SummaryVec + targetScrapePoolReloads prometheus.Counter + targetScrapePoolReloadsFailed prometheus.Counter + targetScrapePoolSyncsCounter *prometheus.CounterVec + targetScrapePoolExceededTargetLimit prometheus.Counter + targetScrapePoolTargetLimit *prometheus.GaugeVec + targetScrapePoolTargetsAdded *prometheus.GaugeVec + targetSyncIntervalLength *prometheus.SummaryVec + targetSyncFailed *prometheus.CounterVec + + // Used by targetScraper. + targetScrapeExceededBodySizeLimit prometheus.Counter + + // Used by scrapeCache. + targetScrapeCacheFlushForced prometheus.Counter + + // Used by scrapeLoop. + targetIntervalLength *prometheus.SummaryVec + targetScrapeSampleLimit prometheus.Counter + targetScrapeSampleDuplicate prometheus.Counter + targetScrapeSampleOutOfOrder prometheus.Counter + targetScrapeSampleOutOfBounds prometheus.Counter + targetScrapeExemplarOutOfOrder prometheus.Counter + targetScrapePoolExceededLabelLimits prometheus.Counter + targetScrapeNativeHistogramBucketLimit prometheus.Counter +} + +func newScrapeMetrics(reg prometheus.Registerer) (*scrapeMetrics, error) { + sm := &scrapeMetrics{} + + // Manager metrics. 
+ sm.targetMetadataCache = &MetadataMetricsCollector{ + CacheEntries: prometheus.NewDesc( + "prometheus_target_metadata_cache_entries", + "Total number of metric metadata entries in the cache", + []string{"scrape_job"}, + nil, + ), + CacheBytes: prometheus.NewDesc( + "prometheus_target_metadata_cache_bytes", + "The number of bytes that are currently used for storing metric metadata in the cache", + []string{"scrape_job"}, + nil, + ), + // TargetsGatherer should be set later, because it's a circular dependency. + // newScrapeMetrics() is called by NewManager(), while also TargetsGatherer is the new Manager. + } + + sm.targetScrapePools = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_target_scrape_pools_total", + Help: "Total number of scrape pool creation attempts.", + }, + ) + sm.targetScrapePoolsFailed = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_target_scrape_pools_failed_total", + Help: "Total number of scrape pool creations that failed.", + }, + ) + + // Used by scrapePool. + sm.targetReloadIntervalLength = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Name: "prometheus_target_reload_length_seconds", + Help: "Actual interval to reload the scrape pool with a given configuration.", + Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001}, + }, + []string{"interval"}, + ) + sm.targetScrapePoolReloads = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_target_scrape_pool_reloads_total", + Help: "Total number of scrape pool reloads.", + }, + ) + sm.targetScrapePoolReloadsFailed = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_target_scrape_pool_reloads_failed_total", + Help: "Total number of failed scrape pool reloads.", + }, + ) + sm.targetScrapePoolExceededTargetLimit = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_target_scrape_pool_exceeded_target_limit_total", + Help: "Total number of times scrape pools hit the target limit, during sync or config reload.", + }, + ) + sm.targetScrapePoolTargetLimit = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "prometheus_target_scrape_pool_target_limit", + Help: "Maximum number of targets allowed in this scrape pool.", + }, + []string{"scrape_job"}, + ) + sm.targetScrapePoolTargetsAdded = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "prometheus_target_scrape_pool_targets", + Help: "Current number of targets in this scrape pool.", + }, + []string{"scrape_job"}, + ) + sm.targetScrapePoolSyncsCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "prometheus_target_scrape_pool_sync_total", + Help: "Total number of syncs that were executed on a scrape pool.", + }, + []string{"scrape_job"}, + ) + sm.targetSyncIntervalLength = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Name: "prometheus_target_sync_length_seconds", + Help: "Actual interval to sync the scrape pool.", + Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001}, + }, + []string{"scrape_job"}, + ) + sm.targetSyncFailed = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "prometheus_target_sync_failed_total", + Help: "Total number of target sync failures.", + }, + []string{"scrape_job"}, + ) + + // Used by targetScraper. 
+ sm.targetScrapeExceededBodySizeLimit = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_target_scrapes_exceeded_body_size_limit_total", + Help: "Total number of scrapes that hit the body size limit", + }, + ) + + // Used by scrapeCache. + sm.targetScrapeCacheFlushForced = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_target_scrapes_cache_flush_forced_total", + Help: "How many times a scrape cache was flushed due to getting big while scrapes are failing.", + }, + ) + + // Used by scrapeLoop. + sm.targetIntervalLength = prometheus.NewSummaryVec( + prometheus.SummaryOpts{ + Name: "prometheus_target_interval_length_seconds", + Help: "Actual intervals between scrapes.", + Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001}, + }, + []string{"interval"}, + ) + sm.targetScrapeSampleLimit = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_target_scrapes_exceeded_sample_limit_total", + Help: "Total number of scrapes that hit the sample limit and were rejected.", + }, + ) + sm.targetScrapeSampleDuplicate = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_target_scrapes_sample_duplicate_timestamp_total", + Help: "Total number of samples rejected due to duplicate timestamps but different values.", + }, + ) + sm.targetScrapeSampleOutOfOrder = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_target_scrapes_sample_out_of_order_total", + Help: "Total number of samples rejected due to not being out of the expected order.", + }, + ) + sm.targetScrapeSampleOutOfBounds = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_target_scrapes_sample_out_of_bounds_total", + Help: "Total number of samples rejected due to timestamp falling outside of the time bounds.", + }, + ) + sm.targetScrapePoolExceededLabelLimits = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_target_scrape_pool_exceeded_label_limits_total", + Help: "Total number of times scrape pools hit the label limits, during sync or config reload.", + }, + ) + sm.targetScrapeNativeHistogramBucketLimit = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_target_scrapes_exceeded_native_histogram_bucket_limit_total", + Help: "Total number of scrapes that hit the native histogram bucket limit and were rejected.", + }, + ) + sm.targetScrapeExemplarOutOfOrder = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_target_scrapes_exemplar_out_of_order_total", + Help: "Total number of exemplar rejected due to not being out of the expected order.", + }, + ) + + for _, collector := range []prometheus.Collector{ + // Used by Manager. + sm.targetMetadataCache, + sm.targetScrapePools, + sm.targetScrapePoolsFailed, + // Used by scrapePool. + sm.targetReloadIntervalLength, + sm.targetScrapePoolReloads, + sm.targetScrapePoolReloadsFailed, + sm.targetSyncIntervalLength, + sm.targetScrapePoolSyncsCounter, + sm.targetScrapePoolExceededTargetLimit, + sm.targetScrapePoolTargetLimit, + sm.targetScrapePoolTargetsAdded, + sm.targetSyncFailed, + // Used by targetScraper. + sm.targetScrapeExceededBodySizeLimit, + // Used by scrapeCache. + sm.targetScrapeCacheFlushForced, + // Used by scrapeLoop. 
+ sm.targetIntervalLength, + sm.targetScrapeSampleLimit, + sm.targetScrapeSampleDuplicate, + sm.targetScrapeSampleOutOfOrder, + sm.targetScrapeSampleOutOfBounds, + sm.targetScrapeExemplarOutOfOrder, + sm.targetScrapePoolExceededLabelLimits, + sm.targetScrapeNativeHistogramBucketLimit, + } { + err := reg.Register(collector) + if err != nil { + return nil, fmt.Errorf("failed to register scrape metrics: %w", err) + } + } + return sm, nil +} + +func (sm *scrapeMetrics) setTargetMetadataCacheGatherer(gatherer TargetsGatherer) { + sm.targetMetadataCache.TargetsGatherer = gatherer +} + +type TargetsGatherer interface { + TargetsActive() map[string][]*Target +} + +// MetadataMetricsCollector is a Custom Collector for the metadata cache metrics. +type MetadataMetricsCollector struct { + CacheEntries *prometheus.Desc + CacheBytes *prometheus.Desc + TargetsGatherer TargetsGatherer +} + +// Describe sends the metrics descriptions to the channel. +func (mc *MetadataMetricsCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- mc.CacheEntries + ch <- mc.CacheBytes +} + +// Collect creates and sends the metrics for the metadata cache. +func (mc *MetadataMetricsCollector) Collect(ch chan<- prometheus.Metric) { + if mc.TargetsGatherer == nil { + return + } + + for tset, targets := range mc.TargetsGatherer.TargetsActive() { + var size, length int + for _, t := range targets { + size += t.MetadataSize() + length += t.MetadataLength() + } + + ch <- prometheus.MustNewConstMetric( + mc.CacheEntries, + prometheus.GaugeValue, + float64(length), + tset, + ) + + ch <- prometheus.MustNewConstMetric( + mc.CacheBytes, + prometheus.GaugeValue, + float64(size), + tset, + ) + } +} diff --git a/scrape/scrape.go b/scrape/scrape.go index 27236ee329..d67297ca7e 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -31,7 +31,6 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/pkg/errors" - "github.com/prometheus/client_golang/prometheus" config_util "github.com/prometheus/common/config" "github.com/prometheus/common/model" "github.com/prometheus/common/version" @@ -61,172 +60,6 @@ var AlignScrapeTimestamps = true var errNameLabelMandatory = fmt.Errorf("missing metric name (%s label)", labels.MetricName) -var ( - targetIntervalLength = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ - Name: "prometheus_target_interval_length_seconds", - Help: "Actual intervals between scrapes.", - Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001}, - }, - []string{"interval"}, - ) - targetReloadIntervalLength = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ - Name: "prometheus_target_reload_length_seconds", - Help: "Actual interval to reload the scrape pool with a given configuration.", - Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001}, - }, - []string{"interval"}, - ) - targetScrapePools = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "prometheus_target_scrape_pools_total", - Help: "Total number of scrape pool creation attempts.", - }, - ) - targetScrapePoolsFailed = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "prometheus_target_scrape_pools_failed_total", - Help: "Total number of scrape pool creations that failed.", - }, - ) - targetScrapePoolReloads = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "prometheus_target_scrape_pool_reloads_total", - Help: "Total number of scrape pool reloads.", - }, - ) - targetScrapePoolReloadsFailed = prometheus.NewCounter( - 
prometheus.CounterOpts{ - Name: "prometheus_target_scrape_pool_reloads_failed_total", - Help: "Total number of failed scrape pool reloads.", - }, - ) - targetScrapePoolExceededTargetLimit = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "prometheus_target_scrape_pool_exceeded_target_limit_total", - Help: "Total number of times scrape pools hit the target limit, during sync or config reload.", - }, - ) - targetScrapePoolTargetLimit = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "prometheus_target_scrape_pool_target_limit", - Help: "Maximum number of targets allowed in this scrape pool.", - }, - []string{"scrape_job"}, - ) - targetScrapePoolTargetsAdded = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "prometheus_target_scrape_pool_targets", - Help: "Current number of targets in this scrape pool.", - }, - []string{"scrape_job"}, - ) - targetSyncIntervalLength = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ - Name: "prometheus_target_sync_length_seconds", - Help: "Actual interval to sync the scrape pool.", - Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001}, - }, - []string{"scrape_job"}, - ) - targetScrapePoolSyncsCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: "prometheus_target_scrape_pool_sync_total", - Help: "Total number of syncs that were executed on a scrape pool.", - }, - []string{"scrape_job"}, - ) - targetScrapeExceededBodySizeLimit = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "prometheus_target_scrapes_exceeded_body_size_limit_total", - Help: "Total number of scrapes that hit the body size limit", - }, - ) - targetScrapeSampleLimit = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "prometheus_target_scrapes_exceeded_sample_limit_total", - Help: "Total number of scrapes that hit the sample limit and were rejected.", - }, - ) - targetScrapeSampleDuplicate = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "prometheus_target_scrapes_sample_duplicate_timestamp_total", - Help: "Total number of samples rejected due to duplicate timestamps but different values.", - }, - ) - targetScrapeSampleOutOfOrder = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "prometheus_target_scrapes_sample_out_of_order_total", - Help: "Total number of samples rejected due to not being out of the expected order.", - }, - ) - targetScrapeSampleOutOfBounds = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "prometheus_target_scrapes_sample_out_of_bounds_total", - Help: "Total number of samples rejected due to timestamp falling outside of the time bounds.", - }, - ) - targetScrapeCacheFlushForced = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "prometheus_target_scrapes_cache_flush_forced_total", - Help: "How many times a scrape cache was flushed due to getting big while scrapes are failing.", - }, - ) - targetScrapeExemplarOutOfOrder = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "prometheus_target_scrapes_exemplar_out_of_order_total", - Help: "Total number of exemplar rejected due to not being out of the expected order.", - }, - ) - targetScrapePoolExceededLabelLimits = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "prometheus_target_scrape_pool_exceeded_label_limits_total", - Help: "Total number of times scrape pools hit the label limits, during sync or config reload.", - }, - ) - targetSyncFailed = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: "prometheus_target_sync_failed_total", - Help: "Total number of target 
sync failures.", - }, - []string{"scrape_job"}, - ) - targetScrapeNativeHistogramBucketLimit = prometheus.NewCounter( - prometheus.CounterOpts{ - Name: "prometheus_target_scrapes_exceeded_native_histogram_bucket_limit_total", - Help: "Total number of scrapes that hit the native histogram bucket limit and were rejected.", - }, - ) -) - -func init() { - prometheus.MustRegister( - targetIntervalLength, - targetReloadIntervalLength, - targetScrapePools, - targetScrapePoolsFailed, - targetScrapePoolReloads, - targetScrapePoolReloadsFailed, - targetSyncIntervalLength, - targetScrapePoolSyncsCounter, - targetScrapeExceededBodySizeLimit, - targetScrapeSampleLimit, - targetScrapeSampleDuplicate, - targetScrapeSampleOutOfOrder, - targetScrapeSampleOutOfBounds, - targetScrapePoolExceededTargetLimit, - targetScrapePoolTargetLimit, - targetScrapePoolTargetsAdded, - targetScrapeCacheFlushForced, - targetMetadataCache, - targetScrapeExemplarOutOfOrder, - targetScrapePoolExceededLabelLimits, - targetSyncFailed, - targetScrapeNativeHistogramBucketLimit, - ) -} - // scrapePool manages scrapes for sets of targets. type scrapePool struct { appendable storage.Appendable @@ -251,6 +84,8 @@ type scrapePool struct { newLoop func(scrapeLoopOptions) loop noDefaultPort bool + + metrics *scrapeMetrics } type labelLimits struct { @@ -279,15 +114,13 @@ const maxAheadTime = 10 * time.Minute // returning an empty label set is interpreted as "drop" type labelsMutator func(labels.Labels) labels.Labels -func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, offsetSeed uint64, logger log.Logger, options *Options) (*scrapePool, error) { - targetScrapePools.Inc() +func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, offsetSeed uint64, logger log.Logger, options *Options, metrics *scrapeMetrics) (*scrapePool, error) { if logger == nil { logger = log.NewNopLogger() } client, err := config_util.NewClientFromConfig(cfg.HTTPClientConfig, cfg.JobName, options.HTTPClientOptions...) if err != nil { - targetScrapePoolsFailed.Inc() return nil, errors.Wrap(err, "error creating HTTP client") } @@ -302,6 +135,7 @@ func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, offsetSeed activeTargets: map[uint64]*Target{}, loops: map[uint64]loop{}, logger: logger, + metrics: metrics, httpOpts: options.HTTPClientOptions, noDefaultPort: options.NoDefaultPort, } @@ -309,7 +143,7 @@ func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, offsetSeed // Update the targets retrieval function for metadata to a new scrape cache. 
cache := opts.cache if cache == nil { - cache = newScrapeCache() + cache = newScrapeCache(metrics) } opts.target.SetMetadataStore(cache) @@ -336,9 +170,10 @@ func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, offsetSeed options.EnableMetadataStorage, opts.target, options.PassMetadataInContext, + metrics, ) } - targetScrapePoolTargetLimit.WithLabelValues(sp.config.JobName).Set(float64(sp.config.TargetLimit)) + sp.metrics.targetScrapePoolTargetLimit.WithLabelValues(sp.config.JobName).Set(float64(sp.config.TargetLimit)) return sp, nil } @@ -393,11 +228,11 @@ func (sp *scrapePool) stop() { sp.client.CloseIdleConnections() if sp.config != nil { - targetScrapePoolSyncsCounter.DeleteLabelValues(sp.config.JobName) - targetScrapePoolTargetLimit.DeleteLabelValues(sp.config.JobName) - targetScrapePoolTargetsAdded.DeleteLabelValues(sp.config.JobName) - targetSyncIntervalLength.DeleteLabelValues(sp.config.JobName) - targetSyncFailed.DeleteLabelValues(sp.config.JobName) + sp.metrics.targetScrapePoolSyncsCounter.DeleteLabelValues(sp.config.JobName) + sp.metrics.targetScrapePoolTargetLimit.DeleteLabelValues(sp.config.JobName) + sp.metrics.targetScrapePoolTargetsAdded.DeleteLabelValues(sp.config.JobName) + sp.metrics.targetSyncIntervalLength.DeleteLabelValues(sp.config.JobName) + sp.metrics.targetSyncFailed.DeleteLabelValues(sp.config.JobName) } } @@ -407,12 +242,12 @@ func (sp *scrapePool) stop() { func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error { sp.mtx.Lock() defer sp.mtx.Unlock() - targetScrapePoolReloads.Inc() + sp.metrics.targetScrapePoolReloads.Inc() start := time.Now() client, err := config_util.NewClientFromConfig(cfg.HTTPClientConfig, cfg.JobName, sp.httpOpts...) if err != nil { - targetScrapePoolReloadsFailed.Inc() + sp.metrics.targetScrapePoolReloadsFailed.Inc() return errors.Wrap(err, "error creating HTTP client") } @@ -421,7 +256,7 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error { oldClient := sp.client sp.client = client - targetScrapePoolTargetLimit.WithLabelValues(sp.config.JobName).Set(float64(sp.config.TargetLimit)) + sp.metrics.targetScrapePoolTargetLimit.WithLabelValues(sp.config.JobName).Set(float64(sp.config.TargetLimit)) var ( wg sync.WaitGroup @@ -449,7 +284,7 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error { oldLoop.disableEndOfRunStalenessMarkers() cache = oc } else { - cache = newScrapeCache() + cache = newScrapeCache(sp.metrics) } t := sp.activeTargets[fp] @@ -496,7 +331,7 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error { wg.Wait() oldClient.CloseIdleConnections() - targetReloadIntervalLength.WithLabelValues(interval.String()).Observe( + sp.metrics.targetReloadIntervalLength.WithLabelValues(interval.String()).Observe( time.Since(start).Seconds(), ) return nil @@ -520,7 +355,7 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) { for _, err := range failures { level.Error(sp.logger).Log("msg", "Creating target failed", "err", err) } - targetSyncFailed.WithLabelValues(sp.config.JobName).Add(float64(len(failures))) + sp.metrics.targetSyncFailed.WithLabelValues(sp.config.JobName).Add(float64(len(failures))) for _, t := range targets { // Replicate .Labels().IsEmpty() with a loop here to avoid generating garbage. 
nonEmpty := false @@ -539,10 +374,10 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) { sp.targetMtx.Unlock() sp.sync(all) - targetSyncIntervalLength.WithLabelValues(sp.config.JobName).Observe( + sp.metrics.targetSyncIntervalLength.WithLabelValues(sp.config.JobName).Observe( time.Since(start).Seconds(), ) - targetScrapePoolSyncsCounter.WithLabelValues(sp.config.JobName).Inc() + sp.metrics.targetScrapePoolSyncsCounter.WithLabelValues(sp.config.JobName).Inc() } // sync takes a list of potentially duplicated targets, deduplicates them, starts @@ -583,6 +418,7 @@ func (sp *scrapePool) sync(targets []*Target) { timeout: timeout, bodySizeLimit: bodySizeLimit, acceptHeader: acceptHeader(sp.config.ScrapeProtocols), + metrics: sp.metrics, } l := sp.newLoop(scrapeLoopOptions{ target: t, @@ -634,7 +470,7 @@ func (sp *scrapePool) sync(targets []*Target) { sp.targetMtx.Unlock() - targetScrapePoolTargetsAdded.WithLabelValues(sp.config.JobName).Set(float64(len(uniqueLoops))) + sp.metrics.targetScrapePoolTargetsAdded.WithLabelValues(sp.config.JobName).Set(float64(len(uniqueLoops))) forcedErr := sp.refreshTargetLimitErr() for _, l := range sp.loops { l.setForcedError(forcedErr) @@ -658,7 +494,7 @@ func (sp *scrapePool) refreshTargetLimitErr() error { return nil } if l := len(sp.activeTargets); l > int(sp.config.TargetLimit) { - targetScrapePoolExceededTargetLimit.Inc() + sp.metrics.targetScrapePoolExceededTargetLimit.Inc() return fmt.Errorf("target_limit exceeded (number of targets: %d, limit: %d)", l, sp.config.TargetLimit) } return nil @@ -806,6 +642,8 @@ type targetScraper struct { bodySizeLimit int64 acceptHeader string + + metrics *scrapeMetrics } var errBodySizeLimit = errors.New("body size limit exceeded") @@ -863,7 +701,7 @@ func (s *targetScraper) readResponse(ctx context.Context, resp *http.Response, w return "", err } if n >= s.bodySizeLimit { - targetScrapeExceededBodySizeLimit.Inc() + s.metrics.targetScrapeExceededBodySizeLimit.Inc() return "", errBodySizeLimit } return resp.Header.Get("Content-Type"), nil @@ -889,7 +727,7 @@ func (s *targetScraper) readResponse(ctx context.Context, resp *http.Response, w return "", err } if n >= s.bodySizeLimit { - targetScrapeExceededBodySizeLimit.Inc() + s.metrics.targetScrapeExceededBodySizeLimit.Inc() return "", errBodySizeLimit } return resp.Header.Get("Content-Type"), nil @@ -942,6 +780,8 @@ type scrapeLoop struct { reportExtraMetrics bool appendMetadataToWAL bool + + metrics *scrapeMetrics } // scrapeCache tracks mappings of exposed metric strings to label sets and @@ -969,6 +809,8 @@ type scrapeCache struct { metaMtx sync.Mutex metadata map[string]*metaEntry + + metrics *scrapeMetrics } // metaEntry holds meta information about a metric. @@ -984,13 +826,14 @@ func (m *metaEntry) size() int { return len(m.Help) + len(m.Unit) + len(m.Type) } -func newScrapeCache() *scrapeCache { +func newScrapeCache(metrics *scrapeMetrics) *scrapeCache { return &scrapeCache{ series: map[string]*cacheEntry{}, droppedSeries: map[string]*uint64{}, seriesCur: map[uint64]labels.Labels{}, seriesPrev: map[uint64]labels.Labels{}, metadata: map[string]*metaEntry{}, + metrics: metrics, } } @@ -1009,7 +852,7 @@ func (c *scrapeCache) iterDone(flushCache bool) { // since the last scrape, and allow an additional 1000 in case // initial scrapes all fail. 
flushCache = true - targetScrapeCacheFlushForced.Inc() + c.metrics.targetScrapeCacheFlushForced.Inc() } if flushCache { @@ -1213,6 +1056,7 @@ func newScrapeLoop(ctx context.Context, appendMetadataToWAL bool, target *Target, passMetadataInContext bool, + metrics *scrapeMetrics, ) *scrapeLoop { if l == nil { l = log.NewNopLogger() @@ -1221,7 +1065,7 @@ func newScrapeLoop(ctx context.Context, buffers = pool.New(1e3, 1e6, 3, func(sz int) interface{} { return make([]byte, 0, sz) }) } if cache == nil { - cache = newScrapeCache() + cache = newScrapeCache(metrics) } appenderCtx := ctx @@ -1256,6 +1100,7 @@ func newScrapeLoop(ctx context.Context, scrapeClassicHistograms: scrapeClassicHistograms, reportExtraMetrics: reportExtraMetrics, appendMetadataToWAL: appendMetadataToWAL, + metrics: metrics, } sl.ctx, sl.cancel = context.WithCancel(ctx) @@ -1335,7 +1180,7 @@ func (sl *scrapeLoop) scrapeAndReport(last, appendTime time.Time, errc chan<- er // Only record after the first scrape. if !last.IsZero() { - targetIntervalLength.WithLabelValues(sl.interval.String()).Observe( + sl.metrics.targetIntervalLength.WithLabelValues(sl.interval.String()).Observe( time.Since(last).Seconds(), ) } @@ -1676,7 +1521,7 @@ loop: // If any label limits is exceeded the scrape should fail. if err = verifyLabelLimits(lset, sl.labelLimits); err != nil { - targetScrapePoolExceededLabelLimits.Inc() + sl.metrics.targetScrapePoolExceededLabelLimits.Inc() break loop } @@ -1741,14 +1586,14 @@ loop: err = sampleLimitErr } // We only want to increment this once per scrape, so this is Inc'd outside the loop. - targetScrapeSampleLimit.Inc() + sl.metrics.targetScrapeSampleLimit.Inc() } if bucketLimitErr != nil { if err == nil { err = bucketLimitErr // If sample limit is hit, that error takes precedence. } // We only want to increment this once per scrape, so this is Inc'd outside the loop. 
- targetScrapeNativeHistogramBucketLimit.Inc() + sl.metrics.targetScrapeNativeHistogramBucketLimit.Inc() } if appErrs.numOutOfOrder > 0 { level.Warn(sl.l).Log("msg", "Error on ingesting out-of-order samples", "num_dropped", appErrs.numOutOfOrder) @@ -1792,17 +1637,17 @@ func (sl *scrapeLoop) checkAddError(ce *cacheEntry, met []byte, tp *int64, err e case storage.ErrOutOfOrderSample: appErrs.numOutOfOrder++ level.Debug(sl.l).Log("msg", "Out of order sample", "series", string(met)) - targetScrapeSampleOutOfOrder.Inc() + sl.metrics.targetScrapeSampleOutOfOrder.Inc() return false, nil case storage.ErrDuplicateSampleForTimestamp: appErrs.numDuplicates++ level.Debug(sl.l).Log("msg", "Duplicate sample for timestamp", "series", string(met)) - targetScrapeSampleDuplicate.Inc() + sl.metrics.targetScrapeSampleDuplicate.Inc() return false, nil case storage.ErrOutOfBounds: appErrs.numOutOfBounds++ level.Debug(sl.l).Log("msg", "Out of bounds metric", "series", string(met)) - targetScrapeSampleOutOfBounds.Inc() + sl.metrics.targetScrapeSampleOutOfBounds.Inc() return false, nil case errSampleLimit: // Keep on parsing output if we hit the limit, so we report the correct @@ -1826,7 +1671,7 @@ func (sl *scrapeLoop) checkAddExemplarError(err error, e exemplar.Exemplar, appE case storage.ErrOutOfOrderExemplar: appErrs.numExemplarOutOfOrder++ level.Debug(sl.l).Log("msg", "Out of order exemplar", "exemplar", fmt.Sprintf("%+v", e)) - targetScrapeExemplarOutOfOrder.Inc() + sl.metrics.targetScrapeExemplarOutOfOrder.Inc() return nil default: return err diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go index 78a479fb7e..672f466146 100644 --- a/scrape/scrape_test.go +++ b/scrape/scrape_test.go @@ -57,11 +57,18 @@ func TestMain(m *testing.M) { testutil.TolerantVerifyLeak(m) } +func newTestScrapeMetrics(t testing.TB) *scrapeMetrics { + reg := prometheus.NewRegistry() + metrics, err := newScrapeMetrics(reg) + require.NoError(t, err) + return metrics +} + func TestNewScrapePool(t *testing.T) { var ( app = &nopAppendable{} cfg = &config.ScrapeConfig{} - sp, _ = newScrapePool(cfg, app, 0, nil, &Options{}) + sp, _ = newScrapePool(cfg, app, 0, nil, &Options{}, newTestScrapeMetrics(t)) ) if a, ok := sp.appendable.(*nopAppendable); !ok || a != app { @@ -97,7 +104,7 @@ func TestDroppedTargetsList(t *testing.T) { }, }, } - sp, _ = newScrapePool(cfg, app, 0, nil, &Options{}) + sp, _ = newScrapePool(cfg, app, 0, nil, &Options{}, newTestScrapeMetrics(t)) expectedLabelSetString = "{__address__=\"127.0.0.1:9090\", __scrape_interval__=\"0s\", __scrape_timeout__=\"0s\", job=\"dropMe\"}" expectedLength = 2 ) @@ -117,7 +124,10 @@ func TestDroppedTargetsList(t *testing.T) { // TestDiscoveredLabelsUpdate checks that DiscoveredLabels are updated // even when new labels don't affect the target `hash`. func TestDiscoveredLabelsUpdate(t *testing.T) { - sp := &scrapePool{} + sp := &scrapePool{ + metrics: newTestScrapeMetrics(t), + } + // These are used when syncing so need this to avoid a panic. 
sp.config = &config.ScrapeConfig{ ScrapeInterval: model.Duration(1), @@ -184,6 +194,7 @@ func TestScrapePoolStop(t *testing.T) { loops: map[uint64]loop{}, cancel: func() {}, client: http.DefaultClient, + metrics: newTestScrapeMetrics(t), } var mtx sync.Mutex stopped := map[uint64]bool{} @@ -262,6 +273,7 @@ func TestScrapePoolReload(t *testing.T) { } return l } + sp := &scrapePool{ appendable: &nopAppendable{}, activeTargets: map[uint64]*Target{}, @@ -269,6 +281,7 @@ func TestScrapePoolReload(t *testing.T) { newLoop: newLoop, logger: nil, client: http.DefaultClient, + metrics: newTestScrapeMetrics(t), } // Reloading a scrape pool with a new scrape configuration must stop all scrape @@ -352,6 +365,7 @@ func TestScrapePoolReloadPreserveRelabeledIntervalTimeout(t *testing.T) { newLoop: newLoop, logger: nil, client: http.DefaultClient, + metrics: newTestScrapeMetrics(t), } err := sp.reload(reloadCfg) @@ -381,6 +395,7 @@ func TestScrapePoolTargetLimit(t *testing.T) { newLoop: newLoop, logger: log.NewNopLogger(), client: http.DefaultClient, + metrics: newTestScrapeMetrics(t), } tgs := []*targetgroup.Group{} @@ -489,7 +504,7 @@ func TestScrapePoolTargetLimit(t *testing.T) { func TestScrapePoolAppender(t *testing.T) { cfg := &config.ScrapeConfig{} app := &nopAppendable{} - sp, _ := newScrapePool(cfg, app, 0, nil, &Options{}) + sp, _ := newScrapePool(cfg, app, 0, nil, &Options{}, newTestScrapeMetrics(t)) loop := sp.newLoop(scrapeLoopOptions{ target: &Target{}, @@ -545,7 +560,7 @@ func TestScrapePoolRaces(t *testing.T) { newConfig := func() *config.ScrapeConfig { return &config.ScrapeConfig{ScrapeInterval: interval, ScrapeTimeout: timeout} } - sp, _ := newScrapePool(newConfig(), &nopAppendable{}, 0, nil, &Options{}) + sp, _ := newScrapePool(newConfig(), &nopAppendable{}, 0, nil, &Options{}, newTestScrapeMetrics(t)) tgts := []*targetgroup.Group{ { Targets: []model.LabelSet{ @@ -595,6 +610,7 @@ func TestScrapePoolScrapeLoopsStarted(t *testing.T) { newLoop: newLoop, logger: nil, client: http.DefaultClient, + metrics: newTestScrapeMetrics(t), } tgs := []*targetgroup.Group{ @@ -643,6 +659,7 @@ func TestScrapeLoopStopBeforeRun(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) // The scrape pool synchronizes on stopping scrape loops. However, new scrape @@ -716,6 +733,7 @@ func TestScrapeLoopStop(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) // Terminate loop after 2 scrapes. 
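The tests above obtain their scrapeMetrics through newTestScrapeMetrics, so every test owns a fresh registry: the same metric names can be registered once per test without tripping duplicate-registration errors, and assertions only observe that test's counters. A short sketch of the pattern outside this file, with illustrative helper and metric names:

package example

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/require"
)

func newTestCounter(t testing.TB, reg prometheus.Registerer) prometheus.Counter {
	t.Helper()
	c := prometheus.NewCounter(prometheus.CounterOpts{
		Name: "example_events_total",
		Help: "Illustrative counter.",
	})
	require.NoError(t, reg.Register(c))
	return c
}

func TestIsolatedRegistries(t *testing.T) {
	// Two fresh registries: the same metric name registers cleanly in both,
	// which would fail with AlreadyRegisteredError on one shared registry.
	regA, regB := prometheus.NewRegistry(), prometheus.NewRegistry()
	a := newTestCounter(t, regA)
	_ = newTestCounter(t, regB)

	a.Inc()
	require.Equal(t, 1.0, testutil.ToFloat64(a))
}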
@@ -793,6 +811,7 @@ func TestScrapeLoopRun(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) // The loop must terminate during the initial offset if the context @@ -849,6 +868,7 @@ func TestScrapeLoopRun(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) go func() { @@ -909,6 +929,7 @@ func TestScrapeLoopForcedErr(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) forcedErr := fmt.Errorf("forced err") @@ -945,7 +966,7 @@ func TestScrapeLoopMetadata(t *testing.T) { var ( signal = make(chan struct{}) scraper = &testScraper{} - cache = newScrapeCache() + cache = newScrapeCache(newTestScrapeMetrics(t)) ) defer close(signal) @@ -968,6 +989,7 @@ func TestScrapeLoopMetadata(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) defer cancel() @@ -1026,6 +1048,7 @@ func simpleTestScrapeLoop(t testing.TB) (context.Context, *scrapeLoop) { false, nil, false, + newTestScrapeMetrics(t), ) t.Cleanup(func() { cancel() }) @@ -1087,6 +1110,7 @@ func TestScrapeLoopFailWithInvalidLabelsAfterRelabel(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) slApp := sl.appender(ctx) @@ -1166,6 +1190,7 @@ func TestScrapeLoopRunCreatesStaleMarkersOnFailedScrape(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) // Succeed once, several failures, then stop. numScrapes := 0 @@ -1230,6 +1255,7 @@ func TestScrapeLoopRunCreatesStaleMarkersOnParseFailure(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) // Succeed once, several failures, then stop. @@ -1297,6 +1323,7 @@ func TestScrapeLoopCache(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) numScrapes := 0 @@ -1381,6 +1408,7 @@ func TestScrapeLoopCacheMemoryExhaustionProtection(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) numScrapes := 0 @@ -1496,6 +1524,7 @@ func TestScrapeLoopAppend(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) now := time.Now() @@ -1583,7 +1612,8 @@ func TestScrapeLoopAppendForConflictingPrefixedLabels(t *testing.T) { return mutateSampleLabels(l, &Target{labels: labels.FromStrings(tc.targetLabels...)}, false, nil) }, nil, - func(ctx context.Context) storage.Appender { return app }, nil, 0, true, 0, 0, nil, 0, 0, false, false, false, nil, false, + func(ctx context.Context) storage.Appender { return app }, + nil, 0, true, 0, 0, nil, 0, 0, false, false, false, nil, false, newTestScrapeMetrics(t), ) slApp := sl.appender(context.Background()) _, _, _, err := sl.append(slApp, []byte(tc.exposedLabels), "", time.Date(2000, 1, 1, 1, 0, 0, 0, time.UTC)) @@ -1623,6 +1653,7 @@ func TestScrapeLoopAppendCacheEntryButErrNotFound(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) fakeRef := storage.SeriesRef(1) @@ -1682,11 +1713,12 @@ func TestScrapeLoopAppendSampleLimit(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) // Get the value of the Counter before performing the append. beforeMetric := dto.Metric{} - err := targetScrapeSampleLimit.Write(&beforeMetric) + err := sl.metrics.targetScrapeSampleLimit.Write(&beforeMetric) require.NoError(t, err) beforeMetricValue := beforeMetric.GetCounter().GetValue() @@ -1705,7 +1737,7 @@ func TestScrapeLoopAppendSampleLimit(t *testing.T) { // Check that the Counter has been incremented a single time for the scrape, // not multiple times for each sample. 
metric := dto.Metric{} - err = targetScrapeSampleLimit.Write(&metric) + err = sl.metrics.targetScrapeSampleLimit.Write(&metric) require.NoError(t, err) value := metric.GetCounter().GetValue() @@ -1760,10 +1792,11 @@ func TestScrapeLoop_HistogramBucketLimit(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) metric := dto.Metric{} - err := targetScrapeNativeHistogramBucketLimit.Write(&metric) + err := sl.metrics.targetScrapeNativeHistogramBucketLimit.Write(&metric) require.NoError(t, err) beforeMetricValue := metric.GetCounter().GetValue() @@ -1801,7 +1834,7 @@ func TestScrapeLoop_HistogramBucketLimit(t *testing.T) { require.Equal(t, 3, added) require.Equal(t, 3, seriesAdded) - err = targetScrapeNativeHistogramBucketLimit.Write(&metric) + err = sl.metrics.targetScrapeNativeHistogramBucketLimit.Write(&metric) require.NoError(t, err) metricValue := metric.GetCounter().GetValue() require.Equal(t, beforeMetricValue, metricValue) @@ -1827,7 +1860,7 @@ func TestScrapeLoop_HistogramBucketLimit(t *testing.T) { require.Equal(t, 3, added) require.Equal(t, 0, seriesAdded) - err = targetScrapeNativeHistogramBucketLimit.Write(&metric) + err = sl.metrics.targetScrapeNativeHistogramBucketLimit.Write(&metric) require.NoError(t, err) metricValue = metric.GetCounter().GetValue() require.Equal(t, beforeMetricValue+1, metricValue) @@ -1859,6 +1892,7 @@ func TestScrapeLoop_ChangingMetricString(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) now := time.Now() @@ -1908,6 +1942,7 @@ func TestScrapeLoopAppendStaleness(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) now := time.Now() @@ -1960,6 +1995,7 @@ func TestScrapeLoopAppendNoStalenessIfTimestamp(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) now := time.Now() @@ -2286,6 +2322,7 @@ metric: < false, nil, false, + newTestScrapeMetrics(t), ) now := time.Now() @@ -2374,6 +2411,7 @@ func TestScrapeLoopAppendExemplarSeries(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) now := time.Now() @@ -2427,6 +2465,7 @@ func TestScrapeLoopRunReportsTargetDownOnScrapeError(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) scraper.scrapeFunc = func(ctx context.Context, w io.Writer) error { @@ -2464,6 +2503,7 @@ func TestScrapeLoopRunReportsTargetDownOnInvalidUTF8(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) scraper.scrapeFunc = func(ctx context.Context, w io.Writer) error { @@ -2514,6 +2554,7 @@ func TestScrapeLoopAppendGracefullyIfAmendOrOutOfOrderOrOutOfBounds(t *testing.T false, nil, false, + newTestScrapeMetrics(t), ) now := time.Unix(1, 0) @@ -2560,6 +2601,7 @@ func TestScrapeLoopOutOfBoundsTimeError(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) now := time.Now().Add(20 * time.Minute) @@ -2755,6 +2797,7 @@ func TestTargetScraperBodySizeLimit(t *testing.T) { client: http.DefaultClient, bodySizeLimit: bodySizeLimit, acceptHeader: acceptHeader(config.DefaultGlobalConfig.ScrapeProtocols), + metrics: newTestScrapeMetrics(t), } var buf bytes.Buffer @@ -2849,6 +2892,7 @@ func TestScrapeLoop_RespectTimestamps(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) now := time.Now() @@ -2891,6 +2935,7 @@ func TestScrapeLoop_DiscardTimestamps(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) now := time.Now() @@ -2932,6 +2977,7 @@ func TestScrapeLoopDiscardDuplicateLabels(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) defer cancel() @@ -2991,6 +3037,7 @@ func TestScrapeLoopDiscardUnnamedMetrics(t *testing.T) { false, 
nil, false, + newTestScrapeMetrics(t), ) defer cancel() @@ -3083,7 +3130,7 @@ func TestReuseScrapeCache(t *testing.T) { ScrapeInterval: model.Duration(5 * time.Second), MetricsPath: "/metrics", } - sp, _ = newScrapePool(cfg, app, 0, nil, &Options{}) + sp, _ = newScrapePool(cfg, app, 0, nil, &Options{}, newTestScrapeMetrics(t)) t1 = &Target{ discoveredLabels: labels.FromStrings("labelNew", "nameNew", "labelNew1", "nameNew1", "labelNew2", "nameNew2"), } @@ -3255,6 +3302,7 @@ func TestScrapeAddFast(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) defer cancel() @@ -3275,7 +3323,7 @@ func TestScrapeAddFast(t *testing.T) { require.NoError(t, slApp.Commit()) } -func TestReuseCacheRace(*testing.T) { +func TestReuseCacheRace(t *testing.T) { var ( app = &nopAppendable{} cfg = &config.ScrapeConfig{ @@ -3284,7 +3332,7 @@ func TestReuseCacheRace(*testing.T) { ScrapeInterval: model.Duration(5 * time.Second), MetricsPath: "/metrics", } - sp, _ = newScrapePool(cfg, app, 0, nil, &Options{}) + sp, _ = newScrapePool(cfg, app, 0, nil, &Options{}, newTestScrapeMetrics(t)) t1 = &Target{ discoveredLabels: labels.FromStrings("labelNew", "nameNew"), } @@ -3309,7 +3357,7 @@ func TestReuseCacheRace(*testing.T) { func TestCheckAddError(t *testing.T) { var appErrs appendErrors - sl := scrapeLoop{l: log.NewNopLogger()} + sl := scrapeLoop{l: log.NewNopLogger(), metrics: newTestScrapeMetrics(t)} sl.checkAddError(nil, nil, nil, storage.ErrOutOfOrderSample, nil, nil, &appErrs) require.Equal(t, 1, appErrs.numOutOfOrder) } @@ -3342,6 +3390,7 @@ func TestScrapeReportSingleAppender(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) numScrapes := 0 @@ -3412,7 +3461,7 @@ func TestScrapeReportLimit(t *testing.T) { })) defer ts.Close() - sp, err := newScrapePool(cfg, s, 0, nil, &Options{}) + sp, err := newScrapePool(cfg, s, 0, nil, &Options{}, newTestScrapeMetrics(t)) require.NoError(t, err) defer sp.stop() @@ -3545,6 +3594,7 @@ func TestScrapeLoopLabelLimit(t *testing.T) { false, nil, false, + newTestScrapeMetrics(t), ) slApp := sl.appender(context.Background()) @@ -3583,7 +3633,7 @@ func TestTargetScrapeIntervalAndTimeoutRelabel(t *testing.T) { }, }, } - sp, _ := newScrapePool(config, &nopAppendable{}, 0, nil, &Options{}) + sp, _ := newScrapePool(config, &nopAppendable{}, 0, nil, &Options{}, newTestScrapeMetrics(t)) tgts := []*targetgroup.Group{ { Targets: []model.LabelSet{{model.AddressLabel: "127.0.0.1:9090"}}, From 26fa2e8356cb80c00d2241d196b27479ad1bb580 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Tue, 17 Oct 2023 17:31:21 +0000 Subject: [PATCH 02/17] TSDB: Pre-size buffer to read samples from WAL When reading the WAL this method is called with buffers from a pool, on multiple goroutines. Pre-allocating sufficient size avoids slow growth and many reallocations in `append`. Signed-off-by: Bryan Boreham --- tsdb/record/record.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tsdb/record/record.go b/tsdb/record/record.go index 4cd51d46c0..442e6cd8cb 100644 --- a/tsdb/record/record.go +++ b/tsdb/record/record.go @@ -304,6 +304,10 @@ func (d *Decoder) Samples(rec []byte, samples []RefSample) ([]RefSample, error) baseRef = dec.Be64() baseTime = dec.Be64int64() ) + // Allow 1 byte for each varint and 8 for the value; the output slice must be at least that big. 
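Each encoded sample in the record occupies at least one byte for the ref delta varint, one byte for the timestamp delta varint, and eight bytes for the value, so dec.Len()/(1+1+8) is an upper bound on the number of samples and therefore a sufficient capacity to pre-allocate; without it, append grows the slice repeatedly while the WAL is replayed on multiple goroutines. A generic sketch of the same pre-sizing idea, using fixed-width values instead of the WAL encoding and illustrative names:

package example

import "encoding/binary"

// decodeU64s decodes a flat sequence of 8-byte big-endian values. Each entry
// occupies exactly 8 bytes, so len(buf)/8 is the exact count; with varints,
// as in the WAL record above, the same division gives an upper bound that is
// still a safe capacity.
func decodeU64s(buf []byte, out []uint64) []uint64 {
	if n := len(buf) / 8; cap(out) < n {
		// Pre-size once instead of letting append reallocate repeatedly.
		out = make([]uint64, 0, n)
	}
	for len(buf) >= 8 {
		out = append(out, binary.BigEndian.Uint64(buf[:8]))
		buf = buf[8:]
	}
	return out
}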
+ if minSize := dec.Len() / (1 + 1 + 8); cap(samples) < minSize { + samples = make([]RefSample, 0, minSize) + } for len(dec.B) > 0 && dec.Err() == nil { dref := dec.Varint64() dtime := dec.Varint64() From 8fededf6adc1c7c1bd9598329627556b3091f22e Mon Sep 17 00:00:00 2001 From: Marc Tuduri Date: Fri, 28 Jul 2023 10:49:36 +0200 Subject: [PATCH 03/17] promql(histograms): Change sample total calculation for histograms Signed-off-by: Marc Tuduri --- model/histogram/float_histogram.go | 27 ++++++++++ model/histogram/float_histogram_test.go | 52 +++++++++++++++++++ promql/engine.go | 69 +++++++++++++++---------- promql/value.go | 35 ++++++++++++- 4 files changed, 154 insertions(+), 29 deletions(-) diff --git a/model/histogram/float_histogram.go b/model/histogram/float_histogram.go index 6ee72f24e4..3c394c85e7 100644 --- a/model/histogram/float_histogram.go +++ b/model/histogram/float_histogram.go @@ -338,6 +338,33 @@ func (h *FloatHistogram) Equals(h2 *FloatHistogram) bool { return true } +// Size returns the size of the whole fields histogram in bytes. +// NOTE: this is only valid for 64 bit architectures. +func (fh *FloatHistogram) Size() int { + // Size of each slice separately + posSpanSize := len(fh.PositiveSpans) * 8 // 8 bytes (int32 + uint32) + negSpanSize := len(fh.NegativeSpans) * 8 // 8 bytes (int32 + uint32) + posBucketSize := len(fh.PositiveBuckets) * 8 // 8 bytes (float64) + negBucketSize := len(fh.NegativeBuckets) * 8 // 8 bytes (float64) + + // Total size of the struct + + // fh is 8 bytes + // fh.CounterResetHint is 1 byte + // fh.Schema is 4 bytes + // fh.ZeroThreshold is 8 bytes + // fh.ZeroCount is 8 bytes + // fh.Count is 8 bytes + // fh.Sum is 8 bytes + // fh.PositiveSpans is 24 bytes + // fh.NegativeSpans is 24 bytes + // fh.PositiveBuckets is 24 bytes + // fh.NegativeBuckets is 24 bytes + structSize := 141 + + return structSize + posSpanSize + negSpanSize + posBucketSize + negBucketSize +} + // Compact eliminates empty buckets at the beginning and end of each span, then // merges spans that are consecutive or at most maxEmptyBuckets apart, and // finally splits spans that contain more consecutive empty buckets than diff --git a/model/histogram/float_histogram_test.go b/model/histogram/float_histogram_test.go index ae8ba3ea2e..7f5c29e3bf 100644 --- a/model/histogram/float_histogram_test.go +++ b/model/histogram/float_histogram_test.go @@ -2341,3 +2341,55 @@ func TestFloatHistogramEquals(t *testing.T) { notEquals(h1, *hNegBucketNaN) equals(*hNegBucketNaN, *hNegBucketNaN) } + +func TestFloatHistogramSize(t *testing.T) { + cases := []struct { + name string + fh *FloatHistogram + expected int + }{ + { + "without spans and buckets", + &FloatHistogram{ // 8 bytes + CounterResetHint: 0, // 1 byte + Schema: 1, // 4 bytes + ZeroThreshold: 0.01, // 8 bytes + ZeroCount: 5.5, // 8 bytes + Count: 3493.3, // 8 bytes + Sum: 2349209.324, // 8 bytes + PositiveSpans: nil, // 24 bytes + PositiveBuckets: nil, // 24 bytes + NegativeSpans: nil, // 24 bytes + NegativeBuckets: nil, // 24 bytes + }, + 8 + 1 + 4 + 8 + 8 + 8 + 8 + 24 + 24 + 24 + 24, + }, + { + "complete struct", + &FloatHistogram{ // 8 bytes + CounterResetHint: 0, // 1 byte + Schema: 1, // 4 bytes + ZeroThreshold: 0.01, // 8 bytes + ZeroCount: 5.5, // 8 bytes + Count: 3493.3, // 8 bytes + Sum: 2349209.324, // 8 bytes + PositiveSpans: []Span{ // 24 bytes + {-2, 1}, // 2 * 4 bytes + {2, 3}, // 2 * 4 bytes + }, + PositiveBuckets: []float64{1, 3.3, 4.2, 0.1}, // 24 bytes + 4 * 8 bytes + NegativeSpans: []Span{ // 24 bytes + {3, 2}, // 2 * 
4 bytes + {3, 2}}, // 2 * 4 bytes + NegativeBuckets: []float64{3.1, 3, 1.234e5, 1000}, // 24 bytes + 4 * 8 bytes + }, + 8 + 1 + 4 + 8 + 8 + 8 + 8 + (24 + 2*4 + 2*4) + (24 + 2*4 + 2*4) + (24 + 4*8) + (24 + 4*8), + }, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + require.Equal(t, c.expected, c.fh.Size()) + }) + } +} diff --git a/promql/engine.go b/promql/engine.go index 161aa85acb..120964ca0e 100644 --- a/promql/engine.go +++ b/promql/engine.go @@ -1224,10 +1224,11 @@ func (ev *evaluator) rangeEval(prepSeries func(labels.Labels, *EvalSeriesHelper) enh.Out = result[:0] // Reuse result vector. warnings.Merge(ws) - ev.currentSamples += len(result) + vecNumSamples := result.TotalSamples() + ev.currentSamples += vecNumSamples // When we reset currentSamples to tempNumSamples during the next iteration of the loop it also // needs to include the samples from the result here, as they're still in memory. - tempNumSamples += len(result) + tempNumSamples += vecNumSamples ev.samplesStats.UpdatePeak(ev.currentSamples) if ev.currentSamples > ev.maxSamples { @@ -1323,12 +1324,10 @@ func (ev *evaluator) evalSubquery(subq *parser.SubqueryExpr) (*parser.MatrixSele Range: subq.Range, VectorSelector: vs, } - totalSamples := 0 for _, s := range mat { - totalSamples += len(s.Floats) + len(s.Histograms) vs.Series = append(vs.Series, NewStorageSeries(s)) } - return ms, totalSamples, ws + return ms, mat.TotalSamples(), ws } // eval evaluates the given expression as the given AST expression node requires. @@ -1470,7 +1469,7 @@ func (ev *evaluator) eval(expr parser.Expr) (parser.Value, annotations.Annotatio it := storage.NewBuffer(selRange) var chkIter chunkenc.Iterator for i, s := range selVS.Series { - ev.currentSamples -= len(floats) + len(histograms) + ev.currentSamples -= len(floats) + totalHPointSize(histograms) if floats != nil { floats = floats[:0] } @@ -1514,7 +1513,7 @@ func (ev *evaluator) eval(expr parser.Expr) (parser.Value, annotations.Annotatio // Make the function call. outVec, annos := call(inArgs, e.Args, enh) warnings.Merge(annos) - ev.samplesStats.IncrementSamplesAtStep(step, int64(len(floats)+len(histograms))) + ev.samplesStats.IncrementSamplesAtStep(step, int64(len(floats)+totalHPointSize(histograms))) enh.Out = outVec[:0] if len(outVec) > 0 { @@ -1533,10 +1532,11 @@ func (ev *evaluator) eval(expr parser.Expr) (parser.Value, annotations.Annotatio // Only buffer stepRange milliseconds from the second step on. 
it.ReduceDelta(stepRange) } - if len(ss.Floats)+len(ss.Histograms) > 0 { - if ev.currentSamples+len(ss.Floats)+len(ss.Histograms) <= ev.maxSamples { + histSamples := totalHPointSize(ss.Histograms) + if len(ss.Floats)+histSamples > 0 { + if ev.currentSamples+len(ss.Floats)+histSamples <= ev.maxSamples { mat = append(mat, ss) - ev.currentSamples += len(ss.Floats) + len(ss.Histograms) + ev.currentSamples += len(ss.Floats) + histSamples } else { ev.error(ErrTooManySamples(env)) } @@ -1545,7 +1545,7 @@ func (ev *evaluator) eval(expr parser.Expr) (parser.Value, annotations.Annotatio } ev.samplesStats.UpdatePeak(ev.currentSamples) - ev.currentSamples -= len(floats) + len(histograms) + ev.currentSamples -= len(floats) + totalHPointSize(histograms) putFPointSlice(floats) putHPointSlice(histograms) @@ -1692,14 +1692,18 @@ func (ev *evaluator) eval(expr parser.Expr) (parser.Value, annotations.Annotatio ss.Floats = getFPointSlice(numSteps) } ss.Floats = append(ss.Floats, FPoint{F: f, T: ts}) + ev.currentSamples++ + ev.samplesStats.IncrementSamplesAtStep(step, 1) } else { if ss.Histograms == nil { ss.Histograms = getHPointSlice(numSteps) } - ss.Histograms = append(ss.Histograms, HPoint{H: h, T: ts}) + point := HPoint{H: h, T: ts} + ss.Histograms = append(ss.Histograms, point) + histSize := point.histogramSize() + ev.currentSamples += histSize + ev.samplesStats.IncrementSamplesAtStep(step, int64(histSize)) } - ev.samplesStats.IncrementSamplesAtStep(step, 1) - ev.currentSamples++ } else { ev.error(ErrTooManySamples(env)) } @@ -1807,13 +1811,15 @@ func (ev *evaluator) eval(expr parser.Expr) (parser.Value, annotations.Annotatio T: ts, F: mat[i].Floats[0].F, }) + ev.currentSamples++ } else { - mat[i].Histograms = append(mat[i].Histograms, HPoint{ + point := HPoint{ T: ts, H: mat[i].Histograms[0].H, - }) + } + mat[i].Histograms = append(mat[i].Histograms, point) + ev.currentSamples += point.histogramSize() } - ev.currentSamples++ if ev.currentSamples > ev.maxSamples { ev.error(ErrTooManySamples(env)) } @@ -1857,9 +1863,14 @@ func (ev *evaluator) rangeEvalTimestampFunctionOverVectorSelector(vs *parser.Vec F: f, H: h, }) - + histSize := 0 + if h != nil { + histSize := h.Size() / 16 // 16 bytes per sample. + ev.currentSamples += histSize + } ev.currentSamples++ - ev.samplesStats.IncrementSamplesAtTimestamp(enh.Ts, 1) + + ev.samplesStats.IncrementSamplesAtTimestamp(enh.Ts, int64(1+histSize)) if ev.currentSamples > ev.maxSamples { ev.error(ErrTooManySamples(env)) } @@ -1981,10 +1992,10 @@ func (ev *evaluator) matrixSelector(node *parser.MatrixSelector) (Matrix, annota } ss.Floats, ss.Histograms = ev.matrixIterSlice(it, mint, maxt, nil, nil) - totalLen := int64(len(ss.Floats)) + int64(len(ss.Histograms)) - ev.samplesStats.IncrementSamplesAtTimestamp(ev.startTimestamp, totalLen) + totalSize := int64(len(ss.Floats)) + int64(totalHPointSize(ss.Histograms)) + ev.samplesStats.IncrementSamplesAtTimestamp(ev.startTimestamp, totalSize) - if totalLen > 0 { + if totalSize > 0 { matrix = append(matrix, ss) } else { putFPointSlice(ss.Floats) @@ -2040,13 +2051,13 @@ func (ev *evaluator) matrixIterSlice( var drop int for drop = 0; histograms[drop].T < mint; drop++ { // nolint:revive } - ev.currentSamples -= drop copy(histograms, histograms[drop:]) histograms = histograms[:len(histograms)-drop] + ev.currentSamples -= totalHPointSize(histograms) // Only append points with timestamps after the last timestamp we have. 
mintHistograms = histograms[len(histograms)-1].T + 1 } else { - ev.currentSamples -= len(histograms) + ev.currentSamples -= totalHPointSize(histograms) if histograms != nil { histograms = histograms[:0] } @@ -2075,11 +2086,12 @@ loop: if ev.currentSamples >= ev.maxSamples { ev.error(ErrTooManySamples(env)) } - ev.currentSamples++ + point := HPoint{T: t, H: h} if histograms == nil { histograms = getHPointSlice(16) } - histograms = append(histograms, HPoint{T: t, H: h}) + histograms = append(histograms, point) + ev.currentSamples += point.histogramSize() } case chunkenc.ValFloat: t, f := buf.At() @@ -2110,8 +2122,9 @@ loop: if histograms == nil { histograms = getHPointSlice(16) } - histograms = append(histograms, HPoint{T: t, H: h}) - ev.currentSamples++ + point := HPoint{T: t, H: h} + histograms = append(histograms, point) + ev.currentSamples += point.histogramSize() } case chunkenc.ValFloat: t, f := it.At() diff --git a/promql/value.go b/promql/value.go index 68e37f37ee..5fa339ad57 100644 --- a/promql/value.go +++ b/promql/value.go @@ -168,6 +168,23 @@ func (p HPoint) MarshalJSON() ([]byte, error) { return json.Marshal([...]interface{}{float64(p.T) / 1000, h}) } +// histogramSize returns the size of the HPoint compared to the size of an FPoint. +// The total size is calculated considering the histogram timestamp (p.T - 8 bytes), +// and then a number of bytes in the histogram. +// This sum is divided by 16, as samples are 16 bytes. +func (p HPoint) histogramSize() int { + return (p.H.Size() + 8) / 16 +} + +// totalHPointSize returns the total number of samples in the given slice of HPoints. +func totalHPointSize(histograms []HPoint) int { + var total int + for _, h := range histograms { + total += h.histogramSize() + } + return total +} + // Sample is a single sample belonging to a metric. It represents either a float // sample or a histogram sample. If H is nil, it is a float sample. Otherwise, // it is a histogram sample. @@ -226,6 +243,21 @@ func (vec Vector) String() string { return strings.Join(entries, "\n") } +// TotalSamples returns the total number of samples in the series within a vector. +// Float samples have a weight of 1 in this number, while histogram samples have a higher +// weight according to their size compared with the size of a float sample. +// See HPoint.histogramSize for details. +func (vec Vector) TotalSamples() int { + numSamples := 0 + for _, sample := range vec { + numSamples++ + if sample.H != nil { + numSamples += sample.H.Size() / 16 + } + } + return numSamples +} + // ContainsSameLabelset checks if a vector has samples with the same labelset // Such a behavior is semantically undefined // https://github.com/prometheus/prometheus/issues/4562 @@ -264,10 +296,11 @@ func (m Matrix) String() string { } // TotalSamples returns the total number of samples in the series within a matrix. +// It takes into account the number of samples in the histograms using the histogramSize method. 
func (m Matrix) TotalSamples() int { numSamples := 0 for _, series := range m { - numSamples += len(series.Floats) + len(series.Histograms) + numSamples += len(series.Floats) + totalHPointSize(series.Histograms) } return numSamples } From af7c31ee10fa11d2d39425979204a82eb4c838d1 Mon Sep 17 00:00:00 2001 From: Marc Tuduri Date: Wed, 18 Oct 2023 11:49:56 +0200 Subject: [PATCH 04/17] PR feedback Signed-off-by: Marc Tuduri --- model/histogram/float_histogram.go | 39 +++++++++++++------------ model/histogram/float_histogram_test.go | 4 +-- promql/engine.go | 8 ++--- promql/value.go | 12 ++++---- 4 files changed, 33 insertions(+), 30 deletions(-) diff --git a/model/histogram/float_histogram.go b/model/histogram/float_histogram.go index 3c394c85e7..3cb7fe7da3 100644 --- a/model/histogram/float_histogram.go +++ b/model/histogram/float_histogram.go @@ -338,29 +338,30 @@ func (h *FloatHistogram) Equals(h2 *FloatHistogram) bool { return true } -// Size returns the size of the whole fields histogram in bytes. +// Size returns the total size of the FloatHistogram, which includes the size of the pointer +// to FloatHistogram, all its fields, and all elements contained in slices. // NOTE: this is only valid for 64 bit architectures. func (fh *FloatHistogram) Size() int { - // Size of each slice separately - posSpanSize := len(fh.PositiveSpans) * 8 // 8 bytes (int32 + uint32) - negSpanSize := len(fh.NegativeSpans) * 8 // 8 bytes (int32 + uint32) - posBucketSize := len(fh.PositiveBuckets) * 8 // 8 bytes (float64) - negBucketSize := len(fh.NegativeBuckets) * 8 // 8 bytes (float64) + // Size of each slice separately. + posSpanSize := len(fh.PositiveSpans) * 8 // 8 bytes (int32 + uint32). + negSpanSize := len(fh.NegativeSpans) * 8 // 8 bytes (int32 + uint32). + posBucketSize := len(fh.PositiveBuckets) * 8 // 8 bytes (float64). + negBucketSize := len(fh.NegativeBuckets) * 8 // 8 bytes (float64). - // Total size of the struct + // Total size of the struct. - // fh is 8 bytes - // fh.CounterResetHint is 1 byte - // fh.Schema is 4 bytes - // fh.ZeroThreshold is 8 bytes - // fh.ZeroCount is 8 bytes - // fh.Count is 8 bytes - // fh.Sum is 8 bytes - // fh.PositiveSpans is 24 bytes - // fh.NegativeSpans is 24 bytes - // fh.PositiveBuckets is 24 bytes - // fh.NegativeBuckets is 24 bytes - structSize := 141 + // fh is 8 bytes. + // fh.CounterResetHint is 4 bytes (1 byte bool + 3 bytes padding). + // fh.Schema is 4 bytes. + // fh.ZeroThreshold is 8 bytes. + // fh.ZeroCount is 8 bytes. + // fh.Count is 8 bytes. + // fh.Sum is 8 bytes. + // fh.PositiveSpans is 24 bytes. + // fh.NegativeSpans is 24 bytes. + // fh.PositiveBuckets is 24 bytes. + // fh.NegativeBuckets is 24 bytes. 
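+	// Summed up, that is 8 + 4 + 4 + 8 + 8 + 8 + 8 + 4*24 = 144 bytes.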
+ structSize := 144 return structSize + posSpanSize + negSpanSize + posBucketSize + negBucketSize } diff --git a/model/histogram/float_histogram_test.go b/model/histogram/float_histogram_test.go index 7f5c29e3bf..d40cecd7a3 100644 --- a/model/histogram/float_histogram_test.go +++ b/model/histogram/float_histogram_test.go @@ -2362,7 +2362,7 @@ func TestFloatHistogramSize(t *testing.T) { NegativeSpans: nil, // 24 bytes NegativeBuckets: nil, // 24 bytes }, - 8 + 1 + 4 + 8 + 8 + 8 + 8 + 24 + 24 + 24 + 24, + 8 + 4 + 4 + 8 + 8 + 8 + 8 + 24 + 24 + 24 + 24, }, { "complete struct", @@ -2383,7 +2383,7 @@ func TestFloatHistogramSize(t *testing.T) { {3, 2}}, // 2 * 4 bytes NegativeBuckets: []float64{3.1, 3, 1.234e5, 1000}, // 24 bytes + 4 * 8 bytes }, - 8 + 1 + 4 + 8 + 8 + 8 + 8 + (24 + 2*4 + 2*4) + (24 + 2*4 + 2*4) + (24 + 4*8) + (24 + 4*8), + 8 + 4 + 4 + 8 + 8 + 8 + 8 + (24 + 2*4 + 2*4) + (24 + 2*4 + 2*4) + (24 + 4*8) + (24 + 4*8), }, } diff --git a/promql/engine.go b/promql/engine.go index 120964ca0e..4e0769f993 100644 --- a/promql/engine.go +++ b/promql/engine.go @@ -1700,7 +1700,7 @@ func (ev *evaluator) eval(expr parser.Expr) (parser.Value, annotations.Annotatio } point := HPoint{H: h, T: ts} ss.Histograms = append(ss.Histograms, point) - histSize := point.histogramSize() + histSize := point.size() ev.currentSamples += histSize ev.samplesStats.IncrementSamplesAtStep(step, int64(histSize)) } @@ -1818,7 +1818,7 @@ func (ev *evaluator) eval(expr parser.Expr) (parser.Value, annotations.Annotatio H: mat[i].Histograms[0].H, } mat[i].Histograms = append(mat[i].Histograms, point) - ev.currentSamples += point.histogramSize() + ev.currentSamples += point.size() } if ev.currentSamples > ev.maxSamples { ev.error(ErrTooManySamples(env)) @@ -2091,7 +2091,7 @@ loop: histograms = getHPointSlice(16) } histograms = append(histograms, point) - ev.currentSamples += point.histogramSize() + ev.currentSamples += point.size() } case chunkenc.ValFloat: t, f := buf.At() @@ -2124,7 +2124,7 @@ loop: } point := HPoint{T: t, H: h} histograms = append(histograms, point) - ev.currentSamples += point.histogramSize() + ev.currentSamples += point.size() } case chunkenc.ValFloat: t, f := it.At() diff --git a/promql/value.go b/promql/value.go index 5fa339ad57..28cf3fe31c 100644 --- a/promql/value.go +++ b/promql/value.go @@ -168,11 +168,11 @@ func (p HPoint) MarshalJSON() ([]byte, error) { return json.Marshal([...]interface{}{float64(p.T) / 1000, h}) } -// histogramSize returns the size of the HPoint compared to the size of an FPoint. +// size returns the size of the HPoint compared to the size of an FPoint. // The total size is calculated considering the histogram timestamp (p.T - 8 bytes), // and then a number of bytes in the histogram. // This sum is divided by 16, as samples are 16 bytes. -func (p HPoint) histogramSize() int { +func (p HPoint) size() int { return (p.H.Size() + 8) / 16 } @@ -180,7 +180,7 @@ func (p HPoint) histogramSize() int { func totalHPointSize(histograms []HPoint) int { var total int for _, h := range histograms { - total += h.histogramSize() + total += h.size() } return total } @@ -246,7 +246,7 @@ func (vec Vector) String() string { // TotalSamples returns the total number of samples in the series within a vector. // Float samples have a weight of 1 in this number, while histogram samples have a higher // weight according to their size compared with the size of a float sample. -// See HPoint.histogramSize for details. +// See HPoint.size for details. 
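+// For example, a histogram sample whose FloatHistogram reports Size() == 160
+// contributes 1 + 160/16 = 11 to the total.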
func (vec Vector) TotalSamples() int { numSamples := 0 for _, sample := range vec { @@ -296,7 +296,9 @@ func (m Matrix) String() string { } // TotalSamples returns the total number of samples in the series within a matrix. -// It takes into account the number of samples in the histograms using the histogramSize method. +// Float samples have a weight of 1 in this number, while histogram samples have a higher +// weight according to their size compared with the size of a float sample. +// See HPoint.size for details. func (m Matrix) TotalSamples() int { numSamples := 0 for _, series := range m { From 1ce066e51c539ef6dd826012460cfc722d911ba2 Mon Sep 17 00:00:00 2001 From: Marc Tuduri Date: Wed, 18 Oct 2023 11:53:42 +0200 Subject: [PATCH 05/17] More periods Signed-off-by: Marc Tuduri --- model/histogram/float_histogram_test.go | 52 ++++++++++++------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/model/histogram/float_histogram_test.go b/model/histogram/float_histogram_test.go index d40cecd7a3..e2f9106966 100644 --- a/model/histogram/float_histogram_test.go +++ b/model/histogram/float_histogram_test.go @@ -2350,38 +2350,38 @@ func TestFloatHistogramSize(t *testing.T) { }{ { "without spans and buckets", - &FloatHistogram{ // 8 bytes - CounterResetHint: 0, // 1 byte - Schema: 1, // 4 bytes - ZeroThreshold: 0.01, // 8 bytes - ZeroCount: 5.5, // 8 bytes - Count: 3493.3, // 8 bytes - Sum: 2349209.324, // 8 bytes - PositiveSpans: nil, // 24 bytes - PositiveBuckets: nil, // 24 bytes - NegativeSpans: nil, // 24 bytes - NegativeBuckets: nil, // 24 bytes + &FloatHistogram{ // 8 bytes. + CounterResetHint: 0, // 1 byte. + Schema: 1, // 4 bytes. + ZeroThreshold: 0.01, // 8 bytes. + ZeroCount: 5.5, // 8 bytes. + Count: 3493.3, // 8 bytes. + Sum: 2349209.324, // 8 bytes. + PositiveSpans: nil, // 24 bytes. + PositiveBuckets: nil, // 24 bytes. + NegativeSpans: nil, // 24 bytes. + NegativeBuckets: nil, // 24 bytes. }, 8 + 4 + 4 + 8 + 8 + 8 + 8 + 24 + 24 + 24 + 24, }, { "complete struct", - &FloatHistogram{ // 8 bytes - CounterResetHint: 0, // 1 byte - Schema: 1, // 4 bytes - ZeroThreshold: 0.01, // 8 bytes - ZeroCount: 5.5, // 8 bytes - Count: 3493.3, // 8 bytes - Sum: 2349209.324, // 8 bytes - PositiveSpans: []Span{ // 24 bytes - {-2, 1}, // 2 * 4 bytes - {2, 3}, // 2 * 4 bytes + &FloatHistogram{ // 8 bytes. + CounterResetHint: 0, // 1 byte. + Schema: 1, // 4 bytes. + ZeroThreshold: 0.01, // 8 bytes. + ZeroCount: 5.5, // 8 bytes. + Count: 3493.3, // 8 bytes. + Sum: 2349209.324, // 8 bytes. + PositiveSpans: []Span{ // 24 bytes. + {-2, 1}, // 2 * 4 bytes. + {2, 3}, // 2 * 4 bytes. }, - PositiveBuckets: []float64{1, 3.3, 4.2, 0.1}, // 24 bytes + 4 * 8 bytes - NegativeSpans: []Span{ // 24 bytes - {3, 2}, // 2 * 4 bytes - {3, 2}}, // 2 * 4 bytes - NegativeBuckets: []float64{3.1, 3, 1.234e5, 1000}, // 24 bytes + 4 * 8 bytes + PositiveBuckets: []float64{1, 3.3, 4.2, 0.1}, // 24 bytes + 4 * 8 bytes. + NegativeSpans: []Span{ // 24 bytes. + {3, 2}, // 2 * 4 bytes. + {3, 2}}, // 2 * 4 bytes. + NegativeBuckets: []float64{3.1, 3, 1.234e5, 1000}, // 24 bytes + 4 * 8 bytes. 
}, 8 + 4 + 4 + 8 + 8 + 8 + 8 + (24 + 2*4 + 2*4) + (24 + 2*4 + 2*4) + (24 + 4*8) + (24 + 4*8), }, From ef8e6ae78040613fdc1ceb25f22b76f87dd2c80a Mon Sep 17 00:00:00 2001 From: Arthur Silva Sens Date: Wed, 18 Oct 2023 13:04:02 -0500 Subject: [PATCH 06/17] Parse created timestamps from Prometheus Protobuf (#12973) Signed-off-by: Arthur Silva Sens --- model/textparse/interface.go | 7 + model/textparse/openmetricsparse.go | 7 + model/textparse/promparse.go | 7 + model/textparse/protobufparse.go | 19 ++ model/textparse/protobufparse_test.go | 239 +++++++++++++++++++++++++- 5 files changed, 276 insertions(+), 3 deletions(-) diff --git a/model/textparse/interface.go b/model/textparse/interface.go index 38903afc96..2f5fdbc3bf 100644 --- a/model/textparse/interface.go +++ b/model/textparse/interface.go @@ -16,6 +16,8 @@ package textparse import ( "mime" + "github.com/gogo/protobuf/types" + "github.com/prometheus/prometheus/model/exemplar" "github.com/prometheus/prometheus/model/histogram" "github.com/prometheus/prometheus/model/labels" @@ -64,6 +66,11 @@ type Parser interface { // retrieved (including the case where no exemplars exist at all). Exemplar(l *exemplar.Exemplar) bool + // CreatedTimestamp writes the created timestamp of the current sample + // into the passed timestamp. It returns false if no created timestamp + // exists or if the metric type does not support created timestamps. + CreatedTimestamp(ct *types.Timestamp) bool + // Next advances the parser to the next sample. It returns false if no // more samples were read or an error occurred. Next() (Entry, error) diff --git a/model/textparse/openmetricsparse.go b/model/textparse/openmetricsparse.go index 5623e6833f..bb50755441 100644 --- a/model/textparse/openmetricsparse.go +++ b/model/textparse/openmetricsparse.go @@ -24,6 +24,8 @@ import ( "strings" "unicode/utf8" + "github.com/gogo/protobuf/types" + "github.com/prometheus/prometheus/model/exemplar" "github.com/prometheus/prometheus/model/histogram" "github.com/prometheus/prometheus/model/labels" @@ -211,6 +213,11 @@ func (p *OpenMetricsParser) Exemplar(e *exemplar.Exemplar) bool { return true } +// CreatedTimestamp returns false because OpenMetricsParser does not support created timestamps (yet). +func (p *OpenMetricsParser) CreatedTimestamp(_ *types.Timestamp) bool { + return false +} + // nextToken returns the next token from the openMetricsLexer. func (p *OpenMetricsParser) nextToken() token { tok := p.l.Lex() diff --git a/model/textparse/promparse.go b/model/textparse/promparse.go index 04c295dd00..b3fa2d8a6d 100644 --- a/model/textparse/promparse.go +++ b/model/textparse/promparse.go @@ -26,6 +26,8 @@ import ( "unicode/utf8" "unsafe" + "github.com/gogo/protobuf/types" + "github.com/prometheus/prometheus/model/exemplar" "github.com/prometheus/prometheus/model/histogram" "github.com/prometheus/prometheus/model/labels" @@ -245,6 +247,11 @@ func (p *PromParser) Exemplar(*exemplar.Exemplar) bool { return false } +// CreatedTimestamp returns false because PromParser does not support created timestamps. +func (p *PromParser) CreatedTimestamp(_ *types.Timestamp) bool { + return false +} + // nextToken returns the next token from the promlexer. It skips over tabs // and spaces. 
func (p *PromParser) nextToken() token { diff --git a/model/textparse/protobufparse.go b/model/textparse/protobufparse.go index fbb84a2bd3..94ea5e4a35 100644 --- a/model/textparse/protobufparse.go +++ b/model/textparse/protobufparse.go @@ -23,6 +23,7 @@ import ( "unicode/utf8" "github.com/gogo/protobuf/proto" + "github.com/gogo/protobuf/types" "github.com/pkg/errors" "github.com/prometheus/common/model" @@ -347,6 +348,24 @@ func (p *ProtobufParser) Exemplar(ex *exemplar.Exemplar) bool { return true } +func (p *ProtobufParser) CreatedTimestamp(ct *types.Timestamp) bool { + var foundCT *types.Timestamp + switch p.mf.GetType() { + case dto.MetricType_COUNTER: + foundCT = p.mf.GetMetric()[p.metricPos].GetCounter().GetCreatedTimestamp() + case dto.MetricType_SUMMARY: + foundCT = p.mf.GetMetric()[p.metricPos].GetSummary().GetCreatedTimestamp() + case dto.MetricType_HISTOGRAM, dto.MetricType_GAUGE_HISTOGRAM: + foundCT = p.mf.GetMetric()[p.metricPos].GetHistogram().GetCreatedTimestamp() + default: + } + if foundCT == nil { + return false + } + *ct = *foundCT + return true +} + // Next advances the parser to the next "sample" (emulating the behavior of a // text format parser). It returns (EntryInvalid, io.EOF) if no samples were // read. diff --git a/model/textparse/protobufparse_test.go b/model/textparse/protobufparse_test.go index 5436d7f3e3..10ec5f4405 100644 --- a/model/textparse/protobufparse_test.go +++ b/model/textparse/protobufparse_test.go @@ -21,6 +21,7 @@ import ( "testing" "github.com/gogo/protobuf/proto" + "github.com/gogo/protobuf/types" "github.com/stretchr/testify/require" "github.com/prometheus/prometheus/model/exemplar" @@ -530,6 +531,69 @@ metric: < > > +`, + `name: "test_counter_with_createdtimestamp" +help: "A counter with a created timestamp." +type: COUNTER +metric: < + counter: < + value: 42 + created_timestamp: < + seconds: 1 + nanos: 1 + > + > +> + +`, + `name: "test_summary_with_createdtimestamp" +help: "A summary with a created timestamp." +type: SUMMARY +metric: < + summary: < + sample_count: 42 + sample_sum: 1.234 + created_timestamp: < + seconds: 1 + nanos: 1 + > + > +> + +`, + `name: "test_histogram_with_createdtimestamp" +help: "A histogram with a created timestamp." +type: HISTOGRAM +metric: < + histogram: < + created_timestamp: < + seconds: 1 + nanos: 1 + > + positive_span: < + offset: 0 + length: 0 + > + > +> + +`, + `name: "test_gaugehistogram_with_createdtimestamp" +help: "A gauge histogram with a created timestamp." 
+type: GAUGE_HISTOGRAM +metric: < + histogram: < + created_timestamp: < + seconds: 1 + nanos: 1 + > + positive_span: < + offset: 0 + length: 0 + > + > +> + `, } @@ -566,6 +630,7 @@ func TestProtobufParse(t *testing.T) { shs *histogram.Histogram fhs *histogram.FloatHistogram e []exemplar.Exemplar + ct *types.Timestamp } inputBuf := createTestProtoBuf(t) @@ -997,6 +1062,86 @@ func TestProtobufParse(t *testing.T) { "__name__", "empty_histogram", ), }, + { + m: "test_counter_with_createdtimestamp", + help: "A counter with a created timestamp.", + }, + { + m: "test_counter_with_createdtimestamp", + typ: MetricTypeCounter, + }, + { + m: "test_counter_with_createdtimestamp", + v: 42, + ct: &types.Timestamp{Seconds: 1, Nanos: 1}, + lset: labels.FromStrings( + "__name__", "test_counter_with_createdtimestamp", + ), + }, + { + m: "test_summary_with_createdtimestamp", + help: "A summary with a created timestamp.", + }, + { + m: "test_summary_with_createdtimestamp", + typ: MetricTypeSummary, + }, + { + m: "test_summary_with_createdtimestamp_count", + v: 42, + ct: &types.Timestamp{Seconds: 1, Nanos: 1}, + lset: labels.FromStrings( + "__name__", "test_summary_with_createdtimestamp_count", + ), + }, + { + m: "test_summary_with_createdtimestamp_sum", + v: 1.234, + ct: &types.Timestamp{Seconds: 1, Nanos: 1}, + lset: labels.FromStrings( + "__name__", "test_summary_with_createdtimestamp_sum", + ), + }, + { + m: "test_histogram_with_createdtimestamp", + help: "A histogram with a created timestamp.", + }, + { + m: "test_histogram_with_createdtimestamp", + typ: MetricTypeHistogram, + }, + { + m: "test_histogram_with_createdtimestamp", + ct: &types.Timestamp{Seconds: 1, Nanos: 1}, + shs: &histogram.Histogram{ + CounterResetHint: histogram.UnknownCounterReset, + PositiveSpans: []histogram.Span{}, + NegativeSpans: []histogram.Span{}, + }, + lset: labels.FromStrings( + "__name__", "test_histogram_with_createdtimestamp", + ), + }, + { + m: "test_gaugehistogram_with_createdtimestamp", + help: "A gauge histogram with a created timestamp.", + }, + { + m: "test_gaugehistogram_with_createdtimestamp", + typ: MetricTypeGaugeHistogram, + }, + { + m: "test_gaugehistogram_with_createdtimestamp", + ct: &types.Timestamp{Seconds: 1, Nanos: 1}, + shs: &histogram.Histogram{ + CounterResetHint: histogram.GaugeType, + PositiveSpans: []histogram.Span{}, + NegativeSpans: []histogram.Span{}, + }, + lset: labels.FromStrings( + "__name__", "test_gaugehistogram_with_createdtimestamp", + ), + }, }, }, { @@ -1739,6 +1884,86 @@ func TestProtobufParse(t *testing.T) { "__name__", "empty_histogram", ), }, + { // 81 + m: "test_counter_with_createdtimestamp", + help: "A counter with a created timestamp.", + }, + { // 82 + m: "test_counter_with_createdtimestamp", + typ: MetricTypeCounter, + }, + { // 83 + m: "test_counter_with_createdtimestamp", + v: 42, + ct: &types.Timestamp{Seconds: 1, Nanos: 1}, + lset: labels.FromStrings( + "__name__", "test_counter_with_createdtimestamp", + ), + }, + { // 84 + m: "test_summary_with_createdtimestamp", + help: "A summary with a created timestamp.", + }, + { // 85 + m: "test_summary_with_createdtimestamp", + typ: MetricTypeSummary, + }, + { // 86 + m: "test_summary_with_createdtimestamp_count", + v: 42, + ct: &types.Timestamp{Seconds: 1, Nanos: 1}, + lset: labels.FromStrings( + "__name__", "test_summary_with_createdtimestamp_count", + ), + }, + { // 87 + m: "test_summary_with_createdtimestamp_sum", + v: 1.234, + ct: &types.Timestamp{Seconds: 1, Nanos: 1}, + lset: labels.FromStrings( + "__name__", 
"test_summary_with_createdtimestamp_sum", + ), + }, + { // 88 + m: "test_histogram_with_createdtimestamp", + help: "A histogram with a created timestamp.", + }, + { // 89 + m: "test_histogram_with_createdtimestamp", + typ: MetricTypeHistogram, + }, + { // 90 + m: "test_histogram_with_createdtimestamp", + ct: &types.Timestamp{Seconds: 1, Nanos: 1}, + shs: &histogram.Histogram{ + CounterResetHint: histogram.UnknownCounterReset, + PositiveSpans: []histogram.Span{}, + NegativeSpans: []histogram.Span{}, + }, + lset: labels.FromStrings( + "__name__", "test_histogram_with_createdtimestamp", + ), + }, + { // 91 + m: "test_gaugehistogram_with_createdtimestamp", + help: "A gauge histogram with a created timestamp.", + }, + { // 92 + m: "test_gaugehistogram_with_createdtimestamp", + typ: MetricTypeGaugeHistogram, + }, + { // 93 + m: "test_gaugehistogram_with_createdtimestamp", + ct: &types.Timestamp{Seconds: 1, Nanos: 1}, + shs: &histogram.Histogram{ + CounterResetHint: histogram.GaugeType, + PositiveSpans: []histogram.Span{}, + NegativeSpans: []histogram.Span{}, + }, + lset: labels.FromStrings( + "__name__", "test_gaugehistogram_with_createdtimestamp", + ), + }, }, }, } @@ -1764,8 +1989,10 @@ func TestProtobufParse(t *testing.T) { m, ts, v := p.Series() var e exemplar.Exemplar + var ct types.Timestamp p.Metric(&res) - found := p.Exemplar(&e) + eFound := p.Exemplar(&e) + ctFound := p.CreatedTimestamp(&ct) require.Equal(t, exp[i].m, string(m), "i: %d", i) if ts != nil { require.Equal(t, exp[i].t, *ts, "i: %d", i) @@ -1775,12 +2002,18 @@ func TestProtobufParse(t *testing.T) { require.Equal(t, exp[i].v, v, "i: %d", i) require.Equal(t, exp[i].lset, res, "i: %d", i) if len(exp[i].e) == 0 { - require.Equal(t, false, found, "i: %d", i) + require.Equal(t, false, eFound, "i: %d", i) } else { - require.Equal(t, true, found, "i: %d", i) + require.Equal(t, true, eFound, "i: %d", i) require.Equal(t, exp[i].e[0], e, "i: %d", i) require.False(t, p.Exemplar(&e), "too many exemplars returned, i: %d", i) } + if exp[i].ct != nil { + require.Equal(t, true, ctFound, "i: %d", i) + require.Equal(t, exp[i].ct.String(), ct.String(), "i: %d", i) + } else { + require.Equal(t, false, ctFound, "i: %d", i) + } case EntryHistogram: m, ts, shs, fhs := p.Histogram() From 71a36d239656273e746d5c75391e205c2bc8d58f Mon Sep 17 00:00:00 2001 From: Jeanette Tan Date: Thu, 19 Oct 2023 13:17:46 +0800 Subject: [PATCH 07/17] Very minor refactor of the integer overflow fix Signed-off-by: Jeanette Tan --- tsdb/ooo_head_read.go | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/tsdb/ooo_head_read.go b/tsdb/ooo_head_read.go index c30c2b5650..242d19eed4 100644 --- a/tsdb/ooo_head_read.go +++ b/tsdb/ooo_head_read.go @@ -178,42 +178,36 @@ type chunkMetaAndChunkDiskMapperRef struct { } func refLessByMinTimeAndMinRef(a, b chunkMetaAndChunkDiskMapperRef) int { - if a.meta.MinTime == b.meta.MinTime { - switch { - case a.meta.Ref < b.meta.Ref: - return -1 - case a.meta.Ref > b.meta.Ref: - return 1 - default: - return 0 - } - } switch { case a.meta.MinTime < b.meta.MinTime: return -1 case a.meta.MinTime > b.meta.MinTime: return 1 + } + + switch { + case a.meta.Ref < b.meta.Ref: + return -1 + case a.meta.Ref > b.meta.Ref: + return 1 default: return 0 } } func lessByMinTimeAndMinRef(a, b chunks.Meta) int { - if a.MinTime == b.MinTime { - switch { - case a.Ref < b.Ref: - return -1 - case a.Ref > b.Ref: - return 1 - default: - return 0 - } - } switch { case a.MinTime < b.MinTime: return -1 case a.MinTime > 
b.MinTime: return 1 + } + + switch { + case a.Ref < b.Ref: + return -1 + case a.Ref > b.Ref: + return 1 default: return 0 } From b428416f06ab075bd71fe870ff5318524747776a Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 19 Oct 2023 17:54:42 +0200 Subject: [PATCH 08/17] textparse: Update comment about timestamp_ms protobuf parsing By now, we know better what the plan is. Signed-off-by: beorn7 --- model/textparse/protobufparse.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/model/textparse/protobufparse.go b/model/textparse/protobufparse.go index 94ea5e4a35..d6d87ee368 100644 --- a/model/textparse/protobufparse.go +++ b/model/textparse/protobufparse.go @@ -148,9 +148,15 @@ func (p *ProtobufParser) Series() ([]byte, *int64, float64) { if ts != 0 { return p.metricBytes.Bytes(), &ts, v } - // Nasty hack: Assume that ts==0 means no timestamp. That's not true in - // general, but proto3 has no distinction between unset and - // default. Need to avoid in the final format. + // TODO(beorn7): We assume here that ts==0 means no timestamp. That's + // not true in general, but proto3 originally has no distinction between + // unset and default. At a later stage, the `optional` keyword was + // (re-)introduced in proto3, but gogo-protobuf never got updated to + // support it. (Note that setting `[(gogoproto.nullable) = true]` for + // the `timestamp_ms` field doesn't help, either.) We plan to migrate + // away from gogo-protobuf to an actively maintained protobuf + // implementation. Once that's done, we can simply use the `optional` + // keyword and check for the unset state explicitly. return p.metricBytes.Bytes(), nil, v } From a5abd92541df54727a11614dbab0916ab8b56a74 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 19 Oct 2023 17:59:07 +0200 Subject: [PATCH 09/17] prompb: Remove `gogoproto.nullable) = true]` from created_timestamp In proto3, this doesn't change anything. However, since the `CreatedTimestamp` field is generated as a pointer (`*types.Timestamp`), we are still able to detect the unset state. (This is in contrast to the `timestamp_ms` field, which is a plain int64, for which we cannot enforce generation as a pointer, see comment updated in the previous commit for future actions.) 
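
As an illustrative sketch (not part of this change), a plain nil check on the
generated pointer field is then enough to tell "unset" apart from a zero
timestamp; `m` here stands for any `*dto.Metric`:

	if ct := m.GetCounter().GetCreatedTimestamp(); ct != nil {
		// The producer set a created timestamp; ct.Seconds and ct.Nanos carry it.
	}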
Signed-off-by: beorn7 --- prompb/io/prometheus/client/metrics.pb.go | 121 +++++++++++----------- prompb/io/prometheus/client/metrics.proto | 8 +- 2 files changed, 64 insertions(+), 65 deletions(-) diff --git a/prompb/io/prometheus/client/metrics.pb.go b/prompb/io/prometheus/client/metrics.pb.go index 05d25747b4..5538395117 100644 --- a/prompb/io/prometheus/client/metrics.pb.go +++ b/prompb/io/prometheus/client/metrics.pb.go @@ -965,68 +965,67 @@ func init() { } var fileDescriptor_d1e5ddb18987a258 = []byte{ - // 963 bytes of a gzipped FileDescriptorProto + // 960 bytes of a gzipped FileDescriptorProto 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0xa4, 0x56, 0xdd, 0x6e, 0x1b, 0x45, - 0x14, 0xee, 0x76, 0xfd, 0x93, 0x3d, 0x8e, 0x93, 0xcd, 0x60, 0x55, 0xab, 0x40, 0x62, 0xb3, 0x12, - 0x52, 0x40, 0xc8, 0x16, 0x50, 0x04, 0x2a, 0x45, 0x22, 0x69, 0xd3, 0x14, 0x15, 0xb7, 0x65, 0x6c, - 0x5f, 0x94, 0x9b, 0xd5, 0xd8, 0x9e, 0xac, 0x57, 0xec, 0xee, 0x2c, 0xfb, 0x53, 0x11, 0xee, 0x79, - 0x06, 0x5e, 0x01, 0xf1, 0x1c, 0x08, 0xf5, 0x92, 0x07, 0x40, 0x08, 0xe5, 0x49, 0xd0, 0xfc, 0xed, - 0x3a, 0xd5, 0xba, 0x90, 0xf6, 0x6e, 0xe6, 0xf3, 0x77, 0xce, 0x7c, 0xe7, 0x9b, 0xf1, 0x39, 0x0b, - 0x6e, 0xc0, 0x46, 0x49, 0xca, 0x22, 0x9a, 0xaf, 0x68, 0x91, 0x8d, 0x16, 0x61, 0x40, 0xe3, 0x7c, - 0x14, 0xd1, 0x3c, 0x0d, 0x16, 0xd9, 0x30, 0x49, 0x59, 0xce, 0x50, 0x2f, 0x60, 0xc3, 0x8a, 0x33, - 0x94, 0x9c, 0xfd, 0x9e, 0xcf, 0x7c, 0x26, 0x08, 0x23, 0xbe, 0x92, 0xdc, 0xfd, 0xbe, 0xcf, 0x98, - 0x1f, 0xd2, 0x91, 0xd8, 0xcd, 0x8b, 0xf3, 0x51, 0x1e, 0x44, 0x34, 0xcb, 0x49, 0x94, 0x48, 0x82, - 0xfb, 0x29, 0x58, 0xdf, 0x90, 0x39, 0x0d, 0x9f, 0x92, 0x20, 0x45, 0x08, 0x1a, 0x31, 0x89, 0xa8, - 0x63, 0x0c, 0x8c, 0x23, 0x0b, 0x8b, 0x35, 0xea, 0x41, 0xf3, 0x39, 0x09, 0x0b, 0xea, 0xdc, 0x14, - 0xa0, 0xdc, 0xb8, 0x07, 0xd0, 0x3c, 0x23, 0x85, 0xbf, 0xf6, 0x33, 0x8f, 0x31, 0xf4, 0xcf, 0xbf, - 0x19, 0xd0, 0xbe, 0xc7, 0x8a, 0x38, 0xa7, 0x69, 0x3d, 0x03, 0xdd, 0x81, 0x2d, 0xfa, 0x23, 0x8d, - 0x92, 0x90, 0xa4, 0x22, 0x73, 0xe7, 0xe3, 0xc3, 0x61, 0x5d, 0x5d, 0xc3, 0x53, 0xc5, 0xc2, 0x25, - 0x1f, 0x8d, 0x61, 0x6f, 0x91, 0x52, 0x92, 0xd3, 0xa5, 0x57, 0x96, 0xe3, 0x98, 0x22, 0xc9, 0xfe, - 0x50, 0x16, 0x3c, 0xd4, 0x05, 0x0f, 0xa7, 0x9a, 0x71, 0xd2, 0x78, 0xf1, 0x77, 0xdf, 0xc0, 0xb6, - 0x0a, 0x2d, 0x71, 0xf7, 0x2e, 0x6c, 0x7d, 0x5b, 0x90, 0x38, 0x0f, 0x42, 0x8a, 0xf6, 0x61, 0xeb, - 0x07, 0xb5, 0x56, 0x7a, 0xcb, 0xfd, 0x55, 0x27, 0xca, 0x52, 0xff, 0x32, 0xa0, 0x3d, 0x29, 0xa2, - 0x88, 0xa4, 0x17, 0xe8, 0x5d, 0xd8, 0xce, 0x48, 0x94, 0x84, 0xd4, 0x5b, 0xf0, 0xe2, 0x45, 0x86, - 0x06, 0xee, 0x48, 0x4c, 0xf8, 0x81, 0x0e, 0x00, 0x14, 0x25, 0x2b, 0x22, 0x95, 0xc9, 0x92, 0xc8, - 0xa4, 0x88, 0xd0, 0x57, 0x6b, 0xe7, 0x9b, 0x03, 0x73, 0xb3, 0x2d, 0x5a, 0xb1, 0xa8, 0xea, 0xc6, - 0x9a, 0xca, 0x5a, 0x73, 0x1a, 0xaf, 0x6d, 0x4e, 0x1f, 0xda, 0xb3, 0x38, 0xbf, 0x48, 0xe8, 0x72, - 0xc3, 0x55, 0xff, 0xde, 0x04, 0xeb, 0x61, 0x90, 0xe5, 0xcc, 0x4f, 0x49, 0xf4, 0x7f, 0x1c, 0xf8, - 0x10, 0xd0, 0x3a, 0xc5, 0x3b, 0x0f, 0x19, 0xc9, 0x85, 0x42, 0x03, 0xdb, 0x6b, 0xc4, 0x07, 0x1c, - 0xff, 0x2f, 0xbf, 0xee, 0x40, 0x6b, 0x5e, 0x2c, 0xbe, 0xa7, 0xb9, 0x72, 0xeb, 0x9d, 0x7a, 0xb7, - 0x4e, 0x04, 0x47, 0x79, 0xa5, 0x22, 0xea, 0x9d, 0xda, 0x7d, 0x5d, 0xa7, 0xd0, 0x2d, 0x68, 0x65, - 0x8b, 0x15, 0x8d, 0x88, 0xd3, 0x1c, 0x18, 0x47, 0x7b, 0x58, 0xed, 0xd0, 0x7b, 0xb0, 0xf3, 0x13, - 0x4d, 0x99, 0x97, 0xaf, 0x52, 0x9a, 0xad, 0x58, 0xb8, 0x74, 0x5a, 0xa2, 0x8a, 0x2e, 0x47, 0xa7, - 0x1a, 0xe4, 0x85, 0x0a, 0x9a, 0xf4, 0xad, 0x2d, 0x7c, 0xb3, 0x38, 0x22, 0x5d, 0x3b, 0x02, 0xbb, - 0xfa, 
0x59, 0x79, 0xb6, 0x25, 0xf2, 0xec, 0x94, 0x24, 0xe9, 0xd8, 0x23, 0xe8, 0xc6, 0xd4, 0x27, - 0x79, 0xf0, 0x9c, 0x7a, 0x59, 0x42, 0x62, 0xc7, 0x12, 0xce, 0x0c, 0x5e, 0xe5, 0xcc, 0x24, 0x21, - 0xb1, 0x72, 0x67, 0x5b, 0x07, 0x73, 0x8c, 0x8b, 0x2f, 0x93, 0x2d, 0x69, 0x98, 0x13, 0x07, 0x06, - 0xe6, 0x11, 0xc2, 0xe5, 0x11, 0xf7, 0x39, 0x78, 0x85, 0x26, 0x0b, 0xe8, 0x0c, 0x4c, 0x5e, 0xa3, - 0x46, 0x65, 0x11, 0x8f, 0xa0, 0x9b, 0xb0, 0x2c, 0xa8, 0xa4, 0x6d, 0x5f, 0x4f, 0x9a, 0x0e, 0xd6, - 0xd2, 0xca, 0x64, 0x52, 0x5a, 0x57, 0x4a, 0xd3, 0x68, 0x29, 0xad, 0xa4, 0x49, 0x69, 0x3b, 0x52, - 0x9a, 0x46, 0x85, 0x34, 0xf7, 0x0f, 0x03, 0x5a, 0xf2, 0x40, 0xf4, 0x3e, 0xd8, 0x8b, 0x22, 0x2a, - 0xc2, 0xf5, 0x72, 0xe4, 0x3b, 0xde, 0xad, 0x70, 0x59, 0xd0, 0x6d, 0xb8, 0xf5, 0x32, 0xf5, 0xca, - 0x7b, 0xee, 0xbd, 0x14, 0x20, 0x6f, 0xa8, 0x0f, 0x9d, 0x22, 0x49, 0x68, 0xea, 0xcd, 0x59, 0x11, - 0x2f, 0xd5, 0xa3, 0x06, 0x01, 0x9d, 0x70, 0xe4, 0x4a, 0x73, 0x34, 0xaf, 0xd7, 0x1c, 0xdd, 0xbb, - 0x00, 0x95, 0x71, 0xfc, 0x51, 0xb2, 0xf3, 0xf3, 0x8c, 0xca, 0x0a, 0xf6, 0xb0, 0xda, 0x71, 0x3c, - 0xa4, 0xb1, 0x9f, 0xaf, 0xc4, 0xe9, 0x5d, 0xac, 0x76, 0xee, 0x2f, 0x06, 0x6c, 0xe9, 0xa4, 0xe8, - 0x0b, 0x68, 0x86, 0x7c, 0x36, 0x38, 0x86, 0xb8, 0xa6, 0x7e, 0xbd, 0x86, 0x72, 0x7c, 0xa8, 0x5b, - 0x92, 0x31, 0xf5, 0xdd, 0x12, 0x7d, 0x0e, 0xd6, 0x35, 0x5a, 0x36, 0xae, 0xc8, 0xee, 0xcf, 0x26, - 0xb4, 0xc6, 0x62, 0x0e, 0xbe, 0x99, 0xae, 0x8f, 0xa0, 0xe9, 0xf3, 0xc9, 0xa5, 0xa6, 0xce, 0xdb, - 0xf5, 0xc1, 0x62, 0xb8, 0x61, 0xc9, 0x44, 0x9f, 0x41, 0x7b, 0x21, 0x87, 0x99, 0x92, 0x7c, 0x50, - 0x1f, 0xa4, 0x26, 0x1e, 0xd6, 0x6c, 0x1e, 0x98, 0xc9, 0xd1, 0xa0, 0x3a, 0xf0, 0x86, 0x40, 0x35, - 0x3f, 0xb0, 0x66, 0xf3, 0xc0, 0x42, 0x76, 0x5d, 0xd1, 0x4c, 0x36, 0x06, 0xaa, 0xd6, 0x8c, 0x35, - 0x1b, 0x7d, 0x09, 0xd6, 0x4a, 0x37, 0x63, 0xd1, 0x44, 0x36, 0xda, 0x53, 0xf6, 0x6c, 0x5c, 0x45, - 0xf0, 0xf6, 0x5d, 0x3a, 0xee, 0x45, 0x99, 0xe8, 0x54, 0x26, 0xee, 0x94, 0xd8, 0x38, 0x73, 0x7f, - 0x35, 0x60, 0x5b, 0xde, 0xc3, 0x03, 0x12, 0x05, 0xe1, 0x45, 0xed, 0x47, 0x03, 0x82, 0xc6, 0x8a, - 0x86, 0x89, 0xfa, 0x66, 0x10, 0x6b, 0x74, 0x1b, 0x1a, 0x5c, 0xa3, 0xb0, 0x70, 0x67, 0xd3, 0x7f, - 0x5e, 0x66, 0x9e, 0x5e, 0x24, 0x14, 0x0b, 0x36, 0x6f, 0xf0, 0xf2, 0xeb, 0xc7, 0x69, 0xbc, 0xaa, - 0xc1, 0xcb, 0x38, 0xdd, 0xe0, 0x65, 0xc4, 0x07, 0x73, 0x80, 0x2a, 0x1f, 0xea, 0x40, 0xfb, 0xde, - 0x93, 0xd9, 0xe3, 0xe9, 0x29, 0xb6, 0x6f, 0x20, 0x0b, 0x9a, 0x67, 0xc7, 0xb3, 0xb3, 0x53, 0xdb, - 0xe0, 0xf8, 0x64, 0x36, 0x1e, 0x1f, 0xe3, 0x67, 0xf6, 0x4d, 0xbe, 0x99, 0x3d, 0x9e, 0x3e, 0x7b, - 0x7a, 0x7a, 0xdf, 0x36, 0x51, 0x17, 0xac, 0x87, 0x5f, 0x4f, 0xa6, 0x4f, 0xce, 0xf0, 0xf1, 0xd8, - 0x6e, 0xa0, 0xb7, 0x60, 0x57, 0xc4, 0x78, 0x15, 0xd8, 0x3c, 0x71, 0x5f, 0x5c, 0x1e, 0x1a, 0x7f, - 0x5e, 0x1e, 0x1a, 0xff, 0x5c, 0x1e, 0x1a, 0xdf, 0xf5, 0x02, 0xe6, 0x55, 0xe2, 0x3c, 0x29, 0x6e, - 0xde, 0x12, 0x2f, 0xfb, 0x93, 0x7f, 0x03, 0x00, 0x00, 0xff, 0xff, 0x68, 0x3f, 0xd9, 0x07, 0xdd, - 0x09, 0x00, 0x00, + 0x14, 0xee, 0xd6, 0xbf, 0x7b, 0x1c, 0x27, 0x9b, 0xc1, 0xaa, 0x56, 0x81, 0xc4, 0x66, 0x25, 0xa4, + 0x80, 0x90, 0x2d, 0xa0, 0x08, 0x54, 0x8a, 0x44, 0xd2, 0xa6, 0x2e, 0x2a, 0x6e, 0xcb, 0xd8, 0xbe, + 0x28, 0x37, 0xab, 0xb1, 0x3d, 0x59, 0xaf, 0xd8, 0xdd, 0x59, 0xf6, 0xa7, 0x22, 0xdc, 0xf3, 0x0c, + 0xbc, 0x00, 0x17, 0x3c, 0x05, 0x97, 0xa8, 0x97, 0x5c, 0x71, 0x89, 0x50, 0x9e, 0x04, 0xcd, 0xdf, + 0xae, 0x53, 0xad, 0x03, 0x81, 0xbb, 0x99, 0xcf, 0xdf, 0x39, 0xf3, 0x9d, 0x6f, 0xc6, 0xe7, 0x2c, + 0x38, 0x3e, 0x1b, 0xc5, 0x09, 0x0b, 0x69, 0xb6, 0xa6, 0x79, 0x3a, 0x5a, 0x06, 0x3e, 0x8d, 0xb2, + 0x51, 0x48, 
0xb3, 0xc4, 0x5f, 0xa6, 0xc3, 0x38, 0x61, 0x19, 0x43, 0x3d, 0x9f, 0x0d, 0x4b, 0xce, + 0x50, 0x72, 0x0e, 0x7a, 0x1e, 0xf3, 0x98, 0x20, 0x8c, 0xf8, 0x4a, 0x72, 0x0f, 0xfa, 0x1e, 0x63, + 0x5e, 0x40, 0x47, 0x62, 0xb7, 0xc8, 0xcf, 0x47, 0x99, 0x1f, 0xd2, 0x34, 0x23, 0x61, 0x2c, 0x09, + 0xce, 0xc7, 0x60, 0x7e, 0x45, 0x16, 0x34, 0x78, 0x4e, 0xfc, 0x04, 0x21, 0xa8, 0x47, 0x24, 0xa4, + 0xb6, 0x31, 0x30, 0x8e, 0x4d, 0x2c, 0xd6, 0xa8, 0x07, 0x8d, 0x97, 0x24, 0xc8, 0xa9, 0x7d, 0x5b, + 0x80, 0x72, 0xe3, 0x1c, 0x42, 0x63, 0x4c, 0x72, 0x6f, 0xe3, 0x67, 0x1e, 0x63, 0xe8, 0x9f, 0x7f, + 0x36, 0xa0, 0xf5, 0x80, 0xe5, 0x51, 0x46, 0x93, 0x6a, 0x06, 0xba, 0x07, 0x6d, 0xfa, 0x3d, 0x0d, + 0xe3, 0x80, 0x24, 0x22, 0x73, 0xe7, 0xc3, 0xa3, 0x61, 0x55, 0x5d, 0xc3, 0x33, 0xc5, 0xc2, 0x05, + 0x1f, 0x8d, 0x61, 0x7f, 0x99, 0x50, 0x92, 0xd1, 0x95, 0x5b, 0x94, 0x63, 0xd7, 0x44, 0x92, 0x83, + 0xa1, 0x2c, 0x78, 0xa8, 0x0b, 0x1e, 0xce, 0x34, 0x03, 0x5b, 0x2a, 0xa8, 0x40, 0x9c, 0xfb, 0xd0, + 0xfe, 0x3a, 0x27, 0x51, 0xe6, 0x07, 0x14, 0x1d, 0x40, 0xfb, 0x3b, 0xb5, 0x56, 0x4a, 0x8b, 0xfd, + 0x55, 0x0f, 0x8a, 0x22, 0xff, 0x30, 0xa0, 0x35, 0xcd, 0xc3, 0x90, 0x24, 0x17, 0xe8, 0x6d, 0xd8, + 0x49, 0x49, 0x18, 0x07, 0xd4, 0x5d, 0xf2, 0xb2, 0x45, 0x86, 0x3a, 0xee, 0x48, 0x4c, 0x38, 0x81, + 0x0e, 0x01, 0x14, 0x25, 0xcd, 0x43, 0x95, 0xc9, 0x94, 0xc8, 0x34, 0x0f, 0xd1, 0x17, 0x1b, 0xe7, + 0xd7, 0x06, 0xb5, 0xed, 0x86, 0x68, 0xc5, 0xa7, 0xf5, 0x57, 0x7f, 0xf6, 0x6f, 0x6d, 0xa8, 0xac, + 0xb4, 0xa5, 0xfe, 0x1f, 0x6c, 0xe9, 0x43, 0x6b, 0x1e, 0x65, 0x17, 0x31, 0x5d, 0x6d, 0xb9, 0xde, + 0x5f, 0x1b, 0x60, 0x3e, 0xf6, 0xd3, 0x8c, 0x79, 0x09, 0x09, 0xff, 0x4d, 0xed, 0xef, 0x03, 0xda, + 0xa4, 0xb8, 0xe7, 0x01, 0x23, 0x99, 0xd0, 0x66, 0x60, 0x6b, 0x83, 0xf8, 0x88, 0xe3, 0xff, 0xe4, + 0xd4, 0x3d, 0x68, 0x2e, 0xf2, 0xe5, 0xb7, 0x34, 0x53, 0x3e, 0xbd, 0x55, 0xed, 0xd3, 0xa9, 0xe0, + 0x28, 0x97, 0x54, 0x44, 0xb5, 0x47, 0x7b, 0x37, 0xf7, 0x08, 0xdd, 0x81, 0x66, 0xba, 0x5c, 0xd3, + 0x90, 0xd8, 0x8d, 0x81, 0x71, 0xbc, 0x8f, 0xd5, 0x0e, 0xbd, 0x03, 0xbb, 0x3f, 0xd0, 0x84, 0xb9, + 0xd9, 0x3a, 0xa1, 0xe9, 0x9a, 0x05, 0x2b, 0xbb, 0x29, 0xf4, 0x77, 0x39, 0x3a, 0xd3, 0x20, 0x2f, + 0x51, 0xd0, 0xa4, 0x63, 0x2d, 0xe1, 0x98, 0xc9, 0x11, 0xe9, 0xd7, 0x31, 0x58, 0xe5, 0xcf, 0xca, + 0xad, 0xb6, 0xc8, 0xb3, 0x5b, 0x90, 0xa4, 0x57, 0x4f, 0xa0, 0x1b, 0x51, 0x8f, 0x64, 0xfe, 0x4b, + 0xea, 0xa6, 0x31, 0x89, 0x6c, 0x53, 0x78, 0x32, 0xb8, 0xce, 0x93, 0x69, 0x4c, 0x22, 0xe5, 0xcb, + 0x8e, 0x0e, 0xe6, 0x18, 0x17, 0x5f, 0x24, 0x5b, 0xd1, 0x20, 0x23, 0x36, 0x0c, 0x6a, 0xc7, 0x08, + 0x17, 0x47, 0x3c, 0xe4, 0xe0, 0x15, 0x9a, 0x2c, 0xa0, 0x33, 0xa8, 0xf1, 0x1a, 0x35, 0x2a, 0x8b, + 0x78, 0x02, 0xdd, 0x98, 0xa5, 0x7e, 0x29, 0x6d, 0xe7, 0x66, 0xd2, 0x74, 0xb0, 0x96, 0x56, 0x24, + 0x93, 0xd2, 0xba, 0x52, 0x9a, 0x46, 0x0b, 0x69, 0x05, 0x4d, 0x4a, 0xdb, 0x95, 0xd2, 0x34, 0x2a, + 0xa4, 0x39, 0xbf, 0x19, 0xd0, 0x94, 0x07, 0xa2, 0x77, 0xc1, 0x5a, 0xe6, 0x61, 0x1e, 0x6c, 0x96, + 0x23, 0x5f, 0xf0, 0x5e, 0x89, 0xcb, 0x82, 0xee, 0xc2, 0x9d, 0xd7, 0xa9, 0x57, 0x5e, 0x72, 0xef, + 0xb5, 0x00, 0x79, 0x43, 0x7d, 0xe8, 0xe4, 0x71, 0x4c, 0x13, 0x77, 0xc1, 0xf2, 0x68, 0xa5, 0x9e, + 0x33, 0x08, 0xe8, 0x94, 0x23, 0x57, 0x5a, 0x61, 0xed, 0x66, 0xad, 0xd0, 0xb9, 0x0f, 0x50, 0x1a, + 0xc7, 0x1f, 0x25, 0x3b, 0x3f, 0x4f, 0xa9, 0xac, 0x60, 0x1f, 0xab, 0x1d, 0xc7, 0x03, 0x1a, 0x79, + 0xd9, 0x5a, 0x9c, 0xde, 0xc5, 0x6a, 0xe7, 0xfc, 0x64, 0x40, 0x5b, 0x27, 0x45, 0x9f, 0x41, 0x23, + 0xe0, 0x93, 0xc0, 0x36, 0xc4, 0x35, 0xf5, 0xab, 0x35, 0x14, 0xc3, 0x42, 0xdd, 0x92, 0x8c, 0xa9, + 0xee, 0x90, 0xe8, 0x53, 0x30, 0x6f, 
0xd2, 0xa0, 0x4b, 0xb2, 0xf3, 0x63, 0x0d, 0x9a, 0x13, 0x31, + 0xf5, 0xfe, 0x9f, 0xae, 0x0f, 0xa0, 0xe1, 0xf1, 0x39, 0xa5, 0x66, 0xcc, 0x9b, 0xd5, 0xc1, 0x62, + 0x94, 0x61, 0xc9, 0x44, 0x9f, 0x40, 0x6b, 0x29, 0x47, 0x97, 0x92, 0x7c, 0x58, 0x1d, 0xa4, 0xe6, + 0x1b, 0xd6, 0x6c, 0x1e, 0x98, 0xca, 0x71, 0xa0, 0xba, 0xee, 0x96, 0x40, 0x35, 0x33, 0xb0, 0x66, + 0xf3, 0xc0, 0x5c, 0xf6, 0x5b, 0xd1, 0x4c, 0xb6, 0x06, 0xaa, 0xa6, 0x8c, 0x35, 0x1b, 0x7d, 0x0e, + 0xe6, 0x5a, 0xb7, 0x61, 0xd1, 0x44, 0xb6, 0xda, 0x53, 0x74, 0x6b, 0x5c, 0x46, 0xf0, 0xc6, 0x5d, + 0x38, 0xee, 0x86, 0xa9, 0xe8, 0x54, 0x35, 0xdc, 0x29, 0xb0, 0x49, 0xea, 0xfc, 0x62, 0xc0, 0x8e, + 0xbc, 0x87, 0x47, 0x24, 0xf4, 0x83, 0x8b, 0xca, 0x4f, 0x04, 0x04, 0xf5, 0x35, 0x0d, 0x62, 0xf5, + 0x85, 0x20, 0xd6, 0xe8, 0x2e, 0xd4, 0xb9, 0x46, 0x61, 0xe1, 0xee, 0xb6, 0xff, 0xbc, 0xcc, 0x3c, + 0xbb, 0x88, 0x29, 0x16, 0x6c, 0xde, 0xda, 0xe5, 0xb7, 0x8e, 0x5d, 0xbf, 0xae, 0xb5, 0xcb, 0x38, + 0xdd, 0xda, 0x65, 0xc4, 0x7b, 0x0b, 0x80, 0x32, 0x1f, 0xea, 0x40, 0xeb, 0xc1, 0xb3, 0xf9, 0xd3, + 0xd9, 0x19, 0xb6, 0x6e, 0x21, 0x13, 0x1a, 0xe3, 0x93, 0xf9, 0xf8, 0xcc, 0x32, 0x38, 0x3e, 0x9d, + 0x4f, 0x26, 0x27, 0xf8, 0x85, 0x75, 0x9b, 0x6f, 0xe6, 0x4f, 0x67, 0x2f, 0x9e, 0x9f, 0x3d, 0xb4, + 0x6a, 0xa8, 0x0b, 0xe6, 0xe3, 0x2f, 0xa7, 0xb3, 0x67, 0x63, 0x7c, 0x32, 0xb1, 0xea, 0xe8, 0x0d, + 0xd8, 0x13, 0x31, 0x6e, 0x09, 0x36, 0x4e, 0x9d, 0x57, 0x97, 0x47, 0xc6, 0xef, 0x97, 0x47, 0xc6, + 0x5f, 0x97, 0x47, 0xc6, 0x37, 0x3d, 0x9f, 0xb9, 0xa5, 0x38, 0x57, 0x8a, 0x5b, 0x34, 0xc5, 0xcb, + 0xfe, 0xe8, 0xef, 0x00, 0x00, 0x00, 0xff, 0xff, 0x0d, 0x2e, 0x66, 0xc1, 0xcb, 0x09, 0x00, 0x00, } func (m *LabelPair) Marshal() (dAtA []byte, err error) { diff --git a/prompb/io/prometheus/client/metrics.proto b/prompb/io/prometheus/client/metrics.proto index 8e225bb3b9..be4d2dbae1 100644 --- a/prompb/io/prometheus/client/metrics.proto +++ b/prompb/io/prometheus/client/metrics.proto @@ -52,7 +52,7 @@ message Counter { double value = 1; Exemplar exemplar = 2; - google.protobuf.Timestamp created_timestamp = 3 [(gogoproto.nullable) = true]; + google.protobuf.Timestamp created_timestamp = 3; } message Quantile { @@ -65,7 +65,7 @@ message Summary { double sample_sum = 2; repeated Quantile quantile = 3 [(gogoproto.nullable) = false]; - google.protobuf.Timestamp created_timestamp = 4 [(gogoproto.nullable) = true]; + google.protobuf.Timestamp created_timestamp = 4; } message Untyped { @@ -79,7 +79,7 @@ message Histogram { // Buckets for the conventional histogram. repeated Bucket bucket = 3 [(gogoproto.nullable) = false]; // Ordered in increasing order of upper_bound, +Inf bucket is optional. - google.protobuf.Timestamp created_timestamp = 15 [(gogoproto.nullable) = true]; + google.protobuf.Timestamp created_timestamp = 15; // Everything below here is for native histograms (also known as sparse histograms). // Native histograms are an experimental feature without stability guarantees. @@ -153,4 +153,4 @@ message MetricFamily { string help = 2; MetricType type = 3; repeated Metric metric = 4 [(gogoproto.nullable) = false]; -} \ No newline at end of file +} From 6d083312e7b93c60375cd977399f4e660d75cddb Mon Sep 17 00:00:00 2001 From: Bartlomiej Plotka Date: Thu, 19 Oct 2023 20:38:45 +0100 Subject: [PATCH 10/17] native-histograms: Fixed PrometheusProto scrape format preference. (#13010) Broken by https://github.com/prometheus/prometheus/pull/12738. We have to update both global variables (as GlobalConfig is not a pointer here). 
DefaultConfig is used when no global: section is provided, whereas DefaultGlobalConfig is used when it's provided and for individual scrape configs. Reported on #prometheus-dev (thanks to @beorn7): https://cloud-native.slack.com/archives/C01AUBA4PFE/p1697733267205649 Tested manually, it would be nice to add test at some point (quick fix for now). Signed-off-by: bwplotka --- cmd/prometheus/main.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index cdfb42b185..dfc06fd4e6 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -202,9 +202,10 @@ func (c *flagConfig) setFeatureListOptions(logger log.Logger) error { level.Info(logger).Log("msg", "No default port will be appended to scrape targets' addresses.") case "native-histograms": c.tsdb.EnableNativeHistograms = true - // Change global variable. Hacky, but it's hard to pass new option or default to unmarshaller. + // Change relevant global variables. Hacky, but it's hard to pass a new option or default to unmarshallers. config.DefaultConfig.GlobalConfig.ScrapeProtocols = config.DefaultNativeHistogramScrapeProtocols - level.Info(logger).Log("msg", "Experimental native histogram support enabled. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultConfig.GlobalConfig.ScrapeProtocols)) + config.DefaultGlobalConfig.ScrapeProtocols = config.DefaultNativeHistogramScrapeProtocols + level.Info(logger).Log("msg", "Experimental native histogram support enabled. Changed default scrape_protocols to prefer PrometheusProto format.", "global.scrape_protocols", fmt.Sprintf("%v", config.DefaultGlobalConfig.ScrapeProtocols)) case "": continue case "promql-at-modifier", "promql-negative-offset": From 122f9506e9c6e5ad8e43282c6a2856f769d83f32 Mon Sep 17 00:00:00 2001 From: Rens Groothuijsen Date: Fri, 20 Oct 2023 12:32:46 +0200 Subject: [PATCH 11/17] Set test group interval default to evaluation interval (#13011) Signed-off-by: Rens Groothuijsen --- cmd/promtool/testdata/no-test-group-interval.yml | 15 +++++++++++++++ cmd/promtool/unittest.go | 3 +++ cmd/promtool/unittest_test.go | 10 ++++++++++ docs/configuration/unit_testing_rules.md | 2 +- 4 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 cmd/promtool/testdata/no-test-group-interval.yml diff --git a/cmd/promtool/testdata/no-test-group-interval.yml b/cmd/promtool/testdata/no-test-group-interval.yml new file mode 100644 index 0000000000..d1f6935cd6 --- /dev/null +++ b/cmd/promtool/testdata/no-test-group-interval.yml @@ -0,0 +1,15 @@ +tests: + - input_series: + - series: test + values: 0 1 + promql_expr_test: + - expr: test + eval_time: 59s + exp_samples: + - value: 0 + labels: test + - expr: test + eval_time: 1m + exp_samples: + - value: 1 + labels: test \ No newline at end of file diff --git a/cmd/promtool/unittest.go b/cmd/promtool/unittest.go index 5bec5c60b0..d37e03e52b 100644 --- a/cmd/promtool/unittest.go +++ b/cmd/promtool/unittest.go @@ -96,6 +96,9 @@ func ruleUnitTest(filename string, queryOpts promql.LazyLoaderOpts) []error { // Testing. var errs []error for _, t := range unitTestInp.Tests { + if t.Interval == 0 { + t.Interval = unitTestInp.EvaluationInterval + } ers := t.test(evalInterval, groupOrderMap, queryOpts, unitTestInp.RuleFiles...) if ers != nil { errs = append(errs, ers...) 
diff --git a/cmd/promtool/unittest_test.go b/cmd/promtool/unittest_test.go index 1e02440541..c96883113a 100644 --- a/cmd/promtool/unittest_test.go +++ b/cmd/promtool/unittest_test.go @@ -112,6 +112,16 @@ func TestRulesUnitTest(t *testing.T) { }, want: 0, }, + { + name: "No test group interval", + args: args{ + files: []string{"./testdata/no-test-group-interval.yml"}, + }, + queryOpts: promql.LazyLoaderOpts{ + EnableNegativeOffset: true, + }, + want: 0, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/docs/configuration/unit_testing_rules.md b/docs/configuration/unit_testing_rules.md index c583ecfdaa..163fcb91f1 100644 --- a/docs/configuration/unit_testing_rules.md +++ b/docs/configuration/unit_testing_rules.md @@ -39,7 +39,7 @@ tests: ``` yaml # Series data -interval: +[ interval: | default = evaluation_interval ] input_series: [ - ] From 498b836654e074451d24bb10151446c5b70441e0 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Sat, 21 Oct 2023 10:57:19 +0200 Subject: [PATCH 12/17] Refactoring manager.go into separate concerns Signed-off-by: Danny Kopping --- rules/group.go | 863 ++++++++++++++++++++++++++++++++++++++++++++++ rules/manager.go | 871 ----------------------------------------------- rules/rule.go | 64 ++++ 3 files changed, 927 insertions(+), 871 deletions(-) create mode 100644 rules/group.go create mode 100644 rules/rule.go diff --git a/rules/group.go b/rules/group.go new file mode 100644 index 0000000000..5eba20767f --- /dev/null +++ b/rules/group.go @@ -0,0 +1,863 @@ +// Copyright 2013 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package rules + +import ( + "context" + "errors" + "math" + "strings" + "sync" + "time" + + "golang.org/x/exp/slices" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/model" + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/codes" + + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/model/timestamp" + "github.com/prometheus/prometheus/model/value" + "github.com/prometheus/prometheus/promql" + "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/tsdb/chunkenc" +) + +// Group is a set of rules that have a logical relation. +type Group struct { + name string + file string + interval time.Duration + limit int + rules []Rule + seriesInPreviousEval []map[string]labels.Labels // One per Rule. + staleSeries []labels.Labels + opts *ManagerOptions + mtx sync.Mutex + evaluationTime time.Duration + lastEvaluation time.Time // Wall-clock time of most recent evaluation. + lastEvalTimestamp time.Time // Time slot used for most recent evaluation. + + shouldRestore bool + + markStale bool + done chan struct{} + terminated chan struct{} + managerDone chan struct{} + + logger log.Logger + + metrics *Metrics + + // Rule group evaluation iteration function, + // defaults to DefaultEvalIterationFunc. 
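+	// It is invoked once per evaluation interval (see GroupEvalIterationFunc).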
+ evalIterationFunc GroupEvalIterationFunc +} + +// GroupEvalIterationFunc is used to implement and extend rule group +// evaluation iteration logic. It is configured in Group.evalIterationFunc, +// and periodically invoked at each group evaluation interval to +// evaluate the rules in the group at that point in time. +// DefaultEvalIterationFunc is the default implementation. +type GroupEvalIterationFunc func(ctx context.Context, g *Group, evalTimestamp time.Time) + +type GroupOptions struct { + Name, File string + Interval time.Duration + Limit int + Rules []Rule + ShouldRestore bool + Opts *ManagerOptions + done chan struct{} + EvalIterationFunc GroupEvalIterationFunc +} + +// NewGroup makes a new Group with the given name, options, and rules. +func NewGroup(o GroupOptions) *Group { + metrics := o.Opts.Metrics + if metrics == nil { + metrics = NewGroupMetrics(o.Opts.Registerer) + } + + key := GroupKey(o.File, o.Name) + metrics.IterationsMissed.WithLabelValues(key) + metrics.IterationsScheduled.WithLabelValues(key) + metrics.EvalTotal.WithLabelValues(key) + metrics.EvalFailures.WithLabelValues(key) + metrics.GroupLastEvalTime.WithLabelValues(key) + metrics.GroupLastDuration.WithLabelValues(key) + metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules))) + metrics.GroupSamples.WithLabelValues(key) + metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds()) + + evalIterationFunc := o.EvalIterationFunc + if evalIterationFunc == nil { + evalIterationFunc = DefaultEvalIterationFunc + } + + return &Group{ + name: o.Name, + file: o.File, + interval: o.Interval, + limit: o.Limit, + rules: o.Rules, + shouldRestore: o.ShouldRestore, + opts: o.Opts, + seriesInPreviousEval: make([]map[string]labels.Labels, len(o.Rules)), + done: make(chan struct{}), + managerDone: o.done, + terminated: make(chan struct{}), + logger: log.With(o.Opts.Logger, "file", o.File, "group", o.Name), + metrics: metrics, + evalIterationFunc: evalIterationFunc, + } +} + +// Name returns the group name. +func (g *Group) Name() string { return g.name } + +// File returns the group's file. +func (g *Group) File() string { return g.file } + +// Rules returns the group's rules. +func (g *Group) Rules() []Rule { return g.rules } + +// Queryable returns the group's querable. +func (g *Group) Queryable() storage.Queryable { return g.opts.Queryable } + +// Context returns the group's context. +func (g *Group) Context() context.Context { return g.opts.Context } + +// Interval returns the group's interval. +func (g *Group) Interval() time.Duration { return g.interval } + +// Limit returns the group's limit. +func (g *Group) Limit() int { return g.limit } + +func (g *Group) Logger() log.Logger { return g.logger } + +func (g *Group) run(ctx context.Context) { + defer close(g.terminated) + + // Wait an initial amount to have consistently slotted intervals. + evalTimestamp := g.EvalTimestamp(time.Now().UnixNano()).Add(g.interval) + select { + case <-time.After(time.Until(evalTimestamp)): + case <-g.done: + return + } + + ctx = promql.NewOriginContext(ctx, map[string]interface{}{ + "ruleGroup": map[string]string{ + "file": g.File(), + "name": g.Name(), + }, + }) + + // The assumption here is that since the ticker was started after having + // waited for `evalTimestamp` to pass, the ticks will trigger soon + // after each `evalTimestamp + N * g.interval` occurrence. 
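+	// For example, with a 1m interval and an evalTimestamp of 12:00:30, ticks are
+	// expected shortly after 12:01:30, 12:02:30, and so on.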
+ tick := time.NewTicker(g.interval) + defer tick.Stop() + + defer func() { + if !g.markStale { + return + } + go func(now time.Time) { + for _, rule := range g.seriesInPreviousEval { + for _, r := range rule { + g.staleSeries = append(g.staleSeries, r) + } + } + // That can be garbage collected at this point. + g.seriesInPreviousEval = nil + // Wait for 2 intervals to give the opportunity to renamed rules + // to insert new series in the tsdb. At this point if there is a + // renamed rule, it should already be started. + select { + case <-g.managerDone: + case <-time.After(2 * g.interval): + g.cleanupStaleSeries(ctx, now) + } + }(time.Now()) + }() + + g.evalIterationFunc(ctx, g, evalTimestamp) + if g.shouldRestore { + // If we have to restore, we wait for another Eval to finish. + // The reason behind this is, during first eval (or before it) + // we might not have enough data scraped, and recording rules would not + // have updated the latest values, on which some alerts might depend. + select { + case <-g.done: + return + case <-tick.C: + missed := (time.Since(evalTimestamp) / g.interval) - 1 + if missed > 0 { + g.metrics.IterationsMissed.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed)) + g.metrics.IterationsScheduled.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed)) + } + evalTimestamp = evalTimestamp.Add((missed + 1) * g.interval) + g.evalIterationFunc(ctx, g, evalTimestamp) + } + + g.RestoreForState(time.Now()) + g.shouldRestore = false + } + + for { + select { + case <-g.done: + return + default: + select { + case <-g.done: + return + case <-tick.C: + missed := (time.Since(evalTimestamp) / g.interval) - 1 + if missed > 0 { + g.metrics.IterationsMissed.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed)) + g.metrics.IterationsScheduled.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed)) + } + evalTimestamp = evalTimestamp.Add((missed + 1) * g.interval) + + g.evalIterationFunc(ctx, g, evalTimestamp) + } + } + } +} + +func (g *Group) stop() { + close(g.done) + <-g.terminated +} + +func (g *Group) hash() uint64 { + l := labels.New( + labels.Label{Name: "name", Value: g.name}, + labels.Label{Name: "file", Value: g.file}, + ) + return l.Hash() +} + +// AlertingRules returns the list of the group's alerting rules. +func (g *Group) AlertingRules() []*AlertingRule { + g.mtx.Lock() + defer g.mtx.Unlock() + + var alerts []*AlertingRule + for _, rule := range g.rules { + if alertingRule, ok := rule.(*AlertingRule); ok { + alerts = append(alerts, alertingRule) + } + } + slices.SortFunc(alerts, func(a, b *AlertingRule) int { + if a.State() == b.State() { + return strings.Compare(a.Name(), b.Name()) + } + return int(b.State() - a.State()) + }) + return alerts +} + +// HasAlertingRules returns true if the group contains at least one AlertingRule. +func (g *Group) HasAlertingRules() bool { + g.mtx.Lock() + defer g.mtx.Unlock() + + for _, rule := range g.rules { + if _, ok := rule.(*AlertingRule); ok { + return true + } + } + return false +} + +// GetEvaluationTime returns the time in seconds it took to evaluate the rule group. +func (g *Group) GetEvaluationTime() time.Duration { + g.mtx.Lock() + defer g.mtx.Unlock() + return g.evaluationTime +} + +// setEvaluationTime sets the time in seconds the last evaluation took. 
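+// It also updates the group's last-duration metric.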
+func (g *Group) setEvaluationTime(dur time.Duration) { + g.metrics.GroupLastDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(dur.Seconds()) + + g.mtx.Lock() + defer g.mtx.Unlock() + g.evaluationTime = dur +} + +// GetLastEvaluation returns the time the last evaluation of the rule group took place. +func (g *Group) GetLastEvaluation() time.Time { + g.mtx.Lock() + defer g.mtx.Unlock() + return g.lastEvaluation +} + +// setLastEvaluation updates evaluationTimestamp to the timestamp of when the rule group was last evaluated. +func (g *Group) setLastEvaluation(ts time.Time) { + g.metrics.GroupLastEvalTime.WithLabelValues(GroupKey(g.file, g.name)).Set(float64(ts.UnixNano()) / 1e9) + + g.mtx.Lock() + defer g.mtx.Unlock() + g.lastEvaluation = ts +} + +// GetLastEvalTimestamp returns the timestamp of the last evaluation. +func (g *Group) GetLastEvalTimestamp() time.Time { + g.mtx.Lock() + defer g.mtx.Unlock() + return g.lastEvalTimestamp +} + +// setLastEvalTimestamp updates lastEvalTimestamp to the timestamp of the last evaluation. +func (g *Group) setLastEvalTimestamp(ts time.Time) { + g.mtx.Lock() + defer g.mtx.Unlock() + g.lastEvalTimestamp = ts +} + +// EvalTimestamp returns the immediately preceding consistently slotted evaluation time. +func (g *Group) EvalTimestamp(startTime int64) time.Time { + var ( + offset = int64(g.hash() % uint64(g.interval)) + + // This group's evaluation times differ from the perfect time intervals by `offset` nanoseconds. + // But we can only use `% interval` to align with the interval. And `% interval` will always + // align with the perfect time intervals, instead of this group's. Because of this we add + // `offset` _after_ aligning with the perfect time interval. + // + // There can be cases where adding `offset` to the perfect evaluation time can yield a + // timestamp in the future, which is not what EvalTimestamp should do. + // So we subtract one `offset` to make sure that `now - (now % interval) + offset` gives an + // evaluation time in the past. + adjNow = startTime - offset + + // Adjust to perfect evaluation intervals. + base = adjNow - (adjNow % int64(g.interval)) + + // Add one offset to randomize the evaluation times of this group. + next = base + offset + ) + + return time.Unix(0, next).UTC() +} + +func nameAndLabels(rule Rule) string { + return rule.Name() + rule.Labels().String() +} + +// CopyState copies the alerting rule and staleness related state from the given group. +// +// Rules are matched based on their name and labels. If there are duplicates, the +// first is matched with the first, second with the second etc. +func (g *Group) CopyState(from *Group) { + g.evaluationTime = from.evaluationTime + g.lastEvaluation = from.lastEvaluation + + ruleMap := make(map[string][]int, len(from.rules)) + + for fi, fromRule := range from.rules { + nameAndLabels := nameAndLabels(fromRule) + l := ruleMap[nameAndLabels] + ruleMap[nameAndLabels] = append(l, fi) + } + + for i, rule := range g.rules { + nameAndLabels := nameAndLabels(rule) + indexes := ruleMap[nameAndLabels] + if len(indexes) == 0 { + continue + } + fi := indexes[0] + g.seriesInPreviousEval[i] = from.seriesInPreviousEval[fi] + ruleMap[nameAndLabels] = indexes[1:] + + ar, ok := rule.(*AlertingRule) + if !ok { + continue + } + far, ok := from.rules[fi].(*AlertingRule) + if !ok { + continue + } + + for fp, a := range far.active { + ar.active[fp] = a + } + } + + // Handle deleted and unmatched duplicate rules. 
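+	// Any series they produced in the previous evaluation are collected so they
+	// can be marked stale later.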
+ g.staleSeries = from.staleSeries + for fi, fromRule := range from.rules { + nameAndLabels := nameAndLabels(fromRule) + l := ruleMap[nameAndLabels] + if len(l) != 0 { + for _, series := range from.seriesInPreviousEval[fi] { + g.staleSeries = append(g.staleSeries, series) + } + } + } +} + +// Eval runs a single evaluation cycle in which all rules are evaluated sequentially. +func (g *Group) Eval(ctx context.Context, ts time.Time) { + var samplesTotal float64 + for i, rule := range g.rules { + select { + case <-g.done: + return + default: + } + + func(i int, rule Rule) { + ctx, sp := otel.Tracer("").Start(ctx, "rule") + sp.SetAttributes(attribute.String("name", rule.Name())) + defer func(t time.Time) { + sp.End() + + since := time.Since(t) + g.metrics.EvalDuration.Observe(since.Seconds()) + rule.SetEvaluationDuration(since) + rule.SetEvaluationTimestamp(t) + }(time.Now()) + + g.metrics.EvalTotal.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + + vector, err := rule.Eval(ctx, ts, g.opts.QueryFunc, g.opts.ExternalURL, g.Limit()) + if err != nil { + rule.SetHealth(HealthBad) + rule.SetLastError(err) + sp.SetStatus(codes.Error, err.Error()) + g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + + // Canceled queries are intentional termination of queries. This normally + // happens on shutdown and thus we skip logging of any errors here. + var eqc promql.ErrQueryCanceled + if !errors.As(err, &eqc) { + level.Warn(g.logger).Log("name", rule.Name(), "index", i, "msg", "Evaluating rule failed", "rule", rule, "err", err) + } + return + } + rule.SetHealth(HealthGood) + rule.SetLastError(nil) + samplesTotal += float64(len(vector)) + + if ar, ok := rule.(*AlertingRule); ok { + ar.sendAlerts(ctx, ts, g.opts.ResendDelay, g.interval, g.opts.NotifyFunc) + } + var ( + numOutOfOrder = 0 + numTooOld = 0 + numDuplicates = 0 + ) + + app := g.opts.Appendable.Appender(ctx) + seriesReturned := make(map[string]labels.Labels, len(g.seriesInPreviousEval[i])) + defer func() { + if err := app.Commit(); err != nil { + rule.SetHealth(HealthBad) + rule.SetLastError(err) + sp.SetStatus(codes.Error, err.Error()) + g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + + level.Warn(g.logger).Log("name", rule.Name(), "index", i, "msg", "Rule sample appending failed", "err", err) + return + } + g.seriesInPreviousEval[i] = seriesReturned + }() + + for _, s := range vector { + if s.H != nil { + _, err = app.AppendHistogram(0, s.Metric, s.T, nil, s.H) + } else { + _, err = app.Append(0, s.Metric, s.T, s.F) + } + + if err != nil { + rule.SetHealth(HealthBad) + rule.SetLastError(err) + sp.SetStatus(codes.Error, err.Error()) + unwrappedErr := errors.Unwrap(err) + if unwrappedErr == nil { + unwrappedErr = err + } + switch { + case errors.Is(unwrappedErr, storage.ErrOutOfOrderSample): + numOutOfOrder++ + level.Debug(g.logger).Log("name", rule.Name(), "index", i, "msg", "Rule evaluation result discarded", "err", err, "sample", s) + case errors.Is(unwrappedErr, storage.ErrTooOldSample): + numTooOld++ + level.Debug(g.logger).Log("name", rule.Name(), "index", i, "msg", "Rule evaluation result discarded", "err", err, "sample", s) + case errors.Is(unwrappedErr, storage.ErrDuplicateSampleForTimestamp): + numDuplicates++ + level.Debug(g.logger).Log("name", rule.Name(), "index", i, "msg", "Rule evaluation result discarded", "err", err, "sample", s) + default: + level.Warn(g.logger).Log("name", rule.Name(), "index", i, "msg", "Rule evaluation result discarded", "err", err, "sample", s) + } + } 
else {
+					buf := [1024]byte{}
+					seriesReturned[string(s.Metric.Bytes(buf[:]))] = s.Metric
+				}
+			}
+			if numOutOfOrder > 0 {
+				level.Warn(g.logger).Log("name", rule.Name(), "index", i, "msg", "Error on ingesting out-of-order result from rule evaluation", "numDropped", numOutOfOrder)
+			}
+			if numTooOld > 0 {
+				level.Warn(g.logger).Log("name", rule.Name(), "index", i, "msg", "Error on ingesting too old result from rule evaluation", "numDropped", numTooOld)
+			}
+			if numDuplicates > 0 {
+				level.Warn(g.logger).Log("name", rule.Name(), "index", i, "msg", "Error on ingesting results from rule evaluation with different value but same timestamp", "numDropped", numDuplicates)
+			}
+
+			for metric, lset := range g.seriesInPreviousEval[i] {
+				if _, ok := seriesReturned[metric]; !ok {
+					// Series no longer exposed, mark it stale.
+					_, err = app.Append(0, lset, timestamp.FromTime(ts), math.Float64frombits(value.StaleNaN))
+					unwrappedErr := errors.Unwrap(err)
+					if unwrappedErr == nil {
+						unwrappedErr = err
+					}
+					switch {
+					case unwrappedErr == nil:
+					case errors.Is(unwrappedErr, storage.ErrOutOfOrderSample),
+						errors.Is(unwrappedErr, storage.ErrTooOldSample),
+						errors.Is(unwrappedErr, storage.ErrDuplicateSampleForTimestamp):
+						// Do not count these in logging, as this is expected if series
+						// is exposed from a different rule.
+					default:
+						level.Warn(g.logger).Log("name", rule.Name(), "index", i, "msg", "Adding stale sample failed", "sample", lset.String(), "err", err)
+					}
+				}
+			}
+		}(i, rule)
+	}
+	if g.metrics != nil {
+		g.metrics.GroupSamples.WithLabelValues(GroupKey(g.File(), g.Name())).Set(samplesTotal)
+	}
+	g.cleanupStaleSeries(ctx, ts)
+}
+
+func (g *Group) cleanupStaleSeries(ctx context.Context, ts time.Time) {
+	if len(g.staleSeries) == 0 {
+		return
+	}
+	app := g.opts.Appendable.Appender(ctx)
+	for _, s := range g.staleSeries {
+		// Rule that produced series no longer configured, mark it stale.
+		_, err := app.Append(0, s, timestamp.FromTime(ts), math.Float64frombits(value.StaleNaN))
+		unwrappedErr := errors.Unwrap(err)
+		if unwrappedErr == nil {
+			unwrappedErr = err
+		}
+		switch {
+		case unwrappedErr == nil:
+		case errors.Is(unwrappedErr, storage.ErrOutOfOrderSample),
+			errors.Is(unwrappedErr, storage.ErrTooOldSample),
+			errors.Is(unwrappedErr, storage.ErrDuplicateSampleForTimestamp):
+			// Do not count these in logging, as this is expected if series
+			// is exposed from a different rule.
+		default:
+			level.Warn(g.logger).Log("msg", "Adding stale sample for previous configuration failed", "sample", s, "err", err)
+		}
+	}
+	if err := app.Commit(); err != nil {
+		level.Warn(g.logger).Log("msg", "Stale sample appending for previous configuration failed", "err", err)
+	} else {
+		g.staleSeries = nil
+	}
+}
+
+// RestoreForState restores the 'for' state of the alerts
+// by looking up last ActiveAt from storage.
+func (g *Group) RestoreForState(ts time.Time) {
+	maxtMS := int64(model.TimeFromUnixNano(ts.UnixNano()))
+	// We allow restoration only if alerts were active within the outage tolerance window.
+ mint := ts.Add(-g.opts.OutageTolerance) + mintMS := int64(model.TimeFromUnixNano(mint.UnixNano())) + q, err := g.opts.Queryable.Querier(mintMS, maxtMS) + if err != nil { + level.Error(g.logger).Log("msg", "Failed to get Querier", "err", err) + return + } + defer func() { + if err := q.Close(); err != nil { + level.Error(g.logger).Log("msg", "Failed to close Querier", "err", err) + } + }() + + for _, rule := range g.Rules() { + alertRule, ok := rule.(*AlertingRule) + if !ok { + continue + } + + alertHoldDuration := alertRule.HoldDuration() + if alertHoldDuration < g.opts.ForGracePeriod { + // If alertHoldDuration is already less than grace period, we would not + // like to make it wait for `g.opts.ForGracePeriod` time before firing. + // Hence we skip restoration, which will make it wait for alertHoldDuration. + alertRule.SetRestored(true) + continue + } + + alertRule.ForEachActiveAlert(func(a *Alert) { + var s storage.Series + + s, err := alertRule.QueryforStateSeries(g.opts.Context, a, q) + if err != nil { + // Querier Warnings are ignored. We do not care unless we have an error. + level.Error(g.logger).Log( + "msg", "Failed to restore 'for' state", + labels.AlertName, alertRule.Name(), + "stage", "Select", + "err", err, + ) + return + } + + if s == nil { + return + } + + // Series found for the 'for' state. + var t int64 + var v float64 + it := s.Iterator(nil) + for it.Next() == chunkenc.ValFloat { + t, v = it.At() + } + if it.Err() != nil { + level.Error(g.logger).Log("msg", "Failed to restore 'for' state", + labels.AlertName, alertRule.Name(), "stage", "Iterator", "err", it.Err()) + return + } + if value.IsStaleNaN(v) { // Alert was not active. + return + } + + downAt := time.Unix(t/1000, 0).UTC() + restoredActiveAt := time.Unix(int64(v), 0).UTC() + timeSpentPending := downAt.Sub(restoredActiveAt) + timeRemainingPending := alertHoldDuration - timeSpentPending + + switch { + case timeRemainingPending <= 0: + // It means that alert was firing when prometheus went down. + // In the next Eval, the state of this alert will be set back to + // firing again if it's still firing in that Eval. + // Nothing to be done in this case. + case timeRemainingPending < g.opts.ForGracePeriod: + // (new) restoredActiveAt = (ts + m.opts.ForGracePeriod) - alertHoldDuration + // /* new firing time */ /* moving back by hold duration */ + // + // Proof of correctness: + // firingTime = restoredActiveAt.Add(alertHoldDuration) + // = ts + m.opts.ForGracePeriod - alertHoldDuration + alertHoldDuration + // = ts + m.opts.ForGracePeriod + // + // Time remaining to fire = firingTime.Sub(ts) + // = (ts + m.opts.ForGracePeriod) - ts + // = m.opts.ForGracePeriod + restoredActiveAt = ts.Add(g.opts.ForGracePeriod).Add(-alertHoldDuration) + default: + // By shifting ActiveAt to the future (ActiveAt + some_duration), + // the total pending time from the original ActiveAt + // would be `alertHoldDuration + some_duration`. + // Here, some_duration = downDuration. + downDuration := ts.Sub(downAt) + restoredActiveAt = restoredActiveAt.Add(downDuration) + } + + a.ActiveAt = restoredActiveAt + level.Debug(g.logger).Log("msg", "'for' state restored", + labels.AlertName, alertRule.Name(), "restored_time", a.ActiveAt.Format(time.RFC850), + "labels", a.Labels.String()) + }) + + alertRule.SetRestored(true) + } +} + +// Equals return if two groups are the same. 
+func (g *Group) Equals(ng *Group) bool { + if g.name != ng.name { + return false + } + + if g.file != ng.file { + return false + } + + if g.interval != ng.interval { + return false + } + + if g.limit != ng.limit { + return false + } + + if len(g.rules) != len(ng.rules) { + return false + } + + for i, gr := range g.rules { + if gr.String() != ng.rules[i].String() { + return false + } + } + + return true +} + +// GroupKey group names need not be unique across filenames. +func GroupKey(file, name string) string { + return file + ";" + name +} + +// Constants for instrumentation. +const namespace = "prometheus" + +// Metrics for rule evaluation. +type Metrics struct { + EvalDuration prometheus.Summary + IterationDuration prometheus.Summary + IterationsMissed *prometheus.CounterVec + IterationsScheduled *prometheus.CounterVec + EvalTotal *prometheus.CounterVec + EvalFailures *prometheus.CounterVec + GroupInterval *prometheus.GaugeVec + GroupLastEvalTime *prometheus.GaugeVec + GroupLastDuration *prometheus.GaugeVec + GroupRules *prometheus.GaugeVec + GroupSamples *prometheus.GaugeVec +} + +// NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer, +// if not nil. +func NewGroupMetrics(reg prometheus.Registerer) *Metrics { + m := &Metrics{ + EvalDuration: prometheus.NewSummary( + prometheus.SummaryOpts{ + Namespace: namespace, + Name: "rule_evaluation_duration_seconds", + Help: "The duration for a rule to execute.", + Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, + }), + IterationDuration: prometheus.NewSummary(prometheus.SummaryOpts{ + Namespace: namespace, + Name: "rule_group_duration_seconds", + Help: "The duration of rule group evaluations.", + Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001}, + }), + IterationsMissed: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: "rule_group_iterations_missed_total", + Help: "The total number of rule group evaluations missed due to slow rule group evaluation.", + }, + []string{"rule_group"}, + ), + IterationsScheduled: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: "rule_group_iterations_total", + Help: "The total number of scheduled rule group evaluations, whether executed or missed.", + }, + []string{"rule_group"}, + ), + EvalTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: "rule_evaluations_total", + Help: "The total number of rule evaluations.", + }, + []string{"rule_group"}, + ), + EvalFailures: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: "rule_evaluation_failures_total", + Help: "The total number of rule evaluation failures.", + }, + []string{"rule_group"}, + ), + GroupInterval: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "rule_group_interval_seconds", + Help: "The interval of a rule group.", + }, + []string{"rule_group"}, + ), + GroupLastEvalTime: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "rule_group_last_evaluation_timestamp_seconds", + Help: "The timestamp of the last rule group evaluation in seconds.", + }, + []string{"rule_group"}, + ), + GroupLastDuration: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "rule_group_last_duration_seconds", + Help: "The duration of the last rule group evaluation.", + }, + []string{"rule_group"}, + ), + GroupRules: prometheus.NewGaugeVec( + 
prometheus.GaugeOpts{ + Namespace: namespace, + Name: "rule_group_rules", + Help: "The number of rules.", + }, + []string{"rule_group"}, + ), + GroupSamples: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "rule_group_last_evaluation_samples", + Help: "The number of samples returned during the last rule group evaluation.", + }, + []string{"rule_group"}, + ), + } + + if reg != nil { + reg.MustRegister( + m.EvalDuration, + m.IterationDuration, + m.IterationsMissed, + m.IterationsScheduled, + m.EvalTotal, + m.EvalFailures, + m.GroupInterval, + m.GroupLastEvalTime, + m.GroupLastDuration, + m.GroupRules, + m.GroupSamples, + ) + } + + return m +} diff --git a/rules/manager.go b/rules/manager.go index 4e10d39c7d..eed314ade2 100644 --- a/rules/manager.go +++ b/rules/manager.go @@ -17,7 +17,6 @@ import ( "context" "errors" "fmt" - "math" "net/url" "strings" "sync" @@ -26,162 +25,17 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/common/model" - "go.opentelemetry.io/otel" - "go.opentelemetry.io/otel/attribute" - "go.opentelemetry.io/otel/codes" "golang.org/x/exp/slices" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/model/rulefmt" - "github.com/prometheus/prometheus/model/timestamp" - "github.com/prometheus/prometheus/model/value" "github.com/prometheus/prometheus/notifier" "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/promql/parser" "github.com/prometheus/prometheus/storage" - "github.com/prometheus/prometheus/tsdb/chunkenc" "github.com/prometheus/prometheus/util/strutil" ) -// RuleHealth describes the health state of a rule. -type RuleHealth string - -// The possible health states of a rule based on the last execution. -const ( - HealthUnknown RuleHealth = "unknown" - HealthGood RuleHealth = "ok" - HealthBad RuleHealth = "err" -) - -// Constants for instrumentation. -const namespace = "prometheus" - -// Metrics for rule evaluation. -type Metrics struct { - EvalDuration prometheus.Summary - IterationDuration prometheus.Summary - IterationsMissed *prometheus.CounterVec - IterationsScheduled *prometheus.CounterVec - EvalTotal *prometheus.CounterVec - EvalFailures *prometheus.CounterVec - GroupInterval *prometheus.GaugeVec - GroupLastEvalTime *prometheus.GaugeVec - GroupLastDuration *prometheus.GaugeVec - GroupRules *prometheus.GaugeVec - GroupSamples *prometheus.GaugeVec -} - -// NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer, -// if not nil. 
-func NewGroupMetrics(reg prometheus.Registerer) *Metrics { - m := &Metrics{ - EvalDuration: prometheus.NewSummary( - prometheus.SummaryOpts{ - Namespace: namespace, - Name: "rule_evaluation_duration_seconds", - Help: "The duration for a rule to execute.", - Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, - }), - IterationDuration: prometheus.NewSummary(prometheus.SummaryOpts{ - Namespace: namespace, - Name: "rule_group_duration_seconds", - Help: "The duration of rule group evaluations.", - Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001}, - }), - IterationsMissed: prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "rule_group_iterations_missed_total", - Help: "The total number of rule group evaluations missed due to slow rule group evaluation.", - }, - []string{"rule_group"}, - ), - IterationsScheduled: prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "rule_group_iterations_total", - Help: "The total number of scheduled rule group evaluations, whether executed or missed.", - }, - []string{"rule_group"}, - ), - EvalTotal: prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "rule_evaluations_total", - Help: "The total number of rule evaluations.", - }, - []string{"rule_group"}, - ), - EvalFailures: prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "rule_evaluation_failures_total", - Help: "The total number of rule evaluation failures.", - }, - []string{"rule_group"}, - ), - GroupInterval: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: "rule_group_interval_seconds", - Help: "The interval of a rule group.", - }, - []string{"rule_group"}, - ), - GroupLastEvalTime: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: "rule_group_last_evaluation_timestamp_seconds", - Help: "The timestamp of the last rule group evaluation in seconds.", - }, - []string{"rule_group"}, - ), - GroupLastDuration: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: "rule_group_last_duration_seconds", - Help: "The duration of the last rule group evaluation.", - }, - []string{"rule_group"}, - ), - GroupRules: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: "rule_group_rules", - Help: "The number of rules.", - }, - []string{"rule_group"}, - ), - GroupSamples: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: namespace, - Name: "rule_group_last_evaluation_samples", - Help: "The number of samples returned during the last rule group evaluation.", - }, - []string{"rule_group"}, - ), - } - - if reg != nil { - reg.MustRegister( - m.EvalDuration, - m.IterationDuration, - m.IterationsMissed, - m.IterationsScheduled, - m.EvalTotal, - m.EvalFailures, - m.GroupInterval, - m.GroupLastEvalTime, - m.GroupLastDuration, - m.GroupRules, - m.GroupSamples, - ) - } - - return m -} - // QueryFunc processes PromQL queries. type QueryFunc func(ctx context.Context, q string, t time.Time) (promql.Vector, error) @@ -213,241 +67,6 @@ func EngineQueryFunc(engine *promql.Engine, q storage.Queryable) QueryFunc { } } -// A Rule encapsulates a vector expression which is evaluated at a specified -// interval and acted upon (currently either recorded or used for alerting). -type Rule interface { - Name() string - // Labels of the rule. 
- Labels() labels.Labels - // eval evaluates the rule, including any associated recording or alerting actions. - Eval(context.Context, time.Time, QueryFunc, *url.URL, int) (promql.Vector, error) - // String returns a human-readable string representation of the rule. - String() string - // Query returns the rule query expression. - Query() parser.Expr - // SetLastErr sets the current error experienced by the rule. - SetLastError(error) - // LastErr returns the last error experienced by the rule. - LastError() error - // SetHealth sets the current health of the rule. - SetHealth(RuleHealth) - // Health returns the current health of the rule. - Health() RuleHealth - SetEvaluationDuration(time.Duration) - // GetEvaluationDuration returns last evaluation duration. - // NOTE: Used dynamically by rules.html template. - GetEvaluationDuration() time.Duration - SetEvaluationTimestamp(time.Time) - // GetEvaluationTimestamp returns last evaluation timestamp. - // NOTE: Used dynamically by rules.html template. - GetEvaluationTimestamp() time.Time -} - -// Group is a set of rules that have a logical relation. -type Group struct { - name string - file string - interval time.Duration - limit int - rules []Rule - seriesInPreviousEval []map[string]labels.Labels // One per Rule. - staleSeries []labels.Labels - opts *ManagerOptions - mtx sync.Mutex - evaluationTime time.Duration - lastEvaluation time.Time // Wall-clock time of most recent evaluation. - lastEvalTimestamp time.Time // Time slot used for most recent evaluation. - - shouldRestore bool - - markStale bool - done chan struct{} - terminated chan struct{} - managerDone chan struct{} - - logger log.Logger - - metrics *Metrics - - // Rule group evaluation iteration function, - // defaults to DefaultEvalIterationFunc. - evalIterationFunc GroupEvalIterationFunc -} - -// GroupEvalIterationFunc is used to implement and extend rule group -// evaluation iteration logic. It is configured in Group.evalIterationFunc, -// and periodically invoked at each group evaluation interval to -// evaluate the rules in the group at that point in time. -// DefaultEvalIterationFunc is the default implementation. -type GroupEvalIterationFunc func(ctx context.Context, g *Group, evalTimestamp time.Time) - -type GroupOptions struct { - Name, File string - Interval time.Duration - Limit int - Rules []Rule - ShouldRestore bool - Opts *ManagerOptions - done chan struct{} - EvalIterationFunc GroupEvalIterationFunc -} - -// NewGroup makes a new Group with the given name, options, and rules. 
-func NewGroup(o GroupOptions) *Group { - metrics := o.Opts.Metrics - if metrics == nil { - metrics = NewGroupMetrics(o.Opts.Registerer) - } - - key := GroupKey(o.File, o.Name) - metrics.IterationsMissed.WithLabelValues(key) - metrics.IterationsScheduled.WithLabelValues(key) - metrics.EvalTotal.WithLabelValues(key) - metrics.EvalFailures.WithLabelValues(key) - metrics.GroupLastEvalTime.WithLabelValues(key) - metrics.GroupLastDuration.WithLabelValues(key) - metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules))) - metrics.GroupSamples.WithLabelValues(key) - metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds()) - - evalIterationFunc := o.EvalIterationFunc - if evalIterationFunc == nil { - evalIterationFunc = DefaultEvalIterationFunc - } - - return &Group{ - name: o.Name, - file: o.File, - interval: o.Interval, - limit: o.Limit, - rules: o.Rules, - shouldRestore: o.ShouldRestore, - opts: o.Opts, - seriesInPreviousEval: make([]map[string]labels.Labels, len(o.Rules)), - done: make(chan struct{}), - managerDone: o.done, - terminated: make(chan struct{}), - logger: log.With(o.Opts.Logger, "file", o.File, "group", o.Name), - metrics: metrics, - evalIterationFunc: evalIterationFunc, - } -} - -// Name returns the group name. -func (g *Group) Name() string { return g.name } - -// File returns the group's file. -func (g *Group) File() string { return g.file } - -// Rules returns the group's rules. -func (g *Group) Rules() []Rule { return g.rules } - -// Queryable returns the group's querable. -func (g *Group) Queryable() storage.Queryable { return g.opts.Queryable } - -// Context returns the group's context. -func (g *Group) Context() context.Context { return g.opts.Context } - -// Interval returns the group's interval. -func (g *Group) Interval() time.Duration { return g.interval } - -// Limit returns the group's limit. -func (g *Group) Limit() int { return g.limit } - -func (g *Group) Logger() log.Logger { return g.logger } - -func (g *Group) run(ctx context.Context) { - defer close(g.terminated) - - // Wait an initial amount to have consistently slotted intervals. - evalTimestamp := g.EvalTimestamp(time.Now().UnixNano()).Add(g.interval) - select { - case <-time.After(time.Until(evalTimestamp)): - case <-g.done: - return - } - - ctx = promql.NewOriginContext(ctx, map[string]interface{}{ - "ruleGroup": map[string]string{ - "file": g.File(), - "name": g.Name(), - }, - }) - - // The assumption here is that since the ticker was started after having - // waited for `evalTimestamp` to pass, the ticks will trigger soon - // after each `evalTimestamp + N * g.interval` occurrence. - tick := time.NewTicker(g.interval) - defer tick.Stop() - - defer func() { - if !g.markStale { - return - } - go func(now time.Time) { - for _, rule := range g.seriesInPreviousEval { - for _, r := range rule { - g.staleSeries = append(g.staleSeries, r) - } - } - // That can be garbage collected at this point. - g.seriesInPreviousEval = nil - // Wait for 2 intervals to give the opportunity to renamed rules - // to insert new series in the tsdb. At this point if there is a - // renamed rule, it should already be started. - select { - case <-g.managerDone: - case <-time.After(2 * g.interval): - g.cleanupStaleSeries(ctx, now) - } - }(time.Now()) - }() - - g.evalIterationFunc(ctx, g, evalTimestamp) - if g.shouldRestore { - // If we have to restore, we wait for another Eval to finish. 
- // The reason behind this is, during first eval (or before it) - // we might not have enough data scraped, and recording rules would not - // have updated the latest values, on which some alerts might depend. - select { - case <-g.done: - return - case <-tick.C: - missed := (time.Since(evalTimestamp) / g.interval) - 1 - if missed > 0 { - g.metrics.IterationsMissed.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed)) - g.metrics.IterationsScheduled.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed)) - } - evalTimestamp = evalTimestamp.Add((missed + 1) * g.interval) - g.evalIterationFunc(ctx, g, evalTimestamp) - } - - g.RestoreForState(time.Now()) - g.shouldRestore = false - } - - for { - select { - case <-g.done: - return - default: - select { - case <-g.done: - return - case <-tick.C: - missed := (time.Since(evalTimestamp) / g.interval) - 1 - if missed > 0 { - g.metrics.IterationsMissed.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed)) - g.metrics.IterationsScheduled.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed)) - } - evalTimestamp = evalTimestamp.Add((missed + 1) * g.interval) - - g.evalIterationFunc(ctx, g, evalTimestamp) - } - } - } -} - // DefaultEvalIterationFunc is the default implementation of // GroupEvalIterationFunc that is periodically invoked to evaluate the rules // in a group at a given point in time and updates Group state and metrics @@ -467,491 +86,6 @@ func DefaultEvalIterationFunc(ctx context.Context, g *Group, evalTimestamp time. g.setLastEvalTimestamp(evalTimestamp) } -func (g *Group) stop() { - close(g.done) - <-g.terminated -} - -func (g *Group) hash() uint64 { - l := labels.New( - labels.Label{Name: "name", Value: g.name}, - labels.Label{Name: "file", Value: g.file}, - ) - return l.Hash() -} - -// AlertingRules returns the list of the group's alerting rules. -func (g *Group) AlertingRules() []*AlertingRule { - g.mtx.Lock() - defer g.mtx.Unlock() - - var alerts []*AlertingRule - for _, rule := range g.rules { - if alertingRule, ok := rule.(*AlertingRule); ok { - alerts = append(alerts, alertingRule) - } - } - slices.SortFunc(alerts, func(a, b *AlertingRule) int { - if a.State() == b.State() { - return strings.Compare(a.Name(), b.Name()) - } - return int(b.State() - a.State()) - }) - return alerts -} - -// HasAlertingRules returns true if the group contains at least one AlertingRule. -func (g *Group) HasAlertingRules() bool { - g.mtx.Lock() - defer g.mtx.Unlock() - - for _, rule := range g.rules { - if _, ok := rule.(*AlertingRule); ok { - return true - } - } - return false -} - -// GetEvaluationTime returns the time in seconds it took to evaluate the rule group. -func (g *Group) GetEvaluationTime() time.Duration { - g.mtx.Lock() - defer g.mtx.Unlock() - return g.evaluationTime -} - -// setEvaluationTime sets the time in seconds the last evaluation took. -func (g *Group) setEvaluationTime(dur time.Duration) { - g.metrics.GroupLastDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(dur.Seconds()) - - g.mtx.Lock() - defer g.mtx.Unlock() - g.evaluationTime = dur -} - -// GetLastEvaluation returns the time the last evaluation of the rule group took place. -func (g *Group) GetLastEvaluation() time.Time { - g.mtx.Lock() - defer g.mtx.Unlock() - return g.lastEvaluation -} - -// setLastEvaluation updates evaluationTimestamp to the timestamp of when the rule group was last evaluated. 
-func (g *Group) setLastEvaluation(ts time.Time) { - g.metrics.GroupLastEvalTime.WithLabelValues(GroupKey(g.file, g.name)).Set(float64(ts.UnixNano()) / 1e9) - - g.mtx.Lock() - defer g.mtx.Unlock() - g.lastEvaluation = ts -} - -// GetLastEvalTimestamp returns the timestamp of the last evaluation. -func (g *Group) GetLastEvalTimestamp() time.Time { - g.mtx.Lock() - defer g.mtx.Unlock() - return g.lastEvalTimestamp -} - -// setLastEvalTimestamp updates lastEvalTimestamp to the timestamp of the last evaluation. -func (g *Group) setLastEvalTimestamp(ts time.Time) { - g.mtx.Lock() - defer g.mtx.Unlock() - g.lastEvalTimestamp = ts -} - -// EvalTimestamp returns the immediately preceding consistently slotted evaluation time. -func (g *Group) EvalTimestamp(startTime int64) time.Time { - var ( - offset = int64(g.hash() % uint64(g.interval)) - - // This group's evaluation times differ from the perfect time intervals by `offset` nanoseconds. - // But we can only use `% interval` to align with the interval. And `% interval` will always - // align with the perfect time intervals, instead of this group's. Because of this we add - // `offset` _after_ aligning with the perfect time interval. - // - // There can be cases where adding `offset` to the perfect evaluation time can yield a - // timestamp in the future, which is not what EvalTimestamp should do. - // So we subtract one `offset` to make sure that `now - (now % interval) + offset` gives an - // evaluation time in the past. - adjNow = startTime - offset - - // Adjust to perfect evaluation intervals. - base = adjNow - (adjNow % int64(g.interval)) - - // Add one offset to randomize the evaluation times of this group. - next = base + offset - ) - - return time.Unix(0, next).UTC() -} - -func nameAndLabels(rule Rule) string { - return rule.Name() + rule.Labels().String() -} - -// CopyState copies the alerting rule and staleness related state from the given group. -// -// Rules are matched based on their name and labels. If there are duplicates, the -// first is matched with the first, second with the second etc. -func (g *Group) CopyState(from *Group) { - g.evaluationTime = from.evaluationTime - g.lastEvaluation = from.lastEvaluation - - ruleMap := make(map[string][]int, len(from.rules)) - - for fi, fromRule := range from.rules { - nameAndLabels := nameAndLabels(fromRule) - l := ruleMap[nameAndLabels] - ruleMap[nameAndLabels] = append(l, fi) - } - - for i, rule := range g.rules { - nameAndLabels := nameAndLabels(rule) - indexes := ruleMap[nameAndLabels] - if len(indexes) == 0 { - continue - } - fi := indexes[0] - g.seriesInPreviousEval[i] = from.seriesInPreviousEval[fi] - ruleMap[nameAndLabels] = indexes[1:] - - ar, ok := rule.(*AlertingRule) - if !ok { - continue - } - far, ok := from.rules[fi].(*AlertingRule) - if !ok { - continue - } - - for fp, a := range far.active { - ar.active[fp] = a - } - } - - // Handle deleted and unmatched duplicate rules. - g.staleSeries = from.staleSeries - for fi, fromRule := range from.rules { - nameAndLabels := nameAndLabels(fromRule) - l := ruleMap[nameAndLabels] - if len(l) != 0 { - for _, series := range from.seriesInPreviousEval[fi] { - g.staleSeries = append(g.staleSeries, series) - } - } - } -} - -// Eval runs a single evaluation cycle in which all rules are evaluated sequentially. 
-func (g *Group) Eval(ctx context.Context, ts time.Time) { - var samplesTotal float64 - for i, rule := range g.rules { - select { - case <-g.done: - return - default: - } - - func(i int, rule Rule) { - ctx, sp := otel.Tracer("").Start(ctx, "rule") - sp.SetAttributes(attribute.String("name", rule.Name())) - defer func(t time.Time) { - sp.End() - - since := time.Since(t) - g.metrics.EvalDuration.Observe(since.Seconds()) - rule.SetEvaluationDuration(since) - rule.SetEvaluationTimestamp(t) - }(time.Now()) - - g.metrics.EvalTotal.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() - - vector, err := rule.Eval(ctx, ts, g.opts.QueryFunc, g.opts.ExternalURL, g.Limit()) - if err != nil { - rule.SetHealth(HealthBad) - rule.SetLastError(err) - sp.SetStatus(codes.Error, err.Error()) - g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() - - // Canceled queries are intentional termination of queries. This normally - // happens on shutdown and thus we skip logging of any errors here. - var eqc promql.ErrQueryCanceled - if !errors.As(err, &eqc) { - level.Warn(g.logger).Log("name", rule.Name(), "index", i, "msg", "Evaluating rule failed", "rule", rule, "err", err) - } - return - } - rule.SetHealth(HealthGood) - rule.SetLastError(nil) - samplesTotal += float64(len(vector)) - - if ar, ok := rule.(*AlertingRule); ok { - ar.sendAlerts(ctx, ts, g.opts.ResendDelay, g.interval, g.opts.NotifyFunc) - } - var ( - numOutOfOrder = 0 - numTooOld = 0 - numDuplicates = 0 - ) - - app := g.opts.Appendable.Appender(ctx) - seriesReturned := make(map[string]labels.Labels, len(g.seriesInPreviousEval[i])) - defer func() { - if err := app.Commit(); err != nil { - rule.SetHealth(HealthBad) - rule.SetLastError(err) - sp.SetStatus(codes.Error, err.Error()) - g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() - - level.Warn(g.logger).Log("name", rule.Name(), "index", i, "msg", "Rule sample appending failed", "err", err) - return - } - g.seriesInPreviousEval[i] = seriesReturned - }() - - for _, s := range vector { - if s.H != nil { - _, err = app.AppendHistogram(0, s.Metric, s.T, nil, s.H) - } else { - _, err = app.Append(0, s.Metric, s.T, s.F) - } - - if err != nil { - rule.SetHealth(HealthBad) - rule.SetLastError(err) - sp.SetStatus(codes.Error, err.Error()) - unwrappedErr := errors.Unwrap(err) - if unwrappedErr == nil { - unwrappedErr = err - } - switch { - case errors.Is(unwrappedErr, storage.ErrOutOfOrderSample): - numOutOfOrder++ - level.Debug(g.logger).Log("name", rule.Name(), "index", i, "msg", "Rule evaluation result discarded", "err", err, "sample", s) - case errors.Is(unwrappedErr, storage.ErrTooOldSample): - numTooOld++ - level.Debug(g.logger).Log("name", rule.Name(), "index", i, "msg", "Rule evaluation result discarded", "err", err, "sample", s) - case errors.Is(unwrappedErr, storage.ErrDuplicateSampleForTimestamp): - numDuplicates++ - level.Debug(g.logger).Log("name", rule.Name(), "index", i, "msg", "Rule evaluation result discarded", "err", err, "sample", s) - default: - level.Warn(g.logger).Log("name", rule.Name(), "index", i, "msg", "Rule evaluation result discarded", "err", err, "sample", s) - } - } else { - buf := [1024]byte{} - seriesReturned[string(s.Metric.Bytes(buf[:]))] = s.Metric - } - } - if numOutOfOrder > 0 { - level.Warn(g.logger).Log("name", rule.Name(), "index", i, "msg", "Error on ingesting out-of-order result from rule evaluation", "numDropped", numOutOfOrder) - } - if numTooOld > 0 { - level.Warn(g.logger).Log("name", rule.Name(), "index", i, "msg", 
"Error on ingesting too old result from rule evaluation", "numDropped", numTooOld) - } - if numDuplicates > 0 { - level.Warn(g.logger).Log("name", rule.Name(), "index", i, "msg", "Error on ingesting results from rule evaluation with different value but same timestamp", "numDropped", numDuplicates) - } - - for metric, lset := range g.seriesInPreviousEval[i] { - if _, ok := seriesReturned[metric]; !ok { - // Series no longer exposed, mark it stale. - _, err = app.Append(0, lset, timestamp.FromTime(ts), math.Float64frombits(value.StaleNaN)) - unwrappedErr := errors.Unwrap(err) - if unwrappedErr == nil { - unwrappedErr = err - } - switch { - case unwrappedErr == nil: - case errors.Is(unwrappedErr, storage.ErrOutOfOrderSample), - errors.Is(unwrappedErr, storage.ErrTooOldSample), - errors.Is(unwrappedErr, storage.ErrDuplicateSampleForTimestamp): - // Do not count these in logging, as this is expected if series - // is exposed from a different rule. - default: - level.Warn(g.logger).Log("name", rule.Name(), "index", i, "msg", "Adding stale sample failed", "sample", lset.String(), "err", err) - } - } - } - }(i, rule) - } - if g.metrics != nil { - g.metrics.GroupSamples.WithLabelValues(GroupKey(g.File(), g.Name())).Set(samplesTotal) - } - g.cleanupStaleSeries(ctx, ts) -} - -func (g *Group) cleanupStaleSeries(ctx context.Context, ts time.Time) { - if len(g.staleSeries) == 0 { - return - } - app := g.opts.Appendable.Appender(ctx) - for _, s := range g.staleSeries { - // Rule that produced series no longer configured, mark it stale. - _, err := app.Append(0, s, timestamp.FromTime(ts), math.Float64frombits(value.StaleNaN)) - unwrappedErr := errors.Unwrap(err) - if unwrappedErr == nil { - unwrappedErr = err - } - switch { - case unwrappedErr == nil: - case errors.Is(unwrappedErr, storage.ErrOutOfOrderSample), - errors.Is(unwrappedErr, storage.ErrTooOldSample), - errors.Is(unwrappedErr, storage.ErrDuplicateSampleForTimestamp): - // Do not count these in logging, as this is expected if series - // is exposed from a different rule. - default: - level.Warn(g.logger).Log("msg", "Adding stale sample for previous configuration failed", "sample", s, "err", err) - } - } - if err := app.Commit(); err != nil { - level.Warn(g.logger).Log("msg", "Stale sample appending for previous configuration failed", "err", err) - } else { - g.staleSeries = nil - } -} - -// RestoreForState restores the 'for' state of the alerts -// by looking up last ActiveAt from storage. -func (g *Group) RestoreForState(ts time.Time) { - maxtMS := int64(model.TimeFromUnixNano(ts.UnixNano())) - // We allow restoration only if alerts were active before after certain time. - mint := ts.Add(-g.opts.OutageTolerance) - mintMS := int64(model.TimeFromUnixNano(mint.UnixNano())) - q, err := g.opts.Queryable.Querier(mintMS, maxtMS) - if err != nil { - level.Error(g.logger).Log("msg", "Failed to get Querier", "err", err) - return - } - defer func() { - if err := q.Close(); err != nil { - level.Error(g.logger).Log("msg", "Failed to close Querier", "err", err) - } - }() - - for _, rule := range g.Rules() { - alertRule, ok := rule.(*AlertingRule) - if !ok { - continue - } - - alertHoldDuration := alertRule.HoldDuration() - if alertHoldDuration < g.opts.ForGracePeriod { - // If alertHoldDuration is already less than grace period, we would not - // like to make it wait for `g.opts.ForGracePeriod` time before firing. - // Hence we skip restoration, which will make it wait for alertHoldDuration. 
- alertRule.SetRestored(true) - continue - } - - alertRule.ForEachActiveAlert(func(a *Alert) { - var s storage.Series - - s, err := alertRule.QueryforStateSeries(g.opts.Context, a, q) - if err != nil { - // Querier Warnings are ignored. We do not care unless we have an error. - level.Error(g.logger).Log( - "msg", "Failed to restore 'for' state", - labels.AlertName, alertRule.Name(), - "stage", "Select", - "err", err, - ) - return - } - - if s == nil { - return - } - - // Series found for the 'for' state. - var t int64 - var v float64 - it := s.Iterator(nil) - for it.Next() == chunkenc.ValFloat { - t, v = it.At() - } - if it.Err() != nil { - level.Error(g.logger).Log("msg", "Failed to restore 'for' state", - labels.AlertName, alertRule.Name(), "stage", "Iterator", "err", it.Err()) - return - } - if value.IsStaleNaN(v) { // Alert was not active. - return - } - - downAt := time.Unix(t/1000, 0).UTC() - restoredActiveAt := time.Unix(int64(v), 0).UTC() - timeSpentPending := downAt.Sub(restoredActiveAt) - timeRemainingPending := alertHoldDuration - timeSpentPending - - switch { - case timeRemainingPending <= 0: - // It means that alert was firing when prometheus went down. - // In the next Eval, the state of this alert will be set back to - // firing again if it's still firing in that Eval. - // Nothing to be done in this case. - case timeRemainingPending < g.opts.ForGracePeriod: - // (new) restoredActiveAt = (ts + m.opts.ForGracePeriod) - alertHoldDuration - // /* new firing time */ /* moving back by hold duration */ - // - // Proof of correctness: - // firingTime = restoredActiveAt.Add(alertHoldDuration) - // = ts + m.opts.ForGracePeriod - alertHoldDuration + alertHoldDuration - // = ts + m.opts.ForGracePeriod - // - // Time remaining to fire = firingTime.Sub(ts) - // = (ts + m.opts.ForGracePeriod) - ts - // = m.opts.ForGracePeriod - restoredActiveAt = ts.Add(g.opts.ForGracePeriod).Add(-alertHoldDuration) - default: - // By shifting ActiveAt to the future (ActiveAt + some_duration), - // the total pending time from the original ActiveAt - // would be `alertHoldDuration + some_duration`. - // Here, some_duration = downDuration. - downDuration := ts.Sub(downAt) - restoredActiveAt = restoredActiveAt.Add(downDuration) - } - - a.ActiveAt = restoredActiveAt - level.Debug(g.logger).Log("msg", "'for' state restored", - labels.AlertName, alertRule.Name(), "restored_time", a.ActiveAt.Format(time.RFC850), - "labels", a.Labels.String()) - }) - - alertRule.SetRestored(true) - } -} - -// Equals return if two groups are the same. -func (g *Group) Equals(ng *Group) bool { - if g.name != ng.name { - return false - } - - if g.file != ng.file { - return false - } - - if g.interval != ng.interval { - return false - } - - if g.limit != ng.limit { - return false - } - - if len(g.rules) != len(ng.rules) { - return false - } - - for i, gr := range g.rules { - if gr.String() != ng.rules[i].String() { - return false - } - } - - return true -} - // The Manager manages recording and alerting rules. type Manager struct { opts *ManagerOptions @@ -1191,11 +325,6 @@ func (m *Manager) LoadGroups( return groups, nil } -// GroupKey group names need not be unique across filenames. -func GroupKey(file, name string) string { - return file + ";" + name -} - // RuleGroups returns the list of manager's rule groups. 
func (m *Manager) RuleGroups() []*Group { m.mtx.RLock() diff --git a/rules/rule.go b/rules/rule.go new file mode 100644 index 0000000000..a4a8c04459 --- /dev/null +++ b/rules/rule.go @@ -0,0 +1,64 @@ +// Copyright 2013 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package rules + +import ( + "context" + "net/url" + "time" + + "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/promql" + "github.com/prometheus/prometheus/promql/parser" +) + +// RuleHealth describes the health state of a rule. +type RuleHealth string + +// The possible health states of a rule based on the last execution. +const ( + HealthUnknown RuleHealth = "unknown" + HealthGood RuleHealth = "ok" + HealthBad RuleHealth = "err" +) + +// A Rule encapsulates a vector expression which is evaluated at a specified +// interval and acted upon (currently either recorded or used for alerting). +type Rule interface { + Name() string + // Labels of the rule. + Labels() labels.Labels + // Eval evaluates the rule, including any associated recording or alerting actions. + Eval(context.Context, time.Time, QueryFunc, *url.URL, int) (promql.Vector, error) + // String returns a human-readable string representation of the rule. + String() string + // Query returns the rule query expression. + Query() parser.Expr + // SetLastError sets the current error experienced by the rule. + SetLastError(error) + // LastError returns the last error experienced by the rule. + LastError() error + // SetHealth sets the current health of the rule. + SetHealth(RuleHealth) + // Health returns the current health of the rule. + Health() RuleHealth + SetEvaluationDuration(time.Duration) + // GetEvaluationDuration returns last evaluation duration. + // NOTE: Used dynamically by rules.html template. + GetEvaluationDuration() time.Duration + SetEvaluationTimestamp(time.Time) + // GetEvaluationTimestamp returns last evaluation timestamp. + // NOTE: Used dynamically by rules.html template. 
+ GetEvaluationTimestamp() time.Time +} From a807dd16160f0816d5f03bfed72a5321372b0de4 Mon Sep 17 00:00:00 2001 From: Yannick te Kulve <738464+YannickTeKulve@users.noreply.github.com> Date: Sun, 22 Oct 2023 22:42:01 +0200 Subject: [PATCH 13/17] Bump prometheus common to v0.45.0 (#13003) * Bump prometheus common to v0.44.0 Signed-off-by: Yannick te Kulve <738464+YannickTeKulve@users.noreply.github.com> * Fix golang_protobuf_extensions sum Signed-off-by: Yannick te Kulve <738464+YannickTeKulve@users.noreply.github.com> * Remove unused deps Signed-off-by: Yannick te Kulve <738464+YannickTeKulve@users.noreply.github.com> --------- Signed-off-by: Yannick te Kulve <738464+YannickTeKulve@users.noreply.github.com> --- go.mod | 4 ++-- go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 01bface5b3..7adaacfd76 100644 --- a/go.mod +++ b/go.mod @@ -47,7 +47,7 @@ require ( github.com/prometheus/alertmanager v0.26.0 github.com/prometheus/client_golang v1.17.0 github.com/prometheus/client_model v0.5.0 - github.com/prometheus/common v0.44.0 + github.com/prometheus/common v0.45.0 github.com/prometheus/common/assets v0.2.0 github.com/prometheus/common/sigv4 v0.1.0 github.com/prometheus/exporter-toolkit v0.10.0 @@ -162,7 +162,7 @@ require ( github.com/mailru/easyjson v0.7.7 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.19 // indirect - github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect + github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6 // indirect diff --git a/go.sum b/go.sum index b7084065a6..1a08b123cb 100644 --- a/go.sum +++ b/go.sum @@ -526,8 +526,8 @@ github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APP github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= -github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= -github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= +github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 h1:jWpvCLoY8Z/e3VKvlsiIGKtc+UG6U5vzxaoagmhXfyg= +github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0/go.mod h1:QUyp042oQthUoa9bqDv0ER0wrtXnBruoNd7aNjkbP+k= github.com/maxatome/go-testdeep v1.12.0 h1:Ql7Go8Tg0C1D/uMMX59LAoYK7LffeJQ6X2T04nTH68g= github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= github.com/miekg/dns v1.1.26/go.mod h1:bPDLeHnStXmXAq1m/Ch/hvfNHr14JKNPMBo3VZKjuso= @@ -650,8 +650,8 @@ github.com/prometheus/common v0.9.1/go.mod h1:yhUN8i9wzaXS3w1O07YhxHEBxD+W35wd8b github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc= github.com/prometheus/common v0.29.0/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls= -github.com/prometheus/common v0.44.0 h1:+5BrQJwiBB9xsMygAB3TNvpQKOwlkc25LbISbrdOOfY= -github.com/prometheus/common v0.44.0/go.mod h1:ofAIvZbQ1e/nugmZGz4/qCb9Ap1VoSTIO7x0VV9VvuY= +github.com/prometheus/common v0.45.0 
h1:2BGz0eBc2hdMDLnO/8n0jeB3oPrt2D08CekT0lneoxM= +github.com/prometheus/common v0.45.0/go.mod h1:YJmSTw9BoKxJplESWWxlbyttQR4uaEcGyv9MZjVOJsY= github.com/prometheus/common/assets v0.2.0 h1:0P5OrzoHrYBOSM1OigWL3mY8ZvV2N4zIE/5AahrSrfM= github.com/prometheus/common/assets v0.2.0/go.mod h1:D17UVUE12bHbim7HzwUvtqm6gwBEaDQ0F+hIGbFbccI= github.com/prometheus/common/sigv4 v0.1.0 h1:qoVebwtwwEhS85Czm2dSROY5fTo2PAPEVdDeppTwGX4= From 4912c82ed0ba24389e2236bda8fd7576047c7e6d Mon Sep 17 00:00:00 2001 From: Gilles De Mey Date: Mon, 23 Oct 2023 14:17:53 +0200 Subject: [PATCH 14/17] ui: Pass unexpected boot errors to StartingContent component (#13016) Signed-off-by: Gilles De Mey --- .../src/components/withStartingIndicator.test.tsx | 14 +++++++++++++- .../src/components/withStartingIndicator.tsx | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/web/ui/react-app/src/components/withStartingIndicator.test.tsx b/web/ui/react-app/src/components/withStartingIndicator.test.tsx index 575115ba6a..49416563e7 100644 --- a/web/ui/react-app/src/components/withStartingIndicator.test.tsx +++ b/web/ui/react-app/src/components/withStartingIndicator.test.tsx @@ -2,7 +2,7 @@ import * as React from 'react'; import { shallow } from 'enzyme'; import { WALReplayData } from '../types/types'; import { StartingContent } from './withStartingIndicator'; -import { Progress } from 'reactstrap'; +import { Alert, Progress } from 'reactstrap'; describe('Starting', () => { describe('progress bar', () => { @@ -52,5 +52,17 @@ describe('Starting', () => { expect(progress.prop('value')).toBe(21); expect(progress.prop('color')).toBe('success'); }); + + it('shows unexpected error', () => { + const status: WALReplayData = { + min: 0, + max: 20, + current: 0, + }; + + const starting = shallow(); + const alert = starting.find(Alert); + expect(alert.prop('color')).toBe('danger'); + }); }); }); diff --git a/web/ui/react-app/src/components/withStartingIndicator.tsx b/web/ui/react-app/src/components/withStartingIndicator.tsx index eb2724ed13..505deab409 100644 --- a/web/ui/react-app/src/components/withStartingIndicator.tsx +++ b/web/ui/react-app/src/components/withStartingIndicator.tsx @@ -51,7 +51,7 @@ export const withStartingIndicator = const { ready, walReplayStatus, isUnexpected } = useFetchReadyInterval(pathPrefix); const staticReady = useReady(); - if (staticReady || ready || isUnexpected) { + if (staticReady || ready) { return ; } From dff1c395f6ed2215974f158e2f8db8a97b7572fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rcio=20Car=C3=B4so?= <18220855+msscaroso@users.noreply.github.com> Date: Tue, 24 Oct 2023 12:34:42 +0100 Subject: [PATCH 15/17] Expose --storage.tsdb.retention.time in metric prometheus_tsdb_retention_limit_seconds (#12986) * Expose --storage.tsdb.retention.time in a metric Signed-off-by: Marcio Caroso --------- Signed-off-by: Marcio Caroso --- tsdb/db.go | 7 +++++++ tsdb/db_test.go | 13 +++++++++++++ 2 files changed, 20 insertions(+) diff --git a/tsdb/db.go b/tsdb/db.go index 86cb39b8b5..8b3d4d3004 100644 --- a/tsdb/db.go +++ b/tsdb/db.go @@ -248,6 +248,7 @@ type dbMetrics struct { tombCleanTimer prometheus.Histogram blocksBytes prometheus.Gauge maxBytes prometheus.Gauge + retentionDuration prometheus.Gauge } func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics { @@ -321,6 +322,10 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics { Name: "prometheus_tsdb_retention_limit_bytes", Help: "Max number of bytes to be retained in the tsdb blocks, configured 0 means disabled", }) + 
m.retentionDuration = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "prometheus_tsdb_retention_limit_seconds", + Help: "How long to retain samples in storage.", + }) m.sizeRetentionCount = prometheus.NewCounter(prometheus.CounterOpts{ Name: "prometheus_tsdb_size_retentions_total", Help: "The number of times that blocks were deleted because the maximum number of bytes was exceeded.", @@ -341,6 +346,7 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics { m.tombCleanTimer, m.blocksBytes, m.maxBytes, + m.retentionDuration, ) } return m @@ -877,6 +883,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs maxBytes = 0 } db.metrics.maxBytes.Set(float64(maxBytes)) + db.metrics.retentionDuration.Set((time.Duration(opts.RetentionDuration) * time.Millisecond).Seconds()) if err := db.reload(); err != nil { return nil, err diff --git a/tsdb/db_test.go b/tsdb/db_test.go index 773561c6ce..243290c5e6 100644 --- a/tsdb/db_test.go +++ b/tsdb/db_test.go @@ -1494,6 +1494,19 @@ func TestTimeRetention(t *testing.T) { require.Equal(t, expBlocks[len(expBlocks)-1].MaxTime, actBlocks[len(actBlocks)-1].meta.MaxTime) } +func TestRetentionDurationMetric(t *testing.T) { + db := openTestDB(t, &Options{ + RetentionDuration: 1000, + }, []int64{100}) + defer func() { + require.NoError(t, db.Close()) + }() + + expRetentionDuration := 1.0 + actRetentionDuration := prom_testutil.ToFloat64(db.metrics.retentionDuration) + require.Equal(t, expRetentionDuration, actRetentionDuration, "metric retention duration mismatch") +} + func TestSizeRetention(t *testing.T) { opts := DefaultOptions() opts.OutOfOrderTimeWindow = 100 From 72cc93d22597558578c216da0046041e6cdcb9b3 Mon Sep 17 00:00:00 2001 From: Jeanette Tan Date: Wed, 25 Oct 2023 18:10:42 +0800 Subject: [PATCH 16/17] Hide position info for warnings when position is unknown (empty query string passed in) Signed-off-by: Jeanette Tan --- util/annotations/annotations.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/util/annotations/annotations.go b/util/annotations/annotations.go index 9cfbb121f6..52cfb114b1 100644 --- a/util/annotations/annotations.go +++ b/util/annotations/annotations.go @@ -116,6 +116,9 @@ type annoErr struct { } func (e annoErr) Error() string { + if e.Query == "" { + return e.Err.Error() + } return fmt.Sprintf("%s (%s)", e.Err, e.PositionRange.StartPosInput(e.Query, 0)) } From 05356e76de82724ebf104457cb4cf1e91920621e Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Wed, 25 Oct 2023 16:06:17 +0100 Subject: [PATCH 17/17] Build: remove -a from build to speed up rebuilds (#13026) I think this is a hold-over from when Go was less careful about separating architectures. Signed-off-by: Bryan Boreham --- .promu.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.promu.yml b/.promu.yml index 9f59485236..e5e01181cc 100644 --- a/.promu.yml +++ b/.promu.yml @@ -18,7 +18,6 @@ build: windows: - builtinassets - stringlabels - flags: -a ldflags: | -X github.com/prometheus/common/version.Version={{.Version}} -X github.com/prometheus/common/version.Revision={{.Revision}}