From c40a83f386d2d95faac91c237e180ff8beb60ce1 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Fri, 19 Jul 2019 14:53:26 -0700 Subject: [PATCH 1/3] Add metrics for max shards, min shards, and desired shards. Signed-off-by: Callum Styan --- storage/remote/queue_manager.go | 40 +++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go index e36a6c9875..0388512cb6 100644 --- a/storage/remote/queue_manager.go +++ b/storage/remote/queue_manager.go @@ -143,6 +143,33 @@ var ( }, []string{queue}, ) + maxNumShards = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "shards_max", + Help: "The maximum number of shards that the queue is allowed to run.", + }, + []string{queue}, + ) + minNumShards = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "shards_min", + Help: "The minimum number of shards that the queue is allowed to run.", + }, + []string{queue}, + ) + desiredNumShards = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "shards_desired", + Help: "The number of shards that the queues shard calculation wants to run based on the rate of samples in vs. samples out.", + }, + []string{queue}, + ) ) // StorageClient defines an interface for sending a batch of samples to an @@ -189,6 +216,9 @@ type QueueManager struct { succeededSamplesTotal prometheus.Counter retriedSamplesTotal prometheus.Counter shardCapacity prometheus.Gauge + maxNumShards prometheus.Gauge + minNumShards prometheus.Gauge + desiredNumShards prometheus.Gauge } // NewQueueManager builds a new QueueManager. 
@@ -293,10 +323,16 @@ func (t *QueueManager) Start() { t.succeededSamplesTotal = succeededSamplesTotal.WithLabelValues(name) t.retriedSamplesTotal = retriedSamplesTotal.WithLabelValues(name) t.shardCapacity = shardCapacity.WithLabelValues(name) + t.maxNumShards = maxNumShards.WithLabelValues(name) + t.minNumShards = minNumShards.WithLabelValues(name) + t.desiredNumShards = desiredNumShards.WithLabelValues(name) // Initialise some metrics. t.shardCapacity.Set(float64(t.cfg.Capacity)) t.pendingSamplesMetric.Set(0) + t.maxNumShards.Set(float64(t.cfg.MaxShards)) + t.minNumShards.Set(float64(t.cfg.MinShards)) + t.desiredNumShards.Set(float64(t.cfg.MinShards)) t.shards.start(t.numShards) t.watcher.Start() @@ -336,6 +372,9 @@ func (t *QueueManager) Stop() { succeededSamplesTotal.DeleteLabelValues(name) retriedSamplesTotal.DeleteLabelValues(name) shardCapacity.DeleteLabelValues(name) + maxNumShards.DeleteLabelValues(name) + minNumShards.DeleteLabelValues(name) + desiredNumShards.DeleteLabelValues(name) } // StoreSeries keeps track of which series we know about for lookups when sending samples to remote. @@ -483,6 +522,7 @@ func (t *QueueManager) calculateDesiredShards() { } numShards := int(math.Ceil(desiredShards)) + t.desiredNumShards.Set(float64(numShards)) if numShards > t.cfg.MaxShards { numShards = t.cfg.MaxShards } else if numShards < t.cfg.MinShards { From 3b756148922775906b8746aa93ffc30e7efad254 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Fri, 2 Aug 2019 17:39:32 -0700 Subject: [PATCH 2/3] Add a warning alert, since the remote write behind alert will probably already be going off, about desired shards being higher than max shards. 
Signed-off-by: Callum Styan --- .../prometheus-mixin/alerts.libsonnet | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 06c5274579..0cb52901d2 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -225,6 +225,27 @@ description: 'Prometheus %(prometheusName)s remote write is {{ printf "%%.1f" $value }}s behind for queue {{$labels.queue}}.' % $._config, }, }, + { + alert: 'PrometheusRemoteWriteDesiredShards', + expr: ||| + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_shards_desired{%(prometheusSelector)s}[5m]) + > on(job, instance) group_right + max_over_time(prometheus_remote_storage_shards_max{%(prometheusSelector)s}[5m]) + ) + == 1 + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Prometheus remote write desired shards calculation wants to run more than configured max shards.', + description: 'Prometheus %(prometheusName)s remote write is {{ printf "%%.1f" $value }}s behind for queue {{$labels.queue}}.' % $._config, + }, + }, { alert: 'PrometheusRuleFailures', expr: ||| From a98599bea8769fbd84aac310579b3e389a11f0d2 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Wed, 4 Sep 2019 16:34:09 -0700 Subject: [PATCH 3/3] Update remote write max shards alert; properly template/query for max shards in description. 
Signed-off-by: Callum Styan --- documentation/prometheus-mixin/alerts.libsonnet | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 0cb52901d2..f18c988090 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -235,7 +235,6 @@ > on(job, instance) group_right max_over_time(prometheus_remote_storage_shards_max{%(prometheusSelector)s}[5m]) ) - == 1 ||| % $._config, 'for': '15m', labels: { @@ -243,7 +242,7 @@ }, annotations: { summary: 'Prometheus remote write desired shards calculation wants to run more than configured max shards.', - description: 'Prometheus %(prometheusName)s remote write is {{ printf "%%.1f" $value }}s behind for queue {{$labels.queue}}.' % $._config, + description: 'Prometheus %(prometheusName)s remote write desired shards calculation wants to run {{ $value }} shards, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%%s",%(prometheusSelector)s}` $labels.instance | query | first | value }}.' % $._config, }, }, {