Rationalise retrieval metrics so that the outcome (success/failed) is recorded consistently, as a label, on both the sample and batch metrics.

Also, report the total queue capacity across all queues, i.e. capacity * shards.
Tom Wilkie 2016-08-29 18:13:26 +02:00 committed by Tom Wilkie
parent ece12bff93
commit a6931b71e8
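
For context, here is a minimal, self-contained sketch of the shape the diff below moves to: a counter and a histogram that both carry a result label, so sample and batch outcomes can be selected with the same matcher, plus a constant metric whose value is the queue capacity multiplied by the number of shards. The metric names match the diff; the namespace/subsystem values, the capacity help string, and the numbers are illustrative assumptions, not taken from this commit.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// Illustrative values; the real namespace/subsystem and the "result" label
// constant live elsewhere in the queue manager file and are not shown here.
const (
	namespace = "prometheus"
	subsystem = "remote_storage"
	result    = "result" // label key shared by the sample counter and the batch histogram
)

func main() {
	// One counter for samples and one histogram for batch send duration,
	// both partitioned by the same result label.
	sentSamplesTotal := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "sent_samples_total",
			Help:      "Total number of processed samples sent to remote storage.",
		},
		[]string{result},
	)
	sentBatchDuration := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "sent_batch_duration_seconds",
			Help:      "Duration of sample batch send calls to the remote storage.",
			Buckets:   prometheus.DefBuckets,
		},
		[]string{result},
	)
	prometheus.MustRegister(sentSamplesTotal, sentBatchDuration)

	// Total capacity across all shards, exposed once as a constant gauge.
	// queueCapacity and shards are made-up numbers for the example.
	queueCapacity, shards := 100000, 10
	queueCapacityMetric := prometheus.MustNewConstMetric(
		prometheus.NewDesc(
			prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
			"Total capacity of the sample queues across all shards.",
			nil, nil,
		),
		prometheus.GaugeValue,
		float64(queueCapacity*shards),
	)
	fmt.Println(queueCapacityMetric.Desc())
}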


@@ -65,12 +65,10 @@ type StorageQueueManager struct {
 	wg         sync.WaitGroup
 	done       chan struct{}
 
-	samplesCount  *prometheus.CounterVec
-	sendLatency   prometheus.Summary
-	failedBatches prometheus.Counter
-	failedSamples prometheus.Counter
-	queueLength   prometheus.Gauge
-	queueCapacity prometheus.Metric
+	sentSamplesTotal  *prometheus.CounterVec
+	sentBatchDuration *prometheus.HistogramVec
+	queueLength       prometheus.Gauge
+	queueCapacity     prometheus.Metric
 }
 
 // NewStorageQueueManager builds a new StorageQueueManager.
@@ -94,37 +92,27 @@ func NewStorageQueueManager(tsdb StorageClient, cfg *StorageQueueManagerConfig)
 		shards: shards,
 		done:   make(chan struct{}),
 
-		samplesCount: prometheus.NewCounterVec(
+		sentSamplesTotal: prometheus.NewCounterVec(
 			prometheus.CounterOpts{
 				Namespace:   namespace,
 				Subsystem:   subsystem,
 				Name:        "sent_samples_total",
-				Help:        "Total number of processed samples to be sent to remote storage.",
+				Help:        "Total number of processed samples sent to remote storage.",
 				ConstLabels: constLabels,
 			},
 			[]string{result},
 		),
-		sendLatency: prometheus.NewSummary(prometheus.SummaryOpts{
-			Namespace:   namespace,
-			Subsystem:   subsystem,
-			Name:        "send_latency_seconds",
-			Help:        "Latency quantiles for sending sample batches to the remote storage.",
-			ConstLabels: constLabels,
-		}),
-		failedBatches: prometheus.NewCounter(prometheus.CounterOpts{
-			Namespace:   namespace,
-			Subsystem:   subsystem,
-			Name:        "failed_batches_total",
-			Help:        "Total number of sample batches that encountered an error while being sent to the remote storage.",
-			ConstLabels: constLabels,
-		}),
-		failedSamples: prometheus.NewCounter(prometheus.CounterOpts{
-			Namespace:   namespace,
-			Subsystem:   subsystem,
-			Name:        "failed_samples_total",
-			Help:        "Total number of samples that encountered an error while being sent to the remote storage.",
-			ConstLabels: constLabels,
-		}),
+		sentBatchDuration: prometheus.NewHistogramVec(
+			prometheus.HistogramOpts{
+				Namespace:   namespace,
+				Subsystem:   subsystem,
+				Name:        "sent_batch_duration_seconds",
+				Help:        "Duration of sample batch send calls to the remote storage.",
+				ConstLabels: constLabels,
+				Buckets:     prometheus.DefBuckets,
+			},
+			[]string{result},
+		),
 		queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
 			Namespace:   namespace,
 			Subsystem:   subsystem,
@@ -140,7 +128,7 @@ func NewStorageQueueManager(tsdb StorageClient, cfg *StorageQueueManagerConfig)
 				constLabels,
 			),
 			prometheus.GaugeValue,
-			float64(cfg.QueueCapacity),
+			float64(cfg.QueueCapacity*cfg.Shards),
 		),
 	}
 
@@ -158,7 +146,7 @@ func (t *StorageQueueManager) Append(s *model.Sample) error {
 	select {
 	case t.shards[shard] <- s:
 	default:
-		t.samplesCount.WithLabelValues(dropped).Inc()
+		t.sentSamplesTotal.WithLabelValues(dropped).Inc()
 		log.Warn("Remote storage queue full, discarding sample.")
 	}
 	return nil
@@ -173,10 +161,8 @@ func (*StorageQueueManager) NeedsThrottling() bool {
 
 // Describe implements prometheus.Collector.
 func (t *StorageQueueManager) Describe(ch chan<- *prometheus.Desc) {
-	t.samplesCount.Describe(ch)
-	t.sendLatency.Describe(ch)
-	ch <- t.failedBatches.Desc()
-	ch <- t.failedSamples.Desc()
+	t.sentSamplesTotal.Describe(ch)
+	t.sentBatchDuration.Describe(ch)
 	ch <- t.queueLength.Desc()
 	ch <- t.queueCapacity.Desc()
 }
@@ -192,11 +178,9 @@ func (t *StorageQueueManager) queueLen() int {
 
 // Collect implements prometheus.Collector.
 func (t *StorageQueueManager) Collect(ch chan<- prometheus.Metric) {
-	t.samplesCount.Collect(ch)
-	t.sendLatency.Collect(ch)
+	t.sentSamplesTotal.Collect(ch)
+	t.sentBatchDuration.Collect(ch)
 	t.queueLength.Set(float64(t.queueLen()))
-	ch <- t.failedBatches
-	ch <- t.failedSamples
 	ch <- t.queueLength
 	ch <- t.queueCapacity
 }
@@ -268,9 +252,7 @@ func (t *StorageQueueManager) sendSamples(s model.Samples) {
 	if err != nil {
 		log.Warnf("error sending %d samples to remote storage: %s", len(s), err)
 		labelValue = failure
-		t.failedBatches.Inc()
-		t.failedSamples.Add(float64(len(s)))
 	}
-	t.samplesCount.WithLabelValues(labelValue).Add(float64(len(s)))
-	t.sendLatency.Observe(duration)
+	t.sentSamplesTotal.WithLabelValues(labelValue).Add(float64(len(s)))
+	t.sentBatchDuration.WithLabelValues(labelValue).Observe(duration)
 }
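
As a follow-up to the sendSamples hunk above, a hedged sketch of the recording pattern it ends up with: time the send, derive a single outcome label, and apply it to both the per-sample counter and the per-batch histogram. The sendBatch helper, the stubbed send function, and the literal label values are hypothetical; only the metric calls mirror the diff.

package main

import (
	"errors"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// sendBatch mirrors the pattern above: one label value describes the outcome
// of the whole batch and is used for both metrics, so failed samples and the
// duration of failed batches can be selected with the same matcher.
func sendBatch(
	samples []float64,
	send func([]float64) error,
	sentSamplesTotal *prometheus.CounterVec,
	sentBatchDuration *prometheus.HistogramVec,
) {
	begin := time.Now()
	err := send(samples)
	duration := time.Since(begin).Seconds()

	labelValue := "success" // the real code uses named constants for these values
	if err != nil {
		labelValue = "failure"
	}
	sentSamplesTotal.WithLabelValues(labelValue).Add(float64(len(samples)))
	sentBatchDuration.WithLabelValues(labelValue).Observe(duration)
}

func main() {
	sentSamplesTotal := prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "sent_samples_total", Help: "Samples sent, by result."},
		[]string{"result"},
	)
	sentBatchDuration := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{Name: "sent_batch_duration_seconds", Help: "Batch send duration, by result.", Buckets: prometheus.DefBuckets},
		[]string{"result"},
	)
	prometheus.MustRegister(sentSamplesTotal, sentBatchDuration)

	// Exercise the failure path with a send that always errors.
	sendBatch([]float64{1, 2, 3}, func([]float64) error { return errors.New("remote unavailable") },
		sentSamplesTotal, sentBatchDuration)
}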