Rationalise retrieval metrics so that the outcome (success/failed) is recorded consistently, as a label, on both the sample and batch metrics.

Also, report the total queue capacity across all queues, i.e. capacity * shards.
Tom Wilkie 2016-08-29 18:13:26 +02:00 committed by Tom Wilkie
parent ece12bff93
commit a6931b71e8
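
For context, here is a minimal, self-contained sketch of the shape the diff below moves to: a counter and a histogram that both carry a result label, so sample and batch outcomes can be selected with the same matcher, plus a constant metric whose value is the queue capacity multiplied by the number of shards. The metric names match the diff; the namespace/subsystem values, the capacity help string, and the numbers are illustrative assumptions, not taken from this commit.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// Illustrative values; the real namespace/subsystem and the "result" label
// constant live elsewhere in the queue manager file and are not shown here.
const (
	namespace = "prometheus"
	subsystem = "remote_storage"
	result    = "result" // label key shared by the sample counter and the batch histogram
)

func main() {
	// One counter for samples and one histogram for batch send duration,
	// both partitioned by the same result label.
	sentSamplesTotal := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "sent_samples_total",
			Help:      "Total number of processed samples sent to remote storage.",
		},
		[]string{result},
	)
	sentBatchDuration := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "sent_batch_duration_seconds",
			Help:      "Duration of sample batch send calls to the remote storage.",
			Buckets:   prometheus.DefBuckets,
		},
		[]string{result},
	)
	prometheus.MustRegister(sentSamplesTotal, sentBatchDuration)

	// Total capacity across all shards, exposed once as a constant gauge.
	// queueCapacity and shards are made-up numbers for the example.
	queueCapacity, shards := 100000, 10
	queueCapacityMetric := prometheus.MustNewConstMetric(
		prometheus.NewDesc(
			prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
			"Total capacity of the sample queues across all shards.",
			nil, nil,
		),
		prometheus.GaugeValue,
		float64(queueCapacity*shards),
	)
	fmt.Println(queueCapacityMetric.Desc())
}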


@@ -65,12 +65,10 @@ type StorageQueueManager struct {
 	wg         sync.WaitGroup
 	done       chan struct{}
 
-	samplesCount  *prometheus.CounterVec
-	sendLatency   prometheus.Summary
-	failedBatches prometheus.Counter
-	failedSamples prometheus.Counter
-	queueLength   prometheus.Gauge
-	queueCapacity prometheus.Metric
+	sentSamplesTotal  *prometheus.CounterVec
+	sentBatchDuration *prometheus.HistogramVec
+	queueLength       prometheus.Gauge
+	queueCapacity     prometheus.Metric
 }
 
 // NewStorageQueueManager builds a new StorageQueueManager.
@@ -94,37 +92,27 @@ func NewStorageQueueManager(tsdb StorageClient, cfg *StorageQueueManagerConfig)
 		shards: shards,
 		done:   make(chan struct{}),
 
-		samplesCount: prometheus.NewCounterVec(
+		sentSamplesTotal: prometheus.NewCounterVec(
 			prometheus.CounterOpts{
 				Namespace:   namespace,
 				Subsystem:   subsystem,
 				Name:        "sent_samples_total",
-				Help:        "Total number of processed samples to be sent to remote storage.",
+				Help:        "Total number of processed samples sent to remote storage.",
 				ConstLabels: constLabels,
 			},
 			[]string{result},
 		),
-		sendLatency: prometheus.NewSummary(prometheus.SummaryOpts{
-			Namespace:   namespace,
-			Subsystem:   subsystem,
-			Name:        "send_latency_seconds",
-			Help:        "Latency quantiles for sending sample batches to the remote storage.",
-			ConstLabels: constLabels,
-		}),
-		failedBatches: prometheus.NewCounter(prometheus.CounterOpts{
-			Namespace:   namespace,
-			Subsystem:   subsystem,
-			Name:        "failed_batches_total",
-			Help:        "Total number of sample batches that encountered an error while being sent to the remote storage.",
-			ConstLabels: constLabels,
-		}),
-		failedSamples: prometheus.NewCounter(prometheus.CounterOpts{
-			Namespace:   namespace,
-			Subsystem:   subsystem,
-			Name:        "failed_samples_total",
-			Help:        "Total number of samples that encountered an error while being sent to the remote storage.",
-			ConstLabels: constLabels,
-		}),
+		sentBatchDuration: prometheus.NewHistogramVec(
+			prometheus.HistogramOpts{
+				Namespace:   namespace,
+				Subsystem:   subsystem,
+				Name:        "sent_batch_duration_seconds",
+				Help:        "Duration of sample batch send calls to the remote storage.",
+				ConstLabels: constLabels,
+				Buckets:     prometheus.DefBuckets,
+			},
+			[]string{result},
+		),
 		queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
 			Namespace:   namespace,
 			Subsystem:   subsystem,
@@ -140,7 +128,7 @@ func NewStorageQueueManager(tsdb StorageClient, cfg *StorageQueueManagerConfig)
 				constLabels,
 			),
 			prometheus.GaugeValue,
-			float64(cfg.QueueCapacity),
+			float64(cfg.QueueCapacity*cfg.Shards),
 		),
 	}
 
@@ -158,7 +146,7 @@ func (t *StorageQueueManager) Append(s *model.Sample) error {
 	select {
 	case t.shards[shard] <- s:
 	default:
-		t.samplesCount.WithLabelValues(dropped).Inc()
+		t.sentSamplesTotal.WithLabelValues(dropped).Inc()
 		log.Warn("Remote storage queue full, discarding sample.")
 	}
 	return nil
@@ -173,10 +161,8 @@ func (*StorageQueueManager) NeedsThrottling() bool {
 
 // Describe implements prometheus.Collector.
 func (t *StorageQueueManager) Describe(ch chan<- *prometheus.Desc) {
-	t.samplesCount.Describe(ch)
-	t.sendLatency.Describe(ch)
-	ch <- t.failedBatches.Desc()
-	ch <- t.failedSamples.Desc()
+	t.sentSamplesTotal.Describe(ch)
+	t.sentBatchDuration.Describe(ch)
 	ch <- t.queueLength.Desc()
 	ch <- t.queueCapacity.Desc()
 }
@@ -192,11 +178,9 @@ func (t *StorageQueueManager) queueLen() int {
 
 // Collect implements prometheus.Collector.
 func (t *StorageQueueManager) Collect(ch chan<- prometheus.Metric) {
-	t.samplesCount.Collect(ch)
-	t.sendLatency.Collect(ch)
+	t.sentSamplesTotal.Collect(ch)
+	t.sentBatchDuration.Collect(ch)
 	t.queueLength.Set(float64(t.queueLen()))
-	ch <- t.failedBatches
-	ch <- t.failedSamples
 	ch <- t.queueLength
 	ch <- t.queueCapacity
 }
@@ -268,9 +252,7 @@ func (t *StorageQueueManager) sendSamples(s model.Samples) {
 	if err != nil {
 		log.Warnf("error sending %d samples to remote storage: %s", len(s), err)
 		labelValue = failure
-		t.failedBatches.Inc()
-		t.failedSamples.Add(float64(len(s)))
 	}
-	t.samplesCount.WithLabelValues(labelValue).Add(float64(len(s)))
-	t.sendLatency.Observe(duration)
+	t.sentSamplesTotal.WithLabelValues(labelValue).Add(float64(len(s)))
+	t.sentBatchDuration.WithLabelValues(labelValue).Observe(duration)
 }
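
As a follow-up to the sendSamples hunk above, a hedged sketch of the recording pattern it ends up with: time the send, derive a single outcome label, and apply it to both the per-sample counter and the per-batch histogram. The sendBatch helper, the stubbed send function, and the literal label values are hypothetical; only the metric calls mirror the diff.

package main

import (
	"errors"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// sendBatch mirrors the pattern above: one label value describes the outcome
// of the whole batch and is used for both metrics, so failed samples and the
// duration of failed batches can be selected with the same matcher.
func sendBatch(
	samples []float64,
	send func([]float64) error,
	sentSamplesTotal *prometheus.CounterVec,
	sentBatchDuration *prometheus.HistogramVec,
) {
	begin := time.Now()
	err := send(samples)
	duration := time.Since(begin).Seconds()

	labelValue := "success" // the real code uses named constants for these values
	if err != nil {
		labelValue = "failure"
	}
	sentSamplesTotal.WithLabelValues(labelValue).Add(float64(len(samples)))
	sentBatchDuration.WithLabelValues(labelValue).Observe(duration)
}

func main() {
	sentSamplesTotal := prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "sent_samples_total", Help: "Samples sent, by result."},
		[]string{"result"},
	)
	sentBatchDuration := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{Name: "sent_batch_duration_seconds", Help: "Batch send duration, by result.", Buckets: prometheus.DefBuckets},
		[]string{"result"},
	)
	prometheus.MustRegister(sentSamplesTotal, sentBatchDuration)

	// Exercise the failure path with a send that always errors.
	sendBatch([]float64{1, 2, 3}, func([]float64) error { return errors.New("remote unavailable") },
		sentSamplesTotal, sentBatchDuration)
}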