mirror of
https://github.com/prometheus/prometheus.git
synced 2024-11-12 16:44:05 -08:00
Rationalise retrieval metrics so that the result state (success/failed) is reported consistently for both samples and batches.
Also, report the total capacity across all queues, i.e. per-queue capacity * number of shards.
This commit is contained in:
parent
ece12bff93
commit
a6931b71e8
|
@ -65,12 +65,10 @@ type StorageQueueManager struct {
|
||||||
wg sync.WaitGroup
|
wg sync.WaitGroup
|
||||||
done chan struct{}
|
done chan struct{}
|
||||||
|
|
||||||
samplesCount *prometheus.CounterVec
|
sentSamplesTotal *prometheus.CounterVec
|
||||||
sendLatency prometheus.Summary
|
sentBatchDuration *prometheus.HistogramVec
|
||||||
failedBatches prometheus.Counter
|
queueLength prometheus.Gauge
|
||||||
failedSamples prometheus.Counter
|
queueCapacity prometheus.Metric
|
||||||
queueLength prometheus.Gauge
|
|
||||||
queueCapacity prometheus.Metric
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewStorageQueueManager builds a new StorageQueueManager.
|
// NewStorageQueueManager builds a new StorageQueueManager.
|
||||||
|
@ -94,37 +92,27 @@ func NewStorageQueueManager(tsdb StorageClient, cfg *StorageQueueManagerConfig)
|
||||||
shards: shards,
|
shards: shards,
|
||||||
done: make(chan struct{}),
|
done: make(chan struct{}),
|
||||||
|
|
||||||
samplesCount: prometheus.NewCounterVec(
|
sentSamplesTotal: prometheus.NewCounterVec(
|
||||||
prometheus.CounterOpts{
|
prometheus.CounterOpts{
|
||||||
Namespace: namespace,
|
Namespace: namespace,
|
||||||
Subsystem: subsystem,
|
Subsystem: subsystem,
|
||||||
Name: "sent_samples_total",
|
Name: "sent_samples_total",
|
||||||
Help: "Total number of processed samples to be sent to remote storage.",
|
Help: "Total number of processed samples sent to remote storage.",
|
||||||
ConstLabels: constLabels,
|
ConstLabels: constLabels,
|
||||||
},
|
},
|
||||||
[]string{result},
|
[]string{result},
|
||||||
),
|
),
|
||||||
sendLatency: prometheus.NewSummary(prometheus.SummaryOpts{
|
sentBatchDuration: prometheus.NewHistogramVec(
|
||||||
Namespace: namespace,
|
prometheus.HistogramOpts{
|
||||||
Subsystem: subsystem,
|
Namespace: namespace,
|
||||||
Name: "send_latency_seconds",
|
Subsystem: subsystem,
|
||||||
Help: "Latency quantiles for sending sample batches to the remote storage.",
|
Name: "sent_batch_duration_seconds",
|
||||||
ConstLabels: constLabels,
|
Help: "Duration of sample batch send calls to the remote storage.",
|
||||||
}),
|
ConstLabels: constLabels,
|
||||||
failedBatches: prometheus.NewCounter(prometheus.CounterOpts{
|
Buckets: prometheus.DefBuckets,
|
||||||
Namespace: namespace,
|
},
|
||||||
Subsystem: subsystem,
|
[]string{result},
|
||||||
Name: "failed_batches_total",
|
),
|
||||||
Help: "Total number of sample batches that encountered an error while being sent to the remote storage.",
|
|
||||||
ConstLabels: constLabels,
|
|
||||||
}),
|
|
||||||
failedSamples: prometheus.NewCounter(prometheus.CounterOpts{
|
|
||||||
Namespace: namespace,
|
|
||||||
Subsystem: subsystem,
|
|
||||||
Name: "failed_samples_total",
|
|
||||||
Help: "Total number of samples that encountered an error while being sent to the remote storage.",
|
|
||||||
ConstLabels: constLabels,
|
|
||||||
}),
|
|
||||||
queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
|
queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
|
||||||
Namespace: namespace,
|
Namespace: namespace,
|
||||||
Subsystem: subsystem,
|
Subsystem: subsystem,
|
||||||
|
@ -140,7 +128,7 @@ func NewStorageQueueManager(tsdb StorageClient, cfg *StorageQueueManagerConfig)
|
||||||
constLabels,
|
constLabels,
|
||||||
),
|
),
|
||||||
prometheus.GaugeValue,
|
prometheus.GaugeValue,
|
||||||
float64(cfg.QueueCapacity),
|
float64(cfg.QueueCapacity*cfg.Shards),
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -158,7 +146,7 @@ func (t *StorageQueueManager) Append(s *model.Sample) error {
|
||||||
select {
|
select {
|
||||||
case t.shards[shard] <- s:
|
case t.shards[shard] <- s:
|
||||||
default:
|
default:
|
||||||
t.samplesCount.WithLabelValues(dropped).Inc()
|
t.sentSamplesTotal.WithLabelValues(dropped).Inc()
|
||||||
log.Warn("Remote storage queue full, discarding sample.")
|
log.Warn("Remote storage queue full, discarding sample.")
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
|
@ -173,10 +161,8 @@ func (*StorageQueueManager) NeedsThrottling() bool {
|
||||||
|
|
||||||
// Describe implements prometheus.Collector.
|
// Describe implements prometheus.Collector.
|
||||||
func (t *StorageQueueManager) Describe(ch chan<- *prometheus.Desc) {
|
func (t *StorageQueueManager) Describe(ch chan<- *prometheus.Desc) {
|
||||||
t.samplesCount.Describe(ch)
|
t.sentSamplesTotal.Describe(ch)
|
||||||
t.sendLatency.Describe(ch)
|
t.sentBatchDuration.Describe(ch)
|
||||||
ch <- t.failedBatches.Desc()
|
|
||||||
ch <- t.failedSamples.Desc()
|
|
||||||
ch <- t.queueLength.Desc()
|
ch <- t.queueLength.Desc()
|
||||||
ch <- t.queueCapacity.Desc()
|
ch <- t.queueCapacity.Desc()
|
||||||
}
|
}
|
||||||
|
@ -192,11 +178,9 @@ func (t *StorageQueueManager) queueLen() int {
|
||||||
|
|
||||||
// Collect implements prometheus.Collector.
|
// Collect implements prometheus.Collector.
|
||||||
func (t *StorageQueueManager) Collect(ch chan<- prometheus.Metric) {
|
func (t *StorageQueueManager) Collect(ch chan<- prometheus.Metric) {
|
||||||
t.samplesCount.Collect(ch)
|
t.sentSamplesTotal.Collect(ch)
|
||||||
t.sendLatency.Collect(ch)
|
t.sentBatchDuration.Collect(ch)
|
||||||
t.queueLength.Set(float64(t.queueLen()))
|
t.queueLength.Set(float64(t.queueLen()))
|
||||||
ch <- t.failedBatches
|
|
||||||
ch <- t.failedSamples
|
|
||||||
ch <- t.queueLength
|
ch <- t.queueLength
|
||||||
ch <- t.queueCapacity
|
ch <- t.queueCapacity
|
||||||
}
|
}
|
||||||
|
@ -268,9 +252,7 @@ func (t *StorageQueueManager) sendSamples(s model.Samples) {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warnf("error sending %d samples to remote storage: %s", len(s), err)
|
log.Warnf("error sending %d samples to remote storage: %s", len(s), err)
|
||||||
labelValue = failure
|
labelValue = failure
|
||||||
t.failedBatches.Inc()
|
|
||||||
t.failedSamples.Add(float64(len(s)))
|
|
||||||
}
|
}
|
||||||
t.samplesCount.WithLabelValues(labelValue).Add(float64(len(s)))
|
t.sentSamplesTotal.WithLabelValues(labelValue).Add(float64(len(s)))
|
||||||
t.sendLatency.Observe(duration)
|
t.sentBatchDuration.WithLabelValues(labelValue).Observe(duration)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue