Improve remote storage queue manager metrics.

Julius Volz 2015-09-16 16:22:18 +02:00 committed by Fabian Reinartz
parent 9a6e7b3e3b
commit 09b557a085


@@ -62,7 +62,8 @@ type StorageQueueManager struct {
 
 	samplesCount  *prometheus.CounterVec
 	sendLatency   prometheus.Summary
-	sendErrors    prometheus.Counter
+	failedBatches prometheus.Counter
+	failedSamples prometheus.Counter
 	queueLength   prometheus.Gauge
 	queueCapacity prometheus.Metric
 }
@@ -92,15 +93,22 @@ func NewStorageQueueManager(tsdb StorageClient, queueCapacity int) *StorageQueueManager {
 		sendLatency: prometheus.NewSummary(prometheus.SummaryOpts{
 			Namespace:   namespace,
 			Subsystem:   subsystem,
-			Name:        "sent_latency_milliseconds",
+			Name:        "send_latency_seconds",
 			Help:        "Latency quantiles for sending sample batches to the remote storage.",
 			ConstLabels: constLabels,
 		}),
-		sendErrors: prometheus.NewCounter(prometheus.CounterOpts{
+		failedBatches: prometheus.NewCounter(prometheus.CounterOpts{
 			Namespace:   namespace,
 			Subsystem:   subsystem,
-			Name:        "sent_errors_total",
-			Help:        "Total number of errors sending sample batches to the remote storage.",
+			Name:        "failed_batches_total",
+			Help:        "Total number of sample batches that encountered an error while being sent to the remote storage.",
+			ConstLabels: constLabels,
+		}),
+		failedSamples: prometheus.NewCounter(prometheus.CounterOpts{
+			Namespace:   namespace,
+			Subsystem:   subsystem,
+			Name:        "failed_samples_total",
+			Help:        "Total number of samples that encountered an error while being sent to the remote storage.",
 			ConstLabels: constLabels,
 		}),
 		queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
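
The renamed Summary switches the unit from milliseconds to seconds, following Prometheus's base-unit naming. Below is a minimal, self-contained sketch of declaring such a Summary and observing a send in seconds; the package name, the namespace/subsystem values, and the timedSend helper are assumptions for illustration, not code from this commit:

package remote

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// sendLatency mirrors the renamed metric; its base unit is seconds.
var sendLatency = prometheus.NewSummary(prometheus.SummaryOpts{
	Namespace: "prometheus",     // assumed value
	Subsystem: "remote_storage", // assumed value
	Name:      "send_latency_seconds",
	Help:      "Latency quantiles for sending sample batches to the remote storage.",
})

// timedSend wraps an arbitrary send function and records its latency.
func timedSend(sendBatch func() error) error {
	begin := time.Now()
	err := sendBatch()
	// Observe elapsed wall-clock time in seconds to match the metric name.
	sendLatency.Observe(time.Since(begin).Seconds())
	return err
}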
@@ -151,6 +159,8 @@ func (t *StorageQueueManager) Stop() {
 func (t *StorageQueueManager) Describe(ch chan<- *prometheus.Desc) {
 	t.samplesCount.Describe(ch)
 	t.sendLatency.Describe(ch)
+	ch <- t.failedBatches.Desc()
+	ch <- t.failedSamples.Desc()
 	ch <- t.queueLength.Desc()
 	ch <- t.queueCapacity.Desc()
 }
@@ -160,6 +170,8 @@ func (t *StorageQueueManager) Collect(ch chan<- prometheus.Metric) {
 	t.samplesCount.Collect(ch)
 	t.sendLatency.Collect(ch)
 	t.queueLength.Set(float64(len(t.queue)))
+	ch <- t.failedBatches
+	ch <- t.failedSamples
 	ch <- t.queueLength
 	ch <- t.queueCapacity
 }
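
Together, Describe and Collect are what make StorageQueueManager a prometheus.Collector: vector metrics forward to their own Describe/Collect, while plain counters are sent first as descriptors and then as live metrics. A stripped-down sketch of that pattern follows; the queueManager type is an illustrative stand-in, not the real struct:

package remote

import "github.com/prometheus/client_golang/prometheus"

// queueManager is an illustrative stand-in for StorageQueueManager.
type queueManager struct {
	samplesCount  *prometheus.CounterVec
	failedBatches prometheus.Counter
	failedSamples prometheus.Counter
}

// Describe sends every metric descriptor exactly once.
func (m *queueManager) Describe(ch chan<- *prometheus.Desc) {
	m.samplesCount.Describe(ch)  // vectors describe all their children
	ch <- m.failedBatches.Desc() // plain metrics expose a single Desc
	ch <- m.failedSamples.Desc()
}

// Collect sends the current value of every metric.
func (m *queueManager) Collect(ch chan<- prometheus.Metric) {
	m.samplesCount.Collect(ch)
	ch <- m.failedBatches // a Counter is itself a prometheus.Metric
	ch <- m.failedSamples
}

An instance that implements both methods can then be handed to prometheus.MustRegister like any other collector.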
@@ -175,13 +187,14 @@ func (t *StorageQueueManager) sendSamples(s model.Samples) {
 	// floor.
 	begin := time.Now()
 	err := t.tsdb.Store(s)
-	duration := time.Since(begin) / time.Millisecond
+	duration := time.Since(begin) / time.Second
 	labelValue := success
 	if err != nil {
 		log.Warnf("error sending %d samples to remote storage: %s", len(s), err)
 		labelValue = failure
-		t.sendErrors.Inc()
+		t.failedBatches.Inc()
+		t.failedSamples.Add(float64(len(s)))
 	}
 	t.samplesCount.WithLabelValues(labelValue).Add(float64(len(s)))
 	t.sendLatency.Observe(float64(duration))
 }
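
Splitting the old error counter in two means a single failed batch increments failedBatches by one and failedSamples by the batch size. A hedged sketch of that accounting; the counter declarations here omit namespace, subsystem, and const labels, and recordFailure is a stand-in rather than the commit's sendSamples signature:

package remote

import "github.com/prometheus/client_golang/prometheus"

var (
	failedBatches = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "failed_batches_total",
		Help: "Total number of sample batches that encountered an error while being sent to the remote storage.",
	})
	failedSamples = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "failed_samples_total",
		Help: "Total number of samples that encountered an error while being sent to the remote storage.",
	})
)

// recordFailure updates both counters for one batch that could not be sent.
func recordFailure(batchSize int) {
	failedBatches.Inc()                   // one failed batch...
	failedSamples.Add(float64(batchSize)) // ...accounts for every sample in it
}

Tracking failures per sample as well as per batch lets a ratio of failed samples to all sent samples be computed, which the old single error counter could not express.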