From 1909686789a5a5f1cfd7b87807bba819187d5b1f Mon Sep 17 00:00:00 2001 From: Bjoern Rabenstein Date: Wed, 23 Jul 2014 19:55:33 +0200 Subject: [PATCH] Make metrics exported by the Prometheus server itself more consistent. - Always spell out the time unit (e.g. milliseconds instead of ms). - Remove "_total" from the names of metrics that are not counters. - Make use of the "Namespace" and "Subsystem" fields in the options. - Remove the "capacity" facet from all metrics about channels/queues. The capacities are all fixed via command line flags and will never change during the runtime of a process. Also, they should not be part of the same metric family. I have added separate metrics for the capacity of queues as a convenience. (They will never change and are only set once.) - I kept "disk" in the name of "metric_disk_latency_microseconds" (now "metric_disk_latency_milliseconds", as only the time unit changed), although that metric measures the latency of the storage device, even if it is not a spinning disk. "SSD" is read by many as "solid state disk", so it's not too far off. (It should be "solid state drive", of course, but "metric_drive_latency_milliseconds" is probably confusing.) - Brian suggested not mixing "failure" and "success" outcomes in the same metric family (distinguished by labels). For now, I left it as it is. We are touching on a bigger issue here, especially as other parts in the Prometheus ecosystem are following the same principle. We still need to come to terms here and then change things consistently everywhere. 
Change-Id: If799458b450d18f78500f05990301c12525197d3 --- notification/notification.go | 47 ++++++++++++++--------- retrieval/target.go | 25 +++++++----- retrieval/target_provider.go | 5 ++- retrieval/targetpool.go | 5 ++- rules/manager/manager.go | 24 ++++++------ storage/metric/tiered/curator.go | 10 +++-- storage/metric/tiered/tiered.go | 65 +++++++++++++++++--------------- storage/remote/queue_manager.go | 55 ++++++++++++++++----------- 8 files changed, 136 insertions(+), 100 deletions(-) diff --git a/notification/notification.go b/notification/notification.go index 21012ba959..cf9d285ef2 100644 --- a/notification/notification.go +++ b/notification/notification.go @@ -37,14 +37,13 @@ const ( // String constants for instrumentation. const ( + namespace = "prometheus" + subsystem = "notifications" + result = "result" success = "success" failure = "failure" dropped = "dropped" - - facet = "facet" - occupancy = "occupancy" - capacity = "capacity" ) var ( @@ -86,8 +85,9 @@ type NotificationHandler struct { // HTTP client with custom timeout settings. httpClient httpPoster - notificationLatency *prometheus.SummaryVec - notificationsQueueSize *prometheus.GaugeVec + notificationLatency *prometheus.SummaryVec + notificationsQueueLength prometheus.Gauge + notificationsQueueCapacity prometheus.Metric } // Construct a new NotificationHandler. 
@@ -99,17 +99,27 @@ func NewNotificationHandler(alertmanagerUrl string, notificationReqs <-chan Noti notificationLatency: prometheus.NewSummaryVec( prometheus.SummaryOpts{ - Name: "prometheus_notifications_latency_ms", - Help: "Latency quantiles for sending alert notifications in milliseconds.", + Namespace: namespace, + Subsystem: subsystem, + Name: "latency_milliseconds", + Help: "Latency quantiles for sending alert notifications.", }, []string{result}, ), - notificationsQueueSize: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "prometheus_notifications_queue_size_total", - Help: "The size and capacity of the alert notification queue.", - }, - []string{facet}, + notificationsQueueLength: prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "queue_length", + Help: "The number of alert notifications in the queue.", + }), + notificationsQueueCapacity: prometheus.MustNewConstMetric( + prometheus.NewDesc( + prometheus.BuildFQName(namespace, subsystem, "queue_capacity"), + "The capacity of the alert notifications queue.", + nil, nil, + ), + prometheus.GaugeValue, + float64(cap(notificationReqs)), ), } } @@ -180,13 +190,14 @@ func (n *NotificationHandler) Run() { // Describe implements prometheus.Collector. func (n *NotificationHandler) Describe(ch chan<- *prometheus.Desc) { n.notificationLatency.Describe(ch) - n.notificationsQueueSize.Describe(ch) + ch <- n.notificationsQueueLength.Desc() + ch <- n.notificationsQueueCapacity.Desc() } // Collect implements prometheus.Collector. 
func (n *NotificationHandler) Collect(ch chan<- prometheus.Metric) { n.notificationLatency.Collect(ch) - n.notificationsQueueSize.WithLabelValues(occupancy).Set(float64(len(n.pendingNotifications))) - n.notificationsQueueSize.WithLabelValues(capacity).Set(float64(cap(n.pendingNotifications))) - n.notificationsQueueSize.Collect(ch) + n.notificationsQueueLength.Set(float64(len(n.pendingNotifications))) + ch <- n.notificationsQueueLength + ch <- n.notificationsQueueCapacity } diff --git a/retrieval/target.go b/retrieval/target.go index 7a761593d7..976f8f871d 100644 --- a/retrieval/target.go +++ b/retrieval/target.go @@ -35,12 +35,12 @@ const ( ScrapeHealthMetricName clientmodel.LabelValue = "up" // Constants for instrumentation. - address = "instance" - alive = "alive" - failure = "failure" - outcome = "outcome" - state = "state" - success = "success" + namespace = "prometheus" + job = "target_job" + instance = "target_instance" + failure = "failure" + outcome = "outcome" + success = "success" ) var ( @@ -48,11 +48,12 @@ var ( targetOperationLatencies = prometheus.NewSummaryVec( prometheus.SummaryOpts{ - Name: "prometheus_target_operation_latency_ms", - Help: "The latencies for various target operations.", + Namespace: namespace, + Name: "target_operation_latency_milliseconds", + Help: "The latencies for target operations.", Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99}, }, - []string{address, outcome}, + []string{job, instance, outcome}, ) ) @@ -196,7 +197,11 @@ const acceptHeader = `application/vnd.google.protobuf;proto=io.prometheus.client func (t *target) scrape(timestamp clientmodel.Timestamp, ingester extraction.Ingester) (err error) { defer func(start time.Time) { ms := float64(time.Since(start)) / float64(time.Millisecond) - labels := prometheus.Labels{address: t.Address(), outcome: success} + labels := prometheus.Labels{ + job: string(t.baseLabels[clientmodel.JobLabel]), + instance: t.Address(), + outcome: success, + } if err != nil { labels[outcome] = 
failure } diff --git a/retrieval/target_provider.go b/retrieval/target_provider.go index 0b19b39c88..15cc282aea 100644 --- a/retrieval/target_provider.go +++ b/retrieval/target_provider.go @@ -35,8 +35,9 @@ const resolvConf = "/etc/resolv.conf" var ( dnsSDLookupsCount = prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: "prometheus_dns_sd_lookups_total", - Help: "The number of DNS-SD lookup successes/failures per pool.", + Namespace: namespace, + Name: "dns_sd_lookups_total", + Help: "The number of DNS-SD lookup successes/failures per pool.", }, []string{outcome}, ) diff --git a/retrieval/targetpool.go b/retrieval/targetpool.go index f1a87d0e1f..bce49be4cd 100644 --- a/retrieval/targetpool.go +++ b/retrieval/targetpool.go @@ -32,8 +32,9 @@ const ( var ( retrievalDurations = prometheus.NewSummaryVec( prometheus.SummaryOpts{ - Name: "prometheus_targetpool_duration_ms", - Help: "The durations for each TargetPool to retrieve state from all included entities.", + Namespace: namespace, + Name: "targetpool_retrieve_time_milliseconds", + Help: "The time needed for each TargetPool to retrieve state from all included entities.", Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99}, }, []string{intervalKey}, diff --git a/rules/manager/manager.go b/rules/manager/manager.go index b69546d8ab..e1917454d7 100644 --- a/rules/manager/manager.go +++ b/rules/manager/manager.go @@ -33,7 +33,8 @@ import ( // Constants for instrumentation. 
const ( - intervalLabel = "interval" + namespace = "prometheus" + ruleTypeLabel = "rule_type" alertingRuleType = "alerting" recordingRuleType = "recording" @@ -42,19 +43,18 @@ const ( var ( evalDuration = prometheus.NewSummaryVec( prometheus.SummaryOpts{ - Name: "prometheus_rule_evaluation_duration_ms", - Help: "The duration for a rule to execute.", + Namespace: namespace, + Name: "rule_evaluation_duration_milliseconds", + Help: "The duration for a rule to execute.", }, []string{ruleTypeLabel}, ) - iterationDuration = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ - Name: "prometheus_evaluator_duration_ms", - Help: "The duration for each evaluation pool to execute.", - Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99}, - }, - []string{intervalLabel}, - ) + iterationDuration = prometheus.NewSummary(prometheus.SummaryOpts{ + Namespace: namespace, + Name: "evaluator_duration_milliseconds", + Help: "The duration for all evaluations to execute.", + Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99}, + }) ) func init() { @@ -124,7 +124,7 @@ func (m *ruleManager) Run() { case <-ticker.C: start := time.Now() m.runIteration(m.results) - iterationDuration.WithLabelValues(m.interval.String()).Observe(float64(time.Since(start) / time.Millisecond)) + iterationDuration.Observe(float64(time.Since(start) / time.Millisecond)) case <-m.done: glog.Info("rules.Rule manager exiting...") return diff --git a/storage/metric/tiered/curator.go b/storage/metric/tiered/curator.go index ba2c63b3a4..ace1d58eb5 100644 --- a/storage/metric/tiered/curator.go +++ b/storage/metric/tiered/curator.go @@ -47,16 +47,18 @@ const ( var ( curationDurations = prometheus.NewSummaryVec( prometheus.SummaryOpts{ - Name: "prometheus_curation_durations_ms", - Help: "Histogram of time spent in curation (ms).", + Namespace: namespace, + Name: "curation_durations_milliseconds", + Help: "Histogram of time spent in curation.", Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99}, }, []string{cutOff, processorName, 
result}, ) curationFilterOperations = prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: "prometheus_curation_filter_operations_total", - Help: "The number of curation filter operations completed.", + Namespace: namespace, + Name: "curation_filter_operations_total", + Help: "The number of curation filter operations completed.", }, []string{cutOff, processorName, result}, ) diff --git a/storage/metric/tiered/tiered.go b/storage/metric/tiered/tiered.go index cc02e84805..cc5e4fc58a 100644 --- a/storage/metric/tiered/tiered.go +++ b/storage/metric/tiered/tiered.go @@ -33,6 +33,8 @@ import ( // Constants for instrumentation. const ( + namespace = "prometheus" + operation = "operation" success = "success" failure = "failure" @@ -51,24 +53,22 @@ const ( queue = "queue" appendToDisk = "append_to_disk" viewGeneration = "view_generation" - - facet = "facet" - occupancy = "occupancy" - capacity = "capacity" ) var ( storageLatency = prometheus.NewSummaryVec( prometheus.SummaryOpts{ - Name: "prometheus_metric_disk_latency_microseconds", - Help: "Latency for metric disk operations in microseconds.", + Namespace: namespace, + Name: "metric_disk_latency_milliseconds", + Help: "Latency for metric disk operations (includes any storage drive even if it is not strictly a disk, e.g. SSD).", Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99}, }, []string{operation, result}, ) storedSamplesCount = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "prometheus_stored_samples_total", - Help: "The number of samples that have been stored.", + Namespace: namespace, + Name: "stored_samples_total", + Help: "The number of samples that have been stored.", }) ) @@ -145,7 +145,8 @@ type TieredStorage struct { dtoSampleKeys *dtoSampleKeyList sampleKeys *sampleKeyList - queueSizes *prometheus.GaugeVec + queueLength *prometheus.GaugeVec + queueCapacity *prometheus.GaugeVec } // viewJob encapsulates a request to extract sample values from the datastore. 
@@ -159,10 +160,9 @@ type viewJob struct { const ( tieredMemorySemaphores = 5 + watermarkCacheLimit = 1024 * 1024 ) -const watermarkCacheLimit = 1024 * 1024 - // NewTieredStorage returns a TieredStorage object ready to use. func NewTieredStorage( appendToDiskQueueDepth, @@ -208,14 +208,25 @@ func NewTieredStorage( dtoSampleKeys: newDtoSampleKeyList(10), sampleKeys: newSampleKeyList(10), - queueSizes: prometheus.NewGaugeVec( + queueLength: prometheus.NewGaugeVec( prometheus.GaugeOpts{ - Name: "prometheus_storage_queue_sizes_total", - Help: "The various sizes and capacities of the storage queues.", + Namespace: namespace, + Name: "storage_queue_length", + Help: "The number of items in the storage queues.", }, - []string{queue, facet}, + []string{queue}, + ), + queueCapacity: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Name: "storage_queue_capacity", + Help: "The capacity of the storage queues.", + }, + []string{queue}, ), } + s.queueCapacity.WithLabelValues(appendToDisk).Set(float64(appendToDiskQueueDepth)) + s.queueCapacity.WithLabelValues(viewGeneration).Set(float64(viewQueueDepth)) for i := 0; i < tieredMemorySemaphores; i++ { s.memorySemaphore <- true @@ -444,13 +455,13 @@ func (t *TieredStorage) renderView(viewJob viewJob) { storageLatency.With( prometheus.Labels{operation: renderView, result: success}, ).Observe( - float64(time.Since(begin) / time.Microsecond), + float64(time.Since(begin) / time.Millisecond), ) } else { storageLatency.With( prometheus.Labels{operation: renderView, result: failure}, ).Observe( - float64(time.Since(begin) / time.Microsecond), + float64(time.Since(begin) / time.Millisecond), ) } }() @@ -788,23 +799,15 @@ func (t *TieredStorage) GetMetricForFingerprint(f *clientmodel.Fingerprint) (cli // Describe implements prometheus.Collector. 
func (t *TieredStorage) Describe(ch chan<- *prometheus.Desc) { - t.queueSizes.Describe(ch) + t.queueLength.Describe(ch) + t.queueCapacity.Describe(ch) } // Collect implements prometheus.Collector. func (t *TieredStorage) Collect(ch chan<- prometheus.Metric) { - t.queueSizes.With(prometheus.Labels{ - queue: appendToDisk, facet: occupancy, - }).Set(float64(len(t.appendToDiskQueue))) - t.queueSizes.With(prometheus.Labels{ - queue: appendToDisk, facet: capacity, - }).Set(float64(cap(t.appendToDiskQueue))) - t.queueSizes.With(prometheus.Labels{ - queue: viewGeneration, facet: occupancy, - }).Set(float64(len(t.ViewQueue))) - t.queueSizes.With(prometheus.Labels{ - queue: viewGeneration, facet: capacity, - }).Set(float64(cap(t.ViewQueue))) + t.queueLength.WithLabelValues(appendToDisk).Set(float64(len(t.appendToDiskQueue))) + t.queueLength.WithLabelValues(viewGeneration).Set(float64(len(t.ViewQueue))) - t.queueSizes.Collect(ch) + t.queueLength.Collect(ch) + t.queueCapacity.Collect(ch) } diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go index 6ef4f3c045..50e0837658 100644 --- a/storage/remote/queue_manager.go +++ b/storage/remote/queue_manager.go @@ -34,14 +34,13 @@ const ( // String constants for instrumentation. const ( + namespace = "prometheus" + subsystem = "remote_tsdb" + result = "result" success = "success" failure = "failure" dropped = "dropped" - - facet = "facet" - occupancy = "occupancy" - capacity = "capacity" ) // TSDBClient defines an interface for sending a batch of samples to an @@ -59,9 +58,10 @@ type TSDBQueueManager struct { sendSemaphore chan bool drained chan bool - samplesCount *prometheus.CounterVec - sendLatency *prometheus.SummaryVec - queueSize *prometheus.GaugeVec + samplesCount *prometheus.CounterVec + sendLatency *prometheus.SummaryVec + queueLength prometheus.Gauge + queueCapacity prometheus.Metric } // NewTSDBQueueManager builds a new TSDBQueueManager. 
@@ -74,24 +74,36 @@ func NewTSDBQueueManager(tsdb TSDBClient, queueCapacity int) *TSDBQueueManager { samplesCount: prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: "prometheus_remote_tsdb_sent_samples_total", - Help: "Total number of samples processed to be sent to remote TSDB.", + Namespace: namespace, + Subsystem: subsystem, + Name: "sent_samples_total", + Help: "Total number of processed samples to be sent to remote TSDB.", }, []string{result}, ), sendLatency: prometheus.NewSummaryVec( prometheus.SummaryOpts{ - Name: "prometheus_remote_tsdb_latency_ms", - Help: "Latency quantiles for sending samples to the remote TSDB in milliseconds.", + Namespace: namespace, + Subsystem: subsystem, + Name: "sent_latency_milliseconds", + Help: "Latency quantiles for sending samples to the remote TSDB.", }, []string{result}, ), - queueSize: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "prometheus_remote_tsdb_queue_size_total", - Help: "The size and capacity of the queue of samples to be sent to the remote TSDB.", - }, - []string{facet}, + queueLength: prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: subsystem, + Name: "queue_length", + Help: "The number of processed samples queued to be sent to the remote TSDB.", + }), + queueCapacity: prometheus.MustNewConstMetric( + prometheus.NewDesc( + prometheus.BuildFQName(namespace, subsystem, "queue_capacity"), + "The capacity of the queue of samples to be sent to the remote TSDB.", + nil, nil, + ), + prometheus.GaugeValue, + float64(queueCapacity), ), } } @@ -122,16 +134,17 @@ func (t *TSDBQueueManager) Close() { func (t *TSDBQueueManager) Describe(ch chan<- *prometheus.Desc) { t.samplesCount.Describe(ch) t.sendLatency.Describe(ch) - t.queueSize.Describe(ch) + ch <- t.queueLength.Desc() + ch <- t.queueCapacity.Desc() } // Collect implements prometheus.Collector. 
func (t *TSDBQueueManager) Collect(ch chan<- prometheus.Metric) { t.samplesCount.Collect(ch) t.sendLatency.Collect(ch) - t.queueSize.WithLabelValues(occupancy).Set(float64(len(t.queue))) - t.queueSize.WithLabelValues(capacity).Set(float64(cap(t.queue))) - t.queueSize.Collect(ch) + t.queueLength.Set(float64(len(t.queue))) + ch <- t.queueLength + ch <- t.queueCapacity } func (t *TSDBQueueManager) sendSamples(s clientmodel.Samples) {