From 00730bfee7fb8b46c18837139965e3c7bc70ef36 Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Wed, 8 Apr 2020 17:21:37 -0400 Subject: [PATCH] add rule_group label to rule evaluation metrics (#7094) Signed-off-by: yeya24 --- rules/manager.go | 53 +++++++++++++++++++++++++------------------ rules/manager_test.go | 8 ++++--- 2 files changed, 36 insertions(+), 25 deletions(-) diff --git a/rules/manager.go b/rules/manager.go index a37f742eb..eca688c05 100644 --- a/rules/manager.go +++ b/rules/manager.go @@ -54,11 +54,11 @@ const namespace = "prometheus" // Metrics for rule evaluation. type Metrics struct { evalDuration prometheus.Summary - evalFailures prometheus.Counter - evalTotal prometheus.Counter iterationDuration prometheus.Summary iterationsMissed prometheus.Counter iterationsScheduled prometheus.Counter + evalTotal *prometheus.CounterVec + evalFailures *prometheus.CounterVec groupInterval *prometheus.GaugeVec groupLastEvalTime *prometheus.GaugeVec groupLastDuration *prometheus.GaugeVec @@ -76,18 +76,6 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { Help: "The duration for a rule to execute.", Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, }), - evalFailures: prometheus.NewCounter( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "rule_evaluation_failures_total", - Help: "The total number of rule evaluation failures.", - }), - evalTotal: prometheus.NewCounter( - prometheus.CounterOpts{ - Namespace: namespace, - Name: "rule_evaluations_total", - Help: "The total number of rule evaluations.", - }), iterationDuration: prometheus.NewSummary(prometheus.SummaryOpts{ Namespace: namespace, Name: "rule_group_duration_seconds", @@ -104,6 +92,22 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { Name: "rule_group_iterations_total", Help: "The total number of scheduled rule group evaluations, whether executed or missed.", }), + evalTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: "rule_evaluations_total", + Help: "The total number of rule evaluations.", + }, + []string{"rule_group"}, + ), + evalFailures: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: "rule_evaluation_failures_total", + Help: "The total number of rule evaluation failures.", + }, + []string{"rule_group"}, + ), groupInterval: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: namespace, @@ -141,11 +145,11 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { if reg != nil { reg.MustRegister( m.evalDuration, - m.evalFailures, - m.evalTotal, m.iterationDuration, m.iterationsMissed, m.iterationsScheduled, + m.evalTotal, + m.evalFailures, m.groupInterval, m.groupLastEvalTime, m.groupLastDuration, @@ -257,10 +261,13 @@ func NewGroup(o GroupOptions) *Group { metrics = NewGroupMetrics(o.Opts.Registerer) } - metrics.groupLastEvalTime.WithLabelValues(groupKey(o.File, o.Name)) - metrics.groupLastDuration.WithLabelValues(groupKey(o.File, o.Name)) - metrics.groupRules.WithLabelValues(groupKey(o.File, o.Name)).Set(float64(len(o.Rules))) - metrics.groupInterval.WithLabelValues(groupKey(o.File, o.Name)).Set(o.Interval.Seconds()) + key := groupKey(o.File, o.Name) + metrics.evalTotal.WithLabelValues(key) + metrics.evalFailures.WithLabelValues(key) + metrics.groupLastEvalTime.WithLabelValues(key) + metrics.groupLastDuration.WithLabelValues(key) + metrics.groupRules.WithLabelValues(key).Set(float64(len(o.Rules))) + metrics.groupInterval.WithLabelValues(key).Set(o.Interval.Seconds()) return &Group{ name: o.Name, @@ -567,7 +574,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { rule.SetEvaluationTimestamp(t) }(time.Now()) - g.metrics.evalTotal.Inc() + g.metrics.evalTotal.WithLabelValues(groupKey(g.File(), g.Name())).Inc() vector, err := rule.Eval(ctx, ts, g.opts.QueryFunc, g.opts.ExternalURL) if err != nil { @@ -576,7 +583,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { if _, ok := err.(promql.ErrQueryCanceled); !ok { level.Warn(g.logger).Log("msg", "Evaluating rule failed", "rule", rule, "err", err) } - g.metrics.evalFailures.Inc() + g.metrics.evalFailures.WithLabelValues(groupKey(g.File(), g.Name())).Inc() return } @@ -945,6 +952,8 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels go func(n string, g *Group) { g.stopAndMakeStale() if m := g.metrics; m != nil { + m.evalTotal.DeleteLabelValues(n) + m.evalFailures.DeleteLabelValues(n) m.groupInterval.DeleteLabelValues(n) m.groupLastEvalTime.DeleteLabelValues(n) m.groupLastDuration.DeleteLabelValues(n) diff --git a/rules/manager_test.go b/rules/manager_test.go index 4d26885c8..99e305ca6 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -901,6 +901,8 @@ func TestNotify(t *testing.T) { func TestMetricsUpdate(t *testing.T) { files := []string{"fixtures/rules.yaml", "fixtures/rules2.yaml"} metricNames := []string{ + "prometheus_rule_evaluations_total", + "prometheus_rule_evaluation_failures_total", "prometheus_rule_group_interval_seconds", "prometheus_rule_group_last_duration_seconds", "prometheus_rule_group_last_evaluation_timestamp_seconds", @@ -950,11 +952,11 @@ func TestMetricsUpdate(t *testing.T) { }{ { files: files, - metrics: 8, + metrics: 12, }, { files: files[:1], - metrics: 4, + metrics: 6, }, { files: files[:0], @@ -962,7 +964,7 @@ func TestMetricsUpdate(t *testing.T) { }, { files: files[1:], - metrics: 4, + metrics: 6, }, }