From c80ca020e10f787238e75a2e9b0a74e67e2de7fb Mon Sep 17 00:00:00 2001 From: Kyle Stang Date: Wed, 12 Feb 2025 22:53:00 +0000 Subject: [PATCH] metrics: add rule type to evaluation failures and rule groups metrics Adds a rule_type label to prometheus_rule_evaluation_failures_total and prometheus_rule_group_rules metrics to describe whether the metrics relate to alerting or recording rules. Signed-off-by: Kyle Stang --- go.mod | 2 +- go.sum | 4 ++-- rules/group.go | 34 ++++++++++++++++++++++++++++------ rules/manager.go | 6 ++++-- rules/manager_test.go | 6 +++--- 5 files changed, 38 insertions(+), 14 deletions(-) diff --git a/go.mod b/go.mod index 7509c5e2ee..3985197617 100644 --- a/go.mod +++ b/go.mod @@ -20,7 +20,7 @@ require ( github.com/digitalocean/godo v1.136.0 github.com/docker/docker v27.5.1+incompatible github.com/edsrzf/mmap-go v1.2.0 - github.com/envoyproxy/go-control-plane/envoy v1.32.3 + github.com/envoyproxy/go-control-plane/envoy v1.32.4 github.com/envoyproxy/protoc-gen-validate v1.2.1 github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb github.com/fsnotify/fsnotify v1.8.0 diff --git a/go.sum b/go.sum index 5cf3e3ab67..79767ddb2a 100644 --- a/go.sum +++ b/go.sum @@ -106,8 +106,8 @@ github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRr github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= -github.com/envoyproxy/go-control-plane/envoy v1.32.3 h1:hVEaommgvzTjTd4xCaFd+kEQ2iYBtGxP6luyLrx6uOk= -github.com/envoyproxy/go-control-plane/envoy v1.32.3/go.mod h1:F6hWupPfh75TBXGKA++MCT/CZHFq5r9/uwt/kQYkZfE= +github.com/envoyproxy/go-control-plane/envoy v1.32.4 h1:jb83lalDRZSpPWW2Z7Mck/8kXZ5CQAFYVjQcdVIr83A= +github.com/envoyproxy/go-control-plane/envoy v1.32.4/go.mod h1:Gzjc5k8JcJswLjAx1Zm+wSYE20UrLtt7JZMWiWQXQEw= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8= github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU= diff --git a/rules/group.go b/rules/group.go index 9ad9aab093..597096b71c 100644 --- a/rules/group.go +++ b/rules/group.go @@ -107,15 +107,27 @@ func NewGroup(o GroupOptions) *Group { metrics = NewGroupMetrics(opts.Registerer) } + alertingCount := 0 + recordingCount := 0 + for _, rule := range o.Rules { + if _, ok := rule.(*AlertingRule); ok { + alertingCount++ + } else { + recordingCount++ + } + } + key := GroupKey(o.File, o.Name) metrics.IterationsMissed.WithLabelValues(key) metrics.IterationsScheduled.WithLabelValues(key) metrics.EvalTotal.WithLabelValues(key) - metrics.EvalFailures.WithLabelValues(key) + metrics.EvalFailures.WithLabelValues(key, KindAlerting) + metrics.EvalFailures.WithLabelValues(key, KindRecording) metrics.GroupLastEvalTime.WithLabelValues(key) metrics.GroupLastDuration.WithLabelValues(key) metrics.GroupLastRuleDurationSum.WithLabelValues(key) - metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules))) + metrics.GroupRules.WithLabelValues(key, KindAlerting).Set(float64(alertingCount)) + metrics.GroupRules.WithLabelValues(key, KindRecording).Set(float64(recordingCount)) metrics.GroupSamples.WithLabelValues(key) metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds()) @@ -542,7 +554,12 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { rule.SetHealth(HealthBad) rule.SetLastError(err) sp.SetStatus(codes.Error, err.Error()) - g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + + if _, ok := rule.(*AlertingRule); ok { + g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindAlerting).Inc() + } else { + g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindRecording).Inc() + } // Canceled queries are intentional termination of queries. This normally // happens on shutdown and thus we skip logging of any errors here. @@ -572,7 +589,12 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { rule.SetHealth(HealthBad) rule.SetLastError(err) sp.SetStatus(codes.Error, err.Error()) - g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() + + if _, ok := rule.(*AlertingRule); ok { + g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindAlerting).Inc() + } else { + g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindRecording).Inc() + } logger.Warn("Rule sample appending failed", "err", err) return @@ -974,7 +996,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { Name: "rule_evaluation_failures_total", Help: "The total number of rule evaluation failures.", }, - []string{"rule_group"}, + []string{"rule_group", "rule_type"}, ), GroupInterval: prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -1022,7 +1044,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics { Name: "rule_group_rules", Help: "The number of rules.", }, - []string{"rule_group"}, + []string{"rule_group", "rule_type"}, ), GroupSamples: prometheus.NewGaugeVec( prometheus.GaugeOpts{ diff --git a/rules/manager.go b/rules/manager.go index 6e96df2168..689aaca49a 100644 --- a/rules/manager.go +++ b/rules/manager.go @@ -269,11 +269,13 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels m.IterationsMissed.DeleteLabelValues(n) m.IterationsScheduled.DeleteLabelValues(n) m.EvalTotal.DeleteLabelValues(n) - m.EvalFailures.DeleteLabelValues(n) + m.EvalFailures.DeleteLabelValues(n, KindAlerting) + m.EvalFailures.DeleteLabelValues(n, KindRecording) m.GroupInterval.DeleteLabelValues(n) m.GroupLastEvalTime.DeleteLabelValues(n) m.GroupLastDuration.DeleteLabelValues(n) - m.GroupRules.DeleteLabelValues(n) + m.GroupRules.DeleteLabelValues(n, KindAlerting) + m.GroupRules.DeleteLabelValues(n, KindRecording) m.GroupSamples.DeleteLabelValues((n)) } wg.Done() diff --git a/rules/manager_test.go b/rules/manager_test.go index 6c32b6d0f1..c8f27fdc5b 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -1024,11 +1024,11 @@ func TestMetricsUpdate(t *testing.T) { }{ { files: files, - metrics: 12, + metrics: 16, }, { files: files[:1], - metrics: 6, + metrics: 8, }, { files: files[:0], @@ -1036,7 +1036,7 @@ func TestMetricsUpdate(t *testing.T) { }, { files: files[1:], - metrics: 6, + metrics: 8, }, }