mirror of
https://github.com/prometheus/prometheus.git
synced 2025-03-05 20:59:13 -08:00
metrics: add rule type to evaluation failures and rule groups metrics
Adds a `rule_type` label to the `prometheus_rule_evaluation_failures_total` and `prometheus_rule_group_rules` metrics to describe whether the metrics relate to alerting or recording rules.

Signed-off-by: Kyle Stang <kylestng@amazon.com>
This commit is contained in:
parent
a5ffa83be8
commit
c80ca020e1
2
go.mod
2
go.mod
|
@ -20,7 +20,7 @@ require (
|
||||||
github.com/digitalocean/godo v1.136.0
|
github.com/digitalocean/godo v1.136.0
|
||||||
github.com/docker/docker v27.5.1+incompatible
|
github.com/docker/docker v27.5.1+incompatible
|
||||||
github.com/edsrzf/mmap-go v1.2.0
|
github.com/edsrzf/mmap-go v1.2.0
|
||||||
github.com/envoyproxy/go-control-plane/envoy v1.32.3
|
github.com/envoyproxy/go-control-plane/envoy v1.32.4
|
||||||
github.com/envoyproxy/protoc-gen-validate v1.2.1
|
github.com/envoyproxy/protoc-gen-validate v1.2.1
|
||||||
github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb
|
github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb
|
||||||
github.com/fsnotify/fsnotify v1.8.0
|
github.com/fsnotify/fsnotify v1.8.0
|
||||||
|
|
4
go.sum
4
go.sum
|
@ -106,8 +106,8 @@ github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRr
|
||||||
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
|
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
|
||||||
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
|
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
|
||||||
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
|
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
|
||||||
github.com/envoyproxy/go-control-plane/envoy v1.32.3 h1:hVEaommgvzTjTd4xCaFd+kEQ2iYBtGxP6luyLrx6uOk=
|
github.com/envoyproxy/go-control-plane/envoy v1.32.4 h1:jb83lalDRZSpPWW2Z7Mck/8kXZ5CQAFYVjQcdVIr83A=
|
||||||
github.com/envoyproxy/go-control-plane/envoy v1.32.3/go.mod h1:F6hWupPfh75TBXGKA++MCT/CZHFq5r9/uwt/kQYkZfE=
|
github.com/envoyproxy/go-control-plane/envoy v1.32.4/go.mod h1:Gzjc5k8JcJswLjAx1Zm+wSYE20UrLtt7JZMWiWQXQEw=
|
||||||
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
|
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
|
||||||
github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8=
|
github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8=
|
||||||
github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU=
|
github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU=
|
||||||
|
|
|
@ -107,15 +107,27 @@ func NewGroup(o GroupOptions) *Group {
|
||||||
metrics = NewGroupMetrics(opts.Registerer)
|
metrics = NewGroupMetrics(opts.Registerer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
alertingCount := 0
|
||||||
|
recordingCount := 0
|
||||||
|
for _, rule := range o.Rules {
|
||||||
|
if _, ok := rule.(*AlertingRule); ok {
|
||||||
|
alertingCount++
|
||||||
|
} else {
|
||||||
|
recordingCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
key := GroupKey(o.File, o.Name)
|
key := GroupKey(o.File, o.Name)
|
||||||
metrics.IterationsMissed.WithLabelValues(key)
|
metrics.IterationsMissed.WithLabelValues(key)
|
||||||
metrics.IterationsScheduled.WithLabelValues(key)
|
metrics.IterationsScheduled.WithLabelValues(key)
|
||||||
metrics.EvalTotal.WithLabelValues(key)
|
metrics.EvalTotal.WithLabelValues(key)
|
||||||
metrics.EvalFailures.WithLabelValues(key)
|
metrics.EvalFailures.WithLabelValues(key, KindAlerting)
|
||||||
|
metrics.EvalFailures.WithLabelValues(key, KindRecording)
|
||||||
metrics.GroupLastEvalTime.WithLabelValues(key)
|
metrics.GroupLastEvalTime.WithLabelValues(key)
|
||||||
metrics.GroupLastDuration.WithLabelValues(key)
|
metrics.GroupLastDuration.WithLabelValues(key)
|
||||||
metrics.GroupLastRuleDurationSum.WithLabelValues(key)
|
metrics.GroupLastRuleDurationSum.WithLabelValues(key)
|
||||||
metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules)))
|
metrics.GroupRules.WithLabelValues(key, KindAlerting).Set(float64(alertingCount))
|
||||||
|
metrics.GroupRules.WithLabelValues(key, KindRecording).Set(float64(recordingCount))
|
||||||
metrics.GroupSamples.WithLabelValues(key)
|
metrics.GroupSamples.WithLabelValues(key)
|
||||||
metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds())
|
metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds())
|
||||||
|
|
||||||
|
@ -542,7 +554,12 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
|
||||||
rule.SetHealth(HealthBad)
|
rule.SetHealth(HealthBad)
|
||||||
rule.SetLastError(err)
|
rule.SetLastError(err)
|
||||||
sp.SetStatus(codes.Error, err.Error())
|
sp.SetStatus(codes.Error, err.Error())
|
||||||
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc()
|
|
||||||
|
if _, ok := rule.(*AlertingRule); ok {
|
||||||
|
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindAlerting).Inc()
|
||||||
|
} else {
|
||||||
|
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindRecording).Inc()
|
||||||
|
}
|
||||||
|
|
||||||
// Canceled queries are intentional termination of queries. This normally
|
// Canceled queries are intentional termination of queries. This normally
|
||||||
// happens on shutdown and thus we skip logging of any errors here.
|
// happens on shutdown and thus we skip logging of any errors here.
|
||||||
|
@ -572,7 +589,12 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
|
||||||
rule.SetHealth(HealthBad)
|
rule.SetHealth(HealthBad)
|
||||||
rule.SetLastError(err)
|
rule.SetLastError(err)
|
||||||
sp.SetStatus(codes.Error, err.Error())
|
sp.SetStatus(codes.Error, err.Error())
|
||||||
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc()
|
|
||||||
|
if _, ok := rule.(*AlertingRule); ok {
|
||||||
|
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindAlerting).Inc()
|
||||||
|
} else {
|
||||||
|
g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name()), KindRecording).Inc()
|
||||||
|
}
|
||||||
|
|
||||||
logger.Warn("Rule sample appending failed", "err", err)
|
logger.Warn("Rule sample appending failed", "err", err)
|
||||||
return
|
return
|
||||||
|
@ -974,7 +996,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
|
||||||
Name: "rule_evaluation_failures_total",
|
Name: "rule_evaluation_failures_total",
|
||||||
Help: "The total number of rule evaluation failures.",
|
Help: "The total number of rule evaluation failures.",
|
||||||
},
|
},
|
||||||
[]string{"rule_group"},
|
[]string{"rule_group", "rule_type"},
|
||||||
),
|
),
|
||||||
GroupInterval: prometheus.NewGaugeVec(
|
GroupInterval: prometheus.NewGaugeVec(
|
||||||
prometheus.GaugeOpts{
|
prometheus.GaugeOpts{
|
||||||
|
@ -1022,7 +1044,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
|
||||||
Name: "rule_group_rules",
|
Name: "rule_group_rules",
|
||||||
Help: "The number of rules.",
|
Help: "The number of rules.",
|
||||||
},
|
},
|
||||||
[]string{"rule_group"},
|
[]string{"rule_group", "rule_type"},
|
||||||
),
|
),
|
||||||
GroupSamples: prometheus.NewGaugeVec(
|
GroupSamples: prometheus.NewGaugeVec(
|
||||||
prometheus.GaugeOpts{
|
prometheus.GaugeOpts{
|
||||||
|
|
|
@ -269,11 +269,13 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels
|
||||||
m.IterationsMissed.DeleteLabelValues(n)
|
m.IterationsMissed.DeleteLabelValues(n)
|
||||||
m.IterationsScheduled.DeleteLabelValues(n)
|
m.IterationsScheduled.DeleteLabelValues(n)
|
||||||
m.EvalTotal.DeleteLabelValues(n)
|
m.EvalTotal.DeleteLabelValues(n)
|
||||||
m.EvalFailures.DeleteLabelValues(n)
|
m.EvalFailures.DeleteLabelValues(n, KindAlerting)
|
||||||
|
m.EvalFailures.DeleteLabelValues(n, KindRecording)
|
||||||
m.GroupInterval.DeleteLabelValues(n)
|
m.GroupInterval.DeleteLabelValues(n)
|
||||||
m.GroupLastEvalTime.DeleteLabelValues(n)
|
m.GroupLastEvalTime.DeleteLabelValues(n)
|
||||||
m.GroupLastDuration.DeleteLabelValues(n)
|
m.GroupLastDuration.DeleteLabelValues(n)
|
||||||
m.GroupRules.DeleteLabelValues(n)
|
m.GroupRules.DeleteLabelValues(n, KindAlerting)
|
||||||
|
m.GroupRules.DeleteLabelValues(n, KindRecording)
|
||||||
m.GroupSamples.DeleteLabelValues((n))
|
m.GroupSamples.DeleteLabelValues((n))
|
||||||
}
|
}
|
||||||
wg.Done()
|
wg.Done()
|
||||||
|
|
|
@ -1024,11 +1024,11 @@ func TestMetricsUpdate(t *testing.T) {
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
files: files,
|
files: files,
|
||||||
metrics: 12,
|
metrics: 16,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
files: files[:1],
|
files: files[:1],
|
||||||
metrics: 6,
|
metrics: 8,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
files: files[:0],
|
files: files[:0],
|
||||||
|
@ -1036,7 +1036,7 @@ func TestMetricsUpdate(t *testing.T) {
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
files: files[1:],
|
files: files[1:],
|
||||||
metrics: 6,
|
metrics: 8,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue