add rule_group label to rule evaluation metrics (#7094)

Signed-off-by: yeya24 <yb532204897@gmail.com>
This commit is contained in:
Ben Ye 2020-04-08 17:21:37 -04:00 committed by GitHub
parent 9a21fdcd1b
commit 00730bfee7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 36 additions and 25 deletions

View file

@ -54,11 +54,11 @@ const namespace = "prometheus"
// Metrics for rule evaluation. // Metrics for rule evaluation.
type Metrics struct { type Metrics struct {
evalDuration prometheus.Summary evalDuration prometheus.Summary
evalFailures prometheus.Counter
evalTotal prometheus.Counter
iterationDuration prometheus.Summary iterationDuration prometheus.Summary
iterationsMissed prometheus.Counter iterationsMissed prometheus.Counter
iterationsScheduled prometheus.Counter iterationsScheduled prometheus.Counter
evalTotal *prometheus.CounterVec
evalFailures *prometheus.CounterVec
groupInterval *prometheus.GaugeVec groupInterval *prometheus.GaugeVec
groupLastEvalTime *prometheus.GaugeVec groupLastEvalTime *prometheus.GaugeVec
groupLastDuration *prometheus.GaugeVec groupLastDuration *prometheus.GaugeVec
@ -76,18 +76,6 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
Help: "The duration for a rule to execute.", Help: "The duration for a rule to execute.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
}), }),
evalFailures: prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: namespace,
Name: "rule_evaluation_failures_total",
Help: "The total number of rule evaluation failures.",
}),
evalTotal: prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: namespace,
Name: "rule_evaluations_total",
Help: "The total number of rule evaluations.",
}),
iterationDuration: prometheus.NewSummary(prometheus.SummaryOpts{ iterationDuration: prometheus.NewSummary(prometheus.SummaryOpts{
Namespace: namespace, Namespace: namespace,
Name: "rule_group_duration_seconds", Name: "rule_group_duration_seconds",
@ -104,6 +92,22 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
Name: "rule_group_iterations_total", Name: "rule_group_iterations_total",
Help: "The total number of scheduled rule group evaluations, whether executed or missed.", Help: "The total number of scheduled rule group evaluations, whether executed or missed.",
}), }),
evalTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Name: "rule_evaluations_total",
Help: "The total number of rule evaluations.",
},
[]string{"rule_group"},
),
evalFailures: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Name: "rule_evaluation_failures_total",
Help: "The total number of rule evaluation failures.",
},
[]string{"rule_group"},
),
groupInterval: prometheus.NewGaugeVec( groupInterval: prometheus.NewGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Namespace: namespace, Namespace: namespace,
@ -141,11 +145,11 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
if reg != nil { if reg != nil {
reg.MustRegister( reg.MustRegister(
m.evalDuration, m.evalDuration,
m.evalFailures,
m.evalTotal,
m.iterationDuration, m.iterationDuration,
m.iterationsMissed, m.iterationsMissed,
m.iterationsScheduled, m.iterationsScheduled,
m.evalTotal,
m.evalFailures,
m.groupInterval, m.groupInterval,
m.groupLastEvalTime, m.groupLastEvalTime,
m.groupLastDuration, m.groupLastDuration,
@ -257,10 +261,13 @@ func NewGroup(o GroupOptions) *Group {
metrics = NewGroupMetrics(o.Opts.Registerer) metrics = NewGroupMetrics(o.Opts.Registerer)
} }
metrics.groupLastEvalTime.WithLabelValues(groupKey(o.File, o.Name)) key := groupKey(o.File, o.Name)
metrics.groupLastDuration.WithLabelValues(groupKey(o.File, o.Name)) metrics.evalTotal.WithLabelValues(key)
metrics.groupRules.WithLabelValues(groupKey(o.File, o.Name)).Set(float64(len(o.Rules))) metrics.evalFailures.WithLabelValues(key)
metrics.groupInterval.WithLabelValues(groupKey(o.File, o.Name)).Set(o.Interval.Seconds()) metrics.groupLastEvalTime.WithLabelValues(key)
metrics.groupLastDuration.WithLabelValues(key)
metrics.groupRules.WithLabelValues(key).Set(float64(len(o.Rules)))
metrics.groupInterval.WithLabelValues(key).Set(o.Interval.Seconds())
return &Group{ return &Group{
name: o.Name, name: o.Name,
@ -567,7 +574,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
rule.SetEvaluationTimestamp(t) rule.SetEvaluationTimestamp(t)
}(time.Now()) }(time.Now())
g.metrics.evalTotal.Inc() g.metrics.evalTotal.WithLabelValues(groupKey(g.File(), g.Name())).Inc()
vector, err := rule.Eval(ctx, ts, g.opts.QueryFunc, g.opts.ExternalURL) vector, err := rule.Eval(ctx, ts, g.opts.QueryFunc, g.opts.ExternalURL)
if err != nil { if err != nil {
@ -576,7 +583,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
if _, ok := err.(promql.ErrQueryCanceled); !ok { if _, ok := err.(promql.ErrQueryCanceled); !ok {
level.Warn(g.logger).Log("msg", "Evaluating rule failed", "rule", rule, "err", err) level.Warn(g.logger).Log("msg", "Evaluating rule failed", "rule", rule, "err", err)
} }
g.metrics.evalFailures.Inc() g.metrics.evalFailures.WithLabelValues(groupKey(g.File(), g.Name())).Inc()
return return
} }
@ -945,6 +952,8 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels
go func(n string, g *Group) { go func(n string, g *Group) {
g.stopAndMakeStale() g.stopAndMakeStale()
if m := g.metrics; m != nil { if m := g.metrics; m != nil {
m.evalTotal.DeleteLabelValues(n)
m.evalFailures.DeleteLabelValues(n)
m.groupInterval.DeleteLabelValues(n) m.groupInterval.DeleteLabelValues(n)
m.groupLastEvalTime.DeleteLabelValues(n) m.groupLastEvalTime.DeleteLabelValues(n)
m.groupLastDuration.DeleteLabelValues(n) m.groupLastDuration.DeleteLabelValues(n)

View file

@ -901,6 +901,8 @@ func TestNotify(t *testing.T) {
func TestMetricsUpdate(t *testing.T) { func TestMetricsUpdate(t *testing.T) {
files := []string{"fixtures/rules.yaml", "fixtures/rules2.yaml"} files := []string{"fixtures/rules.yaml", "fixtures/rules2.yaml"}
metricNames := []string{ metricNames := []string{
"prometheus_rule_evaluations_total",
"prometheus_rule_evaluation_failures_total",
"prometheus_rule_group_interval_seconds", "prometheus_rule_group_interval_seconds",
"prometheus_rule_group_last_duration_seconds", "prometheus_rule_group_last_duration_seconds",
"prometheus_rule_group_last_evaluation_timestamp_seconds", "prometheus_rule_group_last_evaluation_timestamp_seconds",
@ -950,11 +952,11 @@ func TestMetricsUpdate(t *testing.T) {
}{ }{
{ {
files: files, files: files,
metrics: 8, metrics: 12,
}, },
{ {
files: files[:1], files: files[:1],
metrics: 4, metrics: 6,
}, },
{ {
files: files[:0], files: files[:0],
@ -962,7 +964,7 @@ func TestMetricsUpdate(t *testing.T) {
}, },
{ {
files: files[1:], files: files[1:],
metrics: 4, metrics: 6,
}, },
} }