mirror of
https://github.com/prometheus/prometheus.git
synced 2025-01-11 05:47:27 -08:00
add rule_group label to rule evaluation metrics (#7094)
Signed-off-by: yeya24 <yb532204897@gmail.com>
This commit is contained in:
parent
9a21fdcd1b
commit
00730bfee7
|
@ -54,11 +54,11 @@ const namespace = "prometheus"
|
|||
// Metrics for rule evaluation.
|
||||
type Metrics struct {
|
||||
evalDuration prometheus.Summary
|
||||
evalFailures prometheus.Counter
|
||||
evalTotal prometheus.Counter
|
||||
iterationDuration prometheus.Summary
|
||||
iterationsMissed prometheus.Counter
|
||||
iterationsScheduled prometheus.Counter
|
||||
evalTotal *prometheus.CounterVec
|
||||
evalFailures *prometheus.CounterVec
|
||||
groupInterval *prometheus.GaugeVec
|
||||
groupLastEvalTime *prometheus.GaugeVec
|
||||
groupLastDuration *prometheus.GaugeVec
|
||||
|
@ -76,18 +76,6 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
|
|||
Help: "The duration for a rule to execute.",
|
||||
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
|
||||
}),
|
||||
evalFailures: prometheus.NewCounter(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Name: "rule_evaluation_failures_total",
|
||||
Help: "The total number of rule evaluation failures.",
|
||||
}),
|
||||
evalTotal: prometheus.NewCounter(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Name: "rule_evaluations_total",
|
||||
Help: "The total number of rule evaluations.",
|
||||
}),
|
||||
iterationDuration: prometheus.NewSummary(prometheus.SummaryOpts{
|
||||
Namespace: namespace,
|
||||
Name: "rule_group_duration_seconds",
|
||||
|
@ -104,6 +92,22 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
|
|||
Name: "rule_group_iterations_total",
|
||||
Help: "The total number of scheduled rule group evaluations, whether executed or missed.",
|
||||
}),
|
||||
evalTotal: prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Name: "rule_evaluations_total",
|
||||
Help: "The total number of rule evaluations.",
|
||||
},
|
||||
[]string{"rule_group"},
|
||||
),
|
||||
evalFailures: prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Name: "rule_evaluation_failures_total",
|
||||
Help: "The total number of rule evaluation failures.",
|
||||
},
|
||||
[]string{"rule_group"},
|
||||
),
|
||||
groupInterval: prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: namespace,
|
||||
|
@ -141,11 +145,11 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
|
|||
if reg != nil {
|
||||
reg.MustRegister(
|
||||
m.evalDuration,
|
||||
m.evalFailures,
|
||||
m.evalTotal,
|
||||
m.iterationDuration,
|
||||
m.iterationsMissed,
|
||||
m.iterationsScheduled,
|
||||
m.evalTotal,
|
||||
m.evalFailures,
|
||||
m.groupInterval,
|
||||
m.groupLastEvalTime,
|
||||
m.groupLastDuration,
|
||||
|
@ -257,10 +261,13 @@ func NewGroup(o GroupOptions) *Group {
|
|||
metrics = NewGroupMetrics(o.Opts.Registerer)
|
||||
}
|
||||
|
||||
metrics.groupLastEvalTime.WithLabelValues(groupKey(o.File, o.Name))
|
||||
metrics.groupLastDuration.WithLabelValues(groupKey(o.File, o.Name))
|
||||
metrics.groupRules.WithLabelValues(groupKey(o.File, o.Name)).Set(float64(len(o.Rules)))
|
||||
metrics.groupInterval.WithLabelValues(groupKey(o.File, o.Name)).Set(o.Interval.Seconds())
|
||||
key := groupKey(o.File, o.Name)
|
||||
metrics.evalTotal.WithLabelValues(key)
|
||||
metrics.evalFailures.WithLabelValues(key)
|
||||
metrics.groupLastEvalTime.WithLabelValues(key)
|
||||
metrics.groupLastDuration.WithLabelValues(key)
|
||||
metrics.groupRules.WithLabelValues(key).Set(float64(len(o.Rules)))
|
||||
metrics.groupInterval.WithLabelValues(key).Set(o.Interval.Seconds())
|
||||
|
||||
return &Group{
|
||||
name: o.Name,
|
||||
|
@ -567,7 +574,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
|
|||
rule.SetEvaluationTimestamp(t)
|
||||
}(time.Now())
|
||||
|
||||
g.metrics.evalTotal.Inc()
|
||||
g.metrics.evalTotal.WithLabelValues(groupKey(g.File(), g.Name())).Inc()
|
||||
|
||||
vector, err := rule.Eval(ctx, ts, g.opts.QueryFunc, g.opts.ExternalURL)
|
||||
if err != nil {
|
||||
|
@ -576,7 +583,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
|
|||
if _, ok := err.(promql.ErrQueryCanceled); !ok {
|
||||
level.Warn(g.logger).Log("msg", "Evaluating rule failed", "rule", rule, "err", err)
|
||||
}
|
||||
g.metrics.evalFailures.Inc()
|
||||
g.metrics.evalFailures.WithLabelValues(groupKey(g.File(), g.Name())).Inc()
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -945,6 +952,8 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels
|
|||
go func(n string, g *Group) {
|
||||
g.stopAndMakeStale()
|
||||
if m := g.metrics; m != nil {
|
||||
m.evalTotal.DeleteLabelValues(n)
|
||||
m.evalFailures.DeleteLabelValues(n)
|
||||
m.groupInterval.DeleteLabelValues(n)
|
||||
m.groupLastEvalTime.DeleteLabelValues(n)
|
||||
m.groupLastDuration.DeleteLabelValues(n)
|
||||
|
|
|
@ -901,6 +901,8 @@ func TestNotify(t *testing.T) {
|
|||
func TestMetricsUpdate(t *testing.T) {
|
||||
files := []string{"fixtures/rules.yaml", "fixtures/rules2.yaml"}
|
||||
metricNames := []string{
|
||||
"prometheus_rule_evaluations_total",
|
||||
"prometheus_rule_evaluation_failures_total",
|
||||
"prometheus_rule_group_interval_seconds",
|
||||
"prometheus_rule_group_last_duration_seconds",
|
||||
"prometheus_rule_group_last_evaluation_timestamp_seconds",
|
||||
|
@ -950,11 +952,11 @@ func TestMetricsUpdate(t *testing.T) {
|
|||
}{
|
||||
{
|
||||
files: files,
|
||||
metrics: 8,
|
||||
metrics: 12,
|
||||
},
|
||||
{
|
||||
files: files[:1],
|
||||
metrics: 4,
|
||||
metrics: 6,
|
||||
},
|
||||
{
|
||||
files: files[:0],
|
||||
|
@ -962,7 +964,7 @@ func TestMetricsUpdate(t *testing.T) {
|
|||
},
|
||||
{
|
||||
files: files[1:],
|
||||
metrics: 4,
|
||||
metrics: 6,
|
||||
},
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue