Merge pull request #8778 from owen-d/enhancement/expose-rule-metrics

[Enhancement] Expose rule metrics fields
This commit is contained in:
Julien Pivotto 2021-05-01 03:11:36 +02:00 committed by GitHub
commit e69093f8f7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -53,37 +53,37 @@ const namespace = "prometheus"
// Metrics for rule evaluation. // Metrics for rule evaluation.
type Metrics struct { type Metrics struct {
evalDuration prometheus.Summary EvalDuration prometheus.Summary
iterationDuration prometheus.Summary IterationDuration prometheus.Summary
iterationsMissed *prometheus.CounterVec IterationsMissed *prometheus.CounterVec
iterationsScheduled *prometheus.CounterVec IterationsScheduled *prometheus.CounterVec
evalTotal *prometheus.CounterVec EvalTotal *prometheus.CounterVec
evalFailures *prometheus.CounterVec EvalFailures *prometheus.CounterVec
groupInterval *prometheus.GaugeVec GroupInterval *prometheus.GaugeVec
groupLastEvalTime *prometheus.GaugeVec GroupLastEvalTime *prometheus.GaugeVec
groupLastDuration *prometheus.GaugeVec GroupLastDuration *prometheus.GaugeVec
groupRules *prometheus.GaugeVec GroupRules *prometheus.GaugeVec
groupSamples *prometheus.GaugeVec GroupSamples *prometheus.GaugeVec
} }
// NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer, // NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer,
// if not nil. // if not nil.
func NewGroupMetrics(reg prometheus.Registerer) *Metrics { func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
m := &Metrics{ m := &Metrics{
evalDuration: prometheus.NewSummary( EvalDuration: prometheus.NewSummary(
prometheus.SummaryOpts{ prometheus.SummaryOpts{
Namespace: namespace, Namespace: namespace,
Name: "rule_evaluation_duration_seconds", Name: "rule_evaluation_duration_seconds",
Help: "The duration for a rule to execute.", Help: "The duration for a rule to execute.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
}), }),
iterationDuration: prometheus.NewSummary(prometheus.SummaryOpts{ IterationDuration: prometheus.NewSummary(prometheus.SummaryOpts{
Namespace: namespace, Namespace: namespace,
Name: "rule_group_duration_seconds", Name: "rule_group_duration_seconds",
Help: "The duration of rule group evaluations.", Help: "The duration of rule group evaluations.",
Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001}, Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001},
}), }),
iterationsMissed: prometheus.NewCounterVec( IterationsMissed: prometheus.NewCounterVec(
prometheus.CounterOpts{ prometheus.CounterOpts{
Namespace: namespace, Namespace: namespace,
Name: "rule_group_iterations_missed_total", Name: "rule_group_iterations_missed_total",
@ -91,7 +91,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
}, },
[]string{"rule_group"}, []string{"rule_group"},
), ),
iterationsScheduled: prometheus.NewCounterVec( IterationsScheduled: prometheus.NewCounterVec(
prometheus.CounterOpts{ prometheus.CounterOpts{
Namespace: namespace, Namespace: namespace,
Name: "rule_group_iterations_total", Name: "rule_group_iterations_total",
@ -99,7 +99,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
}, },
[]string{"rule_group"}, []string{"rule_group"},
), ),
evalTotal: prometheus.NewCounterVec( EvalTotal: prometheus.NewCounterVec(
prometheus.CounterOpts{ prometheus.CounterOpts{
Namespace: namespace, Namespace: namespace,
Name: "rule_evaluations_total", Name: "rule_evaluations_total",
@ -107,7 +107,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
}, },
[]string{"rule_group"}, []string{"rule_group"},
), ),
evalFailures: prometheus.NewCounterVec( EvalFailures: prometheus.NewCounterVec(
prometheus.CounterOpts{ prometheus.CounterOpts{
Namespace: namespace, Namespace: namespace,
Name: "rule_evaluation_failures_total", Name: "rule_evaluation_failures_total",
@ -115,7 +115,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
}, },
[]string{"rule_group"}, []string{"rule_group"},
), ),
groupInterval: prometheus.NewGaugeVec( GroupInterval: prometheus.NewGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Namespace: namespace, Namespace: namespace,
Name: "rule_group_interval_seconds", Name: "rule_group_interval_seconds",
@ -123,7 +123,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
}, },
[]string{"rule_group"}, []string{"rule_group"},
), ),
groupLastEvalTime: prometheus.NewGaugeVec( GroupLastEvalTime: prometheus.NewGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Namespace: namespace, Namespace: namespace,
Name: "rule_group_last_evaluation_timestamp_seconds", Name: "rule_group_last_evaluation_timestamp_seconds",
@ -131,7 +131,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
}, },
[]string{"rule_group"}, []string{"rule_group"},
), ),
groupLastDuration: prometheus.NewGaugeVec( GroupLastDuration: prometheus.NewGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Namespace: namespace, Namespace: namespace,
Name: "rule_group_last_duration_seconds", Name: "rule_group_last_duration_seconds",
@ -139,7 +139,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
}, },
[]string{"rule_group"}, []string{"rule_group"},
), ),
groupRules: prometheus.NewGaugeVec( GroupRules: prometheus.NewGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Namespace: namespace, Namespace: namespace,
Name: "rule_group_rules", Name: "rule_group_rules",
@ -147,7 +147,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
}, },
[]string{"rule_group"}, []string{"rule_group"},
), ),
groupSamples: prometheus.NewGaugeVec( GroupSamples: prometheus.NewGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Namespace: namespace, Namespace: namespace,
Name: "rule_group_last_evaluation_samples", Name: "rule_group_last_evaluation_samples",
@ -159,17 +159,17 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
if reg != nil { if reg != nil {
reg.MustRegister( reg.MustRegister(
m.evalDuration, m.EvalDuration,
m.iterationDuration, m.IterationDuration,
m.iterationsMissed, m.IterationsMissed,
m.iterationsScheduled, m.IterationsScheduled,
m.evalTotal, m.EvalTotal,
m.evalFailures, m.EvalFailures,
m.groupInterval, m.GroupInterval,
m.groupLastEvalTime, m.GroupLastEvalTime,
m.groupLastDuration, m.GroupLastDuration,
m.groupRules, m.GroupRules,
m.groupSamples, m.GroupSamples,
) )
} }
@ -281,15 +281,15 @@ func NewGroup(o GroupOptions) *Group {
} }
key := GroupKey(o.File, o.Name) key := GroupKey(o.File, o.Name)
metrics.iterationsMissed.WithLabelValues(key) metrics.IterationsMissed.WithLabelValues(key)
metrics.iterationsScheduled.WithLabelValues(key) metrics.IterationsScheduled.WithLabelValues(key)
metrics.evalTotal.WithLabelValues(key) metrics.EvalTotal.WithLabelValues(key)
metrics.evalFailures.WithLabelValues(key) metrics.EvalFailures.WithLabelValues(key)
metrics.groupLastEvalTime.WithLabelValues(key) metrics.GroupLastEvalTime.WithLabelValues(key)
metrics.groupLastDuration.WithLabelValues(key) metrics.GroupLastDuration.WithLabelValues(key)
metrics.groupRules.WithLabelValues(key).Set(float64(len(o.Rules))) metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules)))
metrics.groupSamples.WithLabelValues(key) metrics.GroupSamples.WithLabelValues(key)
metrics.groupInterval.WithLabelValues(key).Set(o.Interval.Seconds()) metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds())
return &Group{ return &Group{
name: o.Name, name: o.Name,
@ -338,13 +338,13 @@ func (g *Group) run(ctx context.Context) {
}) })
iter := func() { iter := func() {
g.metrics.iterationsScheduled.WithLabelValues(GroupKey(g.file, g.name)).Inc() g.metrics.IterationsScheduled.WithLabelValues(GroupKey(g.file, g.name)).Inc()
start := time.Now() start := time.Now()
g.Eval(ctx, evalTimestamp) g.Eval(ctx, evalTimestamp)
timeSinceStart := time.Since(start) timeSinceStart := time.Since(start)
g.metrics.iterationDuration.Observe(timeSinceStart.Seconds()) g.metrics.IterationDuration.Observe(timeSinceStart.Seconds())
g.setEvaluationTime(timeSinceStart) g.setEvaluationTime(timeSinceStart)
g.setLastEvaluation(start) g.setLastEvaluation(start)
} }
@ -390,8 +390,8 @@ func (g *Group) run(ctx context.Context) {
case <-tick.C: case <-tick.C:
missed := (time.Since(evalTimestamp) / g.interval) - 1 missed := (time.Since(evalTimestamp) / g.interval) - 1
if missed > 0 { if missed > 0 {
g.metrics.iterationsMissed.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed)) g.metrics.IterationsMissed.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed))
g.metrics.iterationsScheduled.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed)) g.metrics.IterationsScheduled.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed))
} }
evalTimestamp = evalTimestamp.Add((missed + 1) * g.interval) evalTimestamp = evalTimestamp.Add((missed + 1) * g.interval)
iter() iter()
@ -412,8 +412,8 @@ func (g *Group) run(ctx context.Context) {
case <-tick.C: case <-tick.C:
missed := (time.Since(evalTimestamp) / g.interval) - 1 missed := (time.Since(evalTimestamp) / g.interval) - 1
if missed > 0 { if missed > 0 {
g.metrics.iterationsMissed.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed)) g.metrics.IterationsMissed.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed))
g.metrics.iterationsScheduled.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed)) g.metrics.IterationsScheduled.WithLabelValues(GroupKey(g.file, g.name)).Add(float64(missed))
} }
evalTimestamp = evalTimestamp.Add((missed + 1) * g.interval) evalTimestamp = evalTimestamp.Add((missed + 1) * g.interval)
iter() iter()
@ -476,7 +476,7 @@ func (g *Group) GetEvaluationTime() time.Duration {
// setEvaluationTime sets the time in seconds the last evaluation took. // setEvaluationTime sets the time in seconds the last evaluation took.
func (g *Group) setEvaluationTime(dur time.Duration) { func (g *Group) setEvaluationTime(dur time.Duration) {
g.metrics.groupLastDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(dur.Seconds()) g.metrics.GroupLastDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(dur.Seconds())
g.mtx.Lock() g.mtx.Lock()
defer g.mtx.Unlock() defer g.mtx.Unlock()
@ -492,7 +492,7 @@ func (g *Group) GetLastEvaluation() time.Time {
// setLastEvaluation updates evaluationTimestamp to the timestamp of when the rule group was last evaluated. // setLastEvaluation updates evaluationTimestamp to the timestamp of when the rule group was last evaluated.
func (g *Group) setLastEvaluation(ts time.Time) { func (g *Group) setLastEvaluation(ts time.Time) {
g.metrics.groupLastEvalTime.WithLabelValues(GroupKey(g.file, g.name)).Set(float64(ts.UnixNano()) / 1e9) g.metrics.GroupLastEvalTime.WithLabelValues(GroupKey(g.file, g.name)).Set(float64(ts.UnixNano()) / 1e9)
g.mtx.Lock() g.mtx.Lock()
defer g.mtx.Unlock() defer g.mtx.Unlock()
@ -584,18 +584,18 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
sp.Finish() sp.Finish()
since := time.Since(t) since := time.Since(t)
g.metrics.evalDuration.Observe(since.Seconds()) g.metrics.EvalDuration.Observe(since.Seconds())
rule.SetEvaluationDuration(since) rule.SetEvaluationDuration(since)
rule.SetEvaluationTimestamp(t) rule.SetEvaluationTimestamp(t)
}(time.Now()) }(time.Now())
g.metrics.evalTotal.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() g.metrics.EvalTotal.WithLabelValues(GroupKey(g.File(), g.Name())).Inc()
vector, err := rule.Eval(ctx, ts, g.opts.QueryFunc, g.opts.ExternalURL) vector, err := rule.Eval(ctx, ts, g.opts.QueryFunc, g.opts.ExternalURL)
if err != nil { if err != nil {
rule.SetHealth(HealthBad) rule.SetHealth(HealthBad)
rule.SetLastError(err) rule.SetLastError(err)
g.metrics.evalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc()
// Canceled queries are intentional termination of queries. This normally // Canceled queries are intentional termination of queries. This normally
// happens on shutdown and thus we skip logging of any errors here. // happens on shutdown and thus we skip logging of any errors here.
@ -620,7 +620,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
if err := app.Commit(); err != nil { if err := app.Commit(); err != nil {
rule.SetHealth(HealthBad) rule.SetHealth(HealthBad)
rule.SetLastError(err) rule.SetLastError(err)
g.metrics.evalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() g.metrics.EvalFailures.WithLabelValues(GroupKey(g.File(), g.Name())).Inc()
level.Warn(g.logger).Log("msg", "Rule sample appending failed", "err", err) level.Warn(g.logger).Log("msg", "Rule sample appending failed", "err", err)
return return
@ -671,7 +671,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
}(i, rule) }(i, rule)
} }
if g.metrics != nil { if g.metrics != nil {
g.metrics.groupSamples.WithLabelValues(GroupKey(g.File(), g.Name())).Set(samplesTotal) g.metrics.GroupSamples.WithLabelValues(GroupKey(g.File(), g.Name())).Set(samplesTotal)
} }
g.cleanupStaleSeries(ctx, ts) g.cleanupStaleSeries(ctx, ts)
} }
@ -996,15 +996,15 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels
g.markStale = true g.markStale = true
g.stop() g.stop()
if m := g.metrics; m != nil { if m := g.metrics; m != nil {
m.iterationsMissed.DeleteLabelValues(n) m.IterationsMissed.DeleteLabelValues(n)
m.iterationsScheduled.DeleteLabelValues(n) m.IterationsScheduled.DeleteLabelValues(n)
m.evalTotal.DeleteLabelValues(n) m.EvalTotal.DeleteLabelValues(n)
m.evalFailures.DeleteLabelValues(n) m.EvalFailures.DeleteLabelValues(n)
m.groupInterval.DeleteLabelValues(n) m.GroupInterval.DeleteLabelValues(n)
m.groupLastEvalTime.DeleteLabelValues(n) m.GroupLastEvalTime.DeleteLabelValues(n)
m.groupLastDuration.DeleteLabelValues(n) m.GroupLastDuration.DeleteLabelValues(n)
m.groupRules.DeleteLabelValues(n) m.GroupRules.DeleteLabelValues(n)
m.groupSamples.DeleteLabelValues((n)) m.GroupSamples.DeleteLabelValues((n))
} }
wg.Done() wg.Done()
}(n, oldg) }(n, oldg)