mirror of
https://github.com/prometheus/prometheus.git
synced 2025-03-05 20:59:13 -08:00
Rule Manager: Add rule_group_last_restore_duration_seconds
to measure restore time per rule group
When a rule group changes or Prometheus is restarted, we need to restore the active alerts that were firing for each corresponding rule. To do that, Prometheus queries the `ALERTS_FOR_STATE` series to retrieve the previous state and restore it. If a given rule has high cardinality (think hundreds of thousands of series), this process can take a while. This is the first of a series of PRs to improve this problem, and I'd like to start by exposing the time it takes to restore a rule group as a gauge. Signed-off-by: gotjosh <josue.abreu@gmail.com>
This commit is contained in:
parent
76b0318ed5
commit
e7219e3d36
|
@ -230,7 +230,9 @@ func (g *Group) run(ctx context.Context) {
|
||||||
g.evalIterationFunc(ctx, g, evalTimestamp)
|
g.evalIterationFunc(ctx, g, evalTimestamp)
|
||||||
}
|
}
|
||||||
|
|
||||||
g.RestoreForState(time.Now())
|
now := time.Now()
|
||||||
|
g.RestoreForState(now)
|
||||||
|
g.metrics.GroupLastRestoreDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(time.Since(now).Seconds())
|
||||||
g.shouldRestore = false
|
g.shouldRestore = false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -779,17 +781,18 @@ const namespace = "prometheus"
|
||||||
|
|
||||||
// Metrics for rule evaluation.
|
// Metrics for rule evaluation.
|
||||||
type Metrics struct {
|
type Metrics struct {
|
||||||
EvalDuration prometheus.Summary
|
EvalDuration prometheus.Summary
|
||||||
IterationDuration prometheus.Summary
|
IterationDuration prometheus.Summary
|
||||||
IterationsMissed *prometheus.CounterVec
|
IterationsMissed *prometheus.CounterVec
|
||||||
IterationsScheduled *prometheus.CounterVec
|
IterationsScheduled *prometheus.CounterVec
|
||||||
EvalTotal *prometheus.CounterVec
|
EvalTotal *prometheus.CounterVec
|
||||||
EvalFailures *prometheus.CounterVec
|
EvalFailures *prometheus.CounterVec
|
||||||
GroupInterval *prometheus.GaugeVec
|
GroupInterval *prometheus.GaugeVec
|
||||||
GroupLastEvalTime *prometheus.GaugeVec
|
GroupLastEvalTime *prometheus.GaugeVec
|
||||||
GroupLastDuration *prometheus.GaugeVec
|
GroupLastDuration *prometheus.GaugeVec
|
||||||
GroupRules *prometheus.GaugeVec
|
GroupLastRestoreDuration *prometheus.GaugeVec
|
||||||
GroupSamples *prometheus.GaugeVec
|
GroupRules *prometheus.GaugeVec
|
||||||
|
GroupSamples *prometheus.GaugeVec
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer,
|
// NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer,
|
||||||
|
@ -865,6 +868,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
|
||||||
},
|
},
|
||||||
[]string{"rule_group"},
|
[]string{"rule_group"},
|
||||||
),
|
),
|
||||||
|
GroupLastRestoreDuration: prometheus.NewGaugeVec(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: namespace,
|
||||||
|
Name: "rule_group_last_restore_duration_seconds",
|
||||||
|
Help: "The duration of the last rule group restoration.",
|
||||||
|
},
|
||||||
|
[]string{"rule_group"},
|
||||||
|
),
|
||||||
GroupRules: prometheus.NewGaugeVec(
|
GroupRules: prometheus.NewGaugeVec(
|
||||||
prometheus.GaugeOpts{
|
prometheus.GaugeOpts{
|
||||||
Namespace: namespace,
|
Namespace: namespace,
|
||||||
|
@ -894,6 +905,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
|
||||||
m.GroupInterval,
|
m.GroupInterval,
|
||||||
m.GroupLastEvalTime,
|
m.GroupLastEvalTime,
|
||||||
m.GroupLastDuration,
|
m.GroupLastDuration,
|
||||||
|
m.GroupLastRestoreDuration,
|
||||||
m.GroupRules,
|
m.GroupRules,
|
||||||
m.GroupSamples,
|
m.GroupSamples,
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in a new issue