mirror of
https://github.com/prometheus/prometheus.git
synced 2025-01-26 13:11:11 -08:00
rules: Add new RuleEvaluationTimeSum
field to groups (#15672)
Some checks are pending
buf.build / lint and publish (push) Waiting to run
CI / Go tests (push) Waiting to run
CI / More Go tests (push) Waiting to run
CI / Go tests with previous Go version (push) Waiting to run
CI / UI tests (push) Waiting to run
CI / Go tests on Windows (push) Waiting to run
CI / Mixins tests (push) Waiting to run
CI / Build Prometheus for common architectures (0) (push) Waiting to run
CI / Build Prometheus for common architectures (1) (push) Waiting to run
CI / Build Prometheus for common architectures (2) (push) Waiting to run
CI / Build Prometheus for all architectures (0) (push) Waiting to run
CI / Build Prometheus for all architectures (1) (push) Waiting to run
CI / Build Prometheus for all architectures (10) (push) Waiting to run
CI / Build Prometheus for all architectures (11) (push) Waiting to run
CI / Build Prometheus for all architectures (2) (push) Waiting to run
CI / Build Prometheus for all architectures (3) (push) Waiting to run
CI / Build Prometheus for all architectures (4) (push) Waiting to run
CI / Build Prometheus for all architectures (5) (push) Waiting to run
CI / Build Prometheus for all architectures (6) (push) Waiting to run
CI / Build Prometheus for all architectures (7) (push) Waiting to run
CI / Build Prometheus for all architectures (8) (push) Waiting to run
CI / Build Prometheus for all architectures (9) (push) Waiting to run
CI / Report status of build Prometheus for all architectures (push) Blocked by required conditions
CI / Check generated parser (push) Waiting to run
CI / golangci-lint (push) Waiting to run
CI / fuzzing (push) Waiting to run
CI / codeql (push) Waiting to run
CI / Publish main branch artifacts (push) Blocked by required conditions
CI / Publish release artefacts (push) Blocked by required conditions
CI / Publish UI on npm Registry (push) Blocked by required conditions
Scorecards supply-chain security / Scorecards analysis (push) Waiting to run
Some checks are pending
buf.build / lint and publish (push) Waiting to run
CI / Go tests (push) Waiting to run
CI / More Go tests (push) Waiting to run
CI / Go tests with previous Go version (push) Waiting to run
CI / UI tests (push) Waiting to run
CI / Go tests on Windows (push) Waiting to run
CI / Mixins tests (push) Waiting to run
CI / Build Prometheus for common architectures (0) (push) Waiting to run
CI / Build Prometheus for common architectures (1) (push) Waiting to run
CI / Build Prometheus for common architectures (2) (push) Waiting to run
CI / Build Prometheus for all architectures (0) (push) Waiting to run
CI / Build Prometheus for all architectures (1) (push) Waiting to run
CI / Build Prometheus for all architectures (10) (push) Waiting to run
CI / Build Prometheus for all architectures (11) (push) Waiting to run
CI / Build Prometheus for all architectures (2) (push) Waiting to run
CI / Build Prometheus for all architectures (3) (push) Waiting to run
CI / Build Prometheus for all architectures (4) (push) Waiting to run
CI / Build Prometheus for all architectures (5) (push) Waiting to run
CI / Build Prometheus for all architectures (6) (push) Waiting to run
CI / Build Prometheus for all architectures (7) (push) Waiting to run
CI / Build Prometheus for all architectures (8) (push) Waiting to run
CI / Build Prometheus for all architectures (9) (push) Waiting to run
CI / Report status of build Prometheus for all architectures (push) Blocked by required conditions
CI / Check generated parser (push) Waiting to run
CI / golangci-lint (push) Waiting to run
CI / fuzzing (push) Waiting to run
CI / codeql (push) Waiting to run
CI / Publish main branch artifacts (push) Blocked by required conditions
CI / Publish release artefacts (push) Blocked by required conditions
CI / Publish UI on npm Registry (push) Blocked by required conditions
Scorecards supply-chain security / Scorecards analysis (push) Waiting to run
* feat(ruler): Add new `RuleEvaluationTimeSum` field to groups Coupled with a metric: `rule_group_last_rule_duration_sum_seconds` This will give us more observability into how fast a group runs with or without concurrency Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com> * Update rules/group.go Co-authored-by: gotjosh <josue.abreu@gmail.com> Signed-off-by: Julien Duchesne <julienduchesne@live.com> Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com> * Apply suggestions from code review Co-authored-by: gotjosh <josue.abreu@gmail.com> Signed-off-by: Julien Duchesne <julienduchesne@live.com> Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com> * Remove `in seconds`. A duration is a duration Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com> --------- Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com> Signed-off-by: Julien Duchesne <julienduchesne@live.com> Co-authored-by: gotjosh <josue.abreu@gmail.com>
This commit is contained in:
parent
7802ca263d
commit
e2f037e554
|
@ -44,19 +44,20 @@ import (
|
|||
|
||||
// Group is a set of rules that have a logical relation.
|
||||
type Group struct {
|
||||
name string
|
||||
file string
|
||||
interval time.Duration
|
||||
queryOffset *time.Duration
|
||||
limit int
|
||||
rules []Rule
|
||||
seriesInPreviousEval []map[string]labels.Labels // One per Rule.
|
||||
staleSeries []labels.Labels
|
||||
opts *ManagerOptions
|
||||
mtx sync.Mutex
|
||||
evaluationTime time.Duration
|
||||
lastEvaluation time.Time // Wall-clock time of most recent evaluation.
|
||||
lastEvalTimestamp time.Time // Time slot used for most recent evaluation.
|
||||
name string
|
||||
file string
|
||||
interval time.Duration
|
||||
queryOffset *time.Duration
|
||||
limit int
|
||||
rules []Rule
|
||||
seriesInPreviousEval []map[string]labels.Labels // One per Rule.
|
||||
staleSeries []labels.Labels
|
||||
opts *ManagerOptions
|
||||
mtx sync.Mutex
|
||||
evaluationTime time.Duration // Time it took to evaluate the group.
|
||||
evaluationRuleTimeSum time.Duration // Sum of time it took to evaluate each rule in the group.
|
||||
lastEvaluation time.Time // Wall-clock time of most recent evaluation.
|
||||
lastEvalTimestamp time.Time // Time slot used for most recent evaluation.
|
||||
|
||||
shouldRestore bool
|
||||
|
||||
|
@ -115,6 +116,7 @@ func NewGroup(o GroupOptions) *Group {
|
|||
metrics.EvalFailures.WithLabelValues(key)
|
||||
metrics.GroupLastEvalTime.WithLabelValues(key)
|
||||
metrics.GroupLastDuration.WithLabelValues(key)
|
||||
metrics.GroupLastRuleDurationSum.WithLabelValues(key)
|
||||
metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules)))
|
||||
metrics.GroupSamples.WithLabelValues(key)
|
||||
metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds())
|
||||
|
@ -370,6 +372,28 @@ func (g *Group) setEvaluationTime(dur time.Duration) {
|
|||
g.evaluationTime = dur
|
||||
}
|
||||
|
||||
// GetRuleEvaluationTimeSum returns the sum of the time it took to evaluate each rule in the group irrespective of concurrency.
|
||||
func (g *Group) GetRuleEvaluationTimeSum() time.Duration {
|
||||
g.mtx.Lock()
|
||||
defer g.mtx.Unlock()
|
||||
return g.evaluationRuleTimeSum
|
||||
}
|
||||
|
||||
// updateRuleEvaluationTimeSum updates evaluationRuleTimeSum which is the sum of the time it took to evaluate each rule in the group irrespective of concurrency.
|
||||
// It collects the times from the rules themselves.
|
||||
func (g *Group) updateRuleEvaluationTimeSum() {
|
||||
var sum time.Duration
|
||||
for _, rule := range g.rules {
|
||||
sum += rule.GetEvaluationDuration()
|
||||
}
|
||||
|
||||
g.metrics.GroupLastRuleDurationSum.WithLabelValues(GroupKey(g.file, g.name)).Set(sum.Seconds())
|
||||
|
||||
g.mtx.Lock()
|
||||
defer g.mtx.Unlock()
|
||||
g.evaluationRuleTimeSum = sum
|
||||
}
|
||||
|
||||
// GetLastEvaluation returns the time the last evaluation of the rule group took place.
|
||||
func (g *Group) GetLastEvaluation() time.Time {
|
||||
g.mtx.Lock()
|
||||
|
@ -874,6 +898,7 @@ type Metrics struct {
|
|||
GroupInterval *prometheus.GaugeVec
|
||||
GroupLastEvalTime *prometheus.GaugeVec
|
||||
GroupLastDuration *prometheus.GaugeVec
|
||||
GroupLastRuleDurationSum *prometheus.GaugeVec
|
||||
GroupLastRestoreDuration *prometheus.GaugeVec
|
||||
GroupRules *prometheus.GaugeVec
|
||||
GroupSamples *prometheus.GaugeVec
|
||||
|
@ -952,6 +977,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
|
|||
},
|
||||
[]string{"rule_group"},
|
||||
),
|
||||
GroupLastRuleDurationSum: prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: namespace,
|
||||
Name: "rule_group_last_rule_duration_sum_seconds",
|
||||
Help: "The sum of time in seconds it took to evaluate each rule in the group regardless of concurrency. This should be higher than the group duration if rules are evaluated concurrently.",
|
||||
},
|
||||
[]string{"rule_group"},
|
||||
),
|
||||
GroupLastRestoreDuration: prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: namespace,
|
||||
|
@ -989,6 +1022,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
|
|||
m.GroupInterval,
|
||||
m.GroupLastEvalTime,
|
||||
m.GroupLastDuration,
|
||||
m.GroupLastRuleDurationSum,
|
||||
m.GroupLastRestoreDuration,
|
||||
m.GroupRules,
|
||||
m.GroupSamples,
|
||||
|
|
|
@ -82,6 +82,7 @@ func DefaultEvalIterationFunc(ctx context.Context, g *Group, evalTimestamp time.
|
|||
timeSinceStart := time.Since(start)
|
||||
|
||||
g.metrics.IterationDuration.Observe(timeSinceStart.Seconds())
|
||||
g.updateRuleEvaluationTimeSum()
|
||||
g.setEvaluationTime(timeSinceStart)
|
||||
g.setLastEvaluation(start)
|
||||
g.setLastEvalTimestamp(evalTimestamp)
|
||||
|
|
|
@ -1985,7 +1985,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
|
|||
require.Len(t, group.rules, ruleCount)
|
||||
|
||||
start := time.Now()
|
||||
group.Eval(ctx, start)
|
||||
DefaultEvalIterationFunc(ctx, group, start)
|
||||
|
||||
// Never expect more than 1 inflight query at a time.
|
||||
require.EqualValues(t, 1, maxInflight.Load())
|
||||
|
@ -1993,6 +1993,8 @@ func TestAsyncRuleEvaluation(t *testing.T) {
|
|||
require.GreaterOrEqual(t, time.Since(start).Seconds(), (time.Duration(ruleCount) * artificialDelay).Seconds())
|
||||
// Each rule produces one vector.
|
||||
require.EqualValues(t, ruleCount, testutil.ToFloat64(group.metrics.GroupSamples))
|
||||
// Group duration is higher than the sum of rule durations (group overhead).
|
||||
require.GreaterOrEqual(t, group.GetEvaluationTime(), group.GetRuleEvaluationTimeSum())
|
||||
}
|
||||
})
|
||||
|
||||
|
@ -2023,7 +2025,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
|
|||
require.Len(t, group.rules, ruleCount)
|
||||
|
||||
start := time.Now()
|
||||
group.Eval(ctx, start)
|
||||
DefaultEvalIterationFunc(ctx, group, start)
|
||||
|
||||
// Max inflight can be 1 synchronous eval and up to MaxConcurrentEvals concurrent evals.
|
||||
require.EqualValues(t, opts.MaxConcurrentEvals+1, maxInflight.Load())
|
||||
|
@ -2061,7 +2063,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
|
|||
require.Len(t, group.rules, ruleCount)
|
||||
|
||||
start := time.Now()
|
||||
group.Eval(ctx, start)
|
||||
DefaultEvalIterationFunc(ctx, group, start)
|
||||
|
||||
// Max inflight can be 1 synchronous eval and up to MaxConcurrentEvals concurrent evals.
|
||||
require.EqualValues(t, opts.MaxConcurrentEvals+1, maxInflight.Load())
|
||||
|
@ -2100,7 +2102,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
|
|||
|
||||
start := time.Now()
|
||||
|
||||
group.Eval(ctx, start)
|
||||
DefaultEvalIterationFunc(ctx, group, start)
|
||||
|
||||
// Max inflight can be up to MaxConcurrentEvals concurrent evals, since there is sufficient concurrency to run all rules at once.
|
||||
require.LessOrEqual(t, int64(maxInflight.Load()), opts.MaxConcurrentEvals)
|
||||
|
@ -2108,6 +2110,8 @@ func TestAsyncRuleEvaluation(t *testing.T) {
|
|||
require.Less(t, time.Since(start).Seconds(), (time.Duration(ruleCount) * artificialDelay).Seconds())
|
||||
// Each rule produces one vector.
|
||||
require.EqualValues(t, ruleCount, testutil.ToFloat64(group.metrics.GroupSamples))
|
||||
// Group duration is less than the sum of rule durations
|
||||
require.Less(t, group.GetEvaluationTime(), group.GetRuleEvaluationTimeSum())
|
||||
}
|
||||
})
|
||||
|
||||
|
|
Loading…
Reference in a new issue