diff --git a/rules/manager.go b/rules/manager.go
index e585bf7c3..8cca54e91 100644
--- a/rules/manager.go
+++ b/rules/manager.go
@@ -14,6 +14,7 @@
 package rules
 
 import (
+	"fmt"
 	"sync"
 	"time"
 
@@ -89,7 +90,7 @@ func (m *ruleManager) Run() {
 		case <-ticker.C:
 			start := time.Now()
 			m.runIteration(m.results)
-			evalDurations.Add(map[string]string{intervalKey: m.interval.String()}, float64(time.Since(start)/time.Millisecond))
+			iterationDuration.Add(map[string]string{intervalLabel: m.interval.String()}, float64(time.Since(start)/time.Millisecond))
 		case <-m.done:
 			glog.Info("Rule manager exiting...")
 			return
@@ -146,7 +147,11 @@ func (m *ruleManager) runIteration(results chan<- *extraction.Result) {
 		// BUG(julius): Look at fixing thundering herd.
 		go func(rule Rule) {
 			defer wg.Done()
+
+			start := time.Now()
 			vector, err := rule.Eval(now, m.storage)
+			duration := time.Since(start)
+
 			samples := make(clientmodel.Samples, len(vector))
 			copy(samples, vector)
 			m.results <- &extraction.Result{
@@ -154,8 +159,14 @@ func (m *ruleManager) runIteration(results chan<- *extraction.Result) {
 				Err:     err,
 			}
 
-			if alertingRule, ok := rule.(*AlertingRule); ok {
-				m.queueAlertNotifications(alertingRule)
+			switch r := rule.(type) {
+			case *AlertingRule:
+				m.queueAlertNotifications(r)
+				recordOutcome(alertingRuleType, duration)
+			case *RecordingRule:
+				recordOutcome(recordingRuleType, duration)
+			default:
+				panic(fmt.Sprintf("Unknown rule type: %T", rule))
 			}
 		}(rule)
 	}
diff --git a/rules/telemetry.go b/rules/telemetry.go
index fdb440c70..c644eb75f 100644
--- a/rules/telemetry.go
+++ b/rules/telemetry.go
@@ -14,21 +14,39 @@
 package rules
 
 import (
+	"time"
+
 	"github.com/prometheus/client_golang/prometheus"
 )
 
 const (
-	intervalKey = "interval"
+	intervalLabel     = "interval"
+	ruleTypeLabel     = "rule_type"
+	alertingRuleType  = "alerting"
+	recordingRuleType = "recording"
 )
 
 var (
-	evalDurations = prometheus.NewHistogram(&prometheus.HistogramSpecification{
+	evalDuration      = prometheus.NewDefaultHistogram()
+	evalCount         = prometheus.NewCounter()
+	iterationDuration = prometheus.NewHistogram(&prometheus.HistogramSpecification{
 		Starts:                prometheus.LogarithmicSizedBucketsFor(0, 10000),
 		BucketBuilder:         prometheus.AccumulatingBucketBuilder(prometheus.EvictAndReplaceWith(10, prometheus.AverageReducer), 100),
 		ReportablePercentiles: []float64{0.01, 0.05, 0.5, 0.90, 0.99}})
-	evalDuration = prometheus.NewCounter()
 )
 
-func init() {
-	prometheus.Register("prometheus_evaluator_duration_ms", "The duration for each evaluation pool to execute.", prometheus.NilLabels, evalDurations)
+// recordOutcome increments the evaluation counter for the given rule type and
+// adds the evaluation's duration, in milliseconds, to the duration histogram.
+func recordOutcome(ruleType string, duration time.Duration) {
+	// Divide as floats: integer Duration division would truncate every
+	// sub-millisecond evaluation to 0.
+	millisecondDuration := float64(duration) / float64(time.Millisecond)
+	evalCount.Increment(map[string]string{ruleTypeLabel: ruleType})
+	evalDuration.Add(map[string]string{ruleTypeLabel: ruleType}, millisecondDuration)
+}
+
+func init() {
+	prometheus.Register("prometheus_evaluator_duration_ms", "The duration for each evaluation pool to execute.", prometheus.NilLabels, iterationDuration)
+	prometheus.Register("prometheus_rule_evaluation_duration_ms", "The duration for a rule to execute.", prometheus.NilLabels, evalDuration)
+	prometheus.Register("prometheus_rule_evaluation_count", "The number of rules evaluated.", prometheus.NilLabels, evalCount)
 }