Merge pull request #779 from prometheus/fabxc/rule-cleanup

rule cleanup
This commit is contained in:
Julius Volz 2015-06-08 15:12:47 +02:00
commit 51653e7890
6 changed files with 81 additions and 112 deletions

View file

@ -16,7 +16,6 @@ package rules
import ( import (
"fmt" "fmt"
"html/template" "html/template"
"reflect"
"sync" "sync"
"time" "time"
@ -28,12 +27,12 @@ import (
const ( const (
// AlertMetricName is the metric name for synthetic alert timeseries. // AlertMetricName is the metric name for synthetic alert timeseries.
AlertMetricName clientmodel.LabelValue = "ALERTS" alertMetricName clientmodel.LabelValue = "ALERTS"
// AlertNameLabel is the label name indicating the name of an alert. // AlertNameLabel is the label name indicating the name of an alert.
AlertNameLabel clientmodel.LabelName = "alertname" alertNameLabel clientmodel.LabelName = "alertname"
// AlertStateLabel is the label name indicating the state of an alert. // AlertStateLabel is the label name indicating the state of an alert.
AlertStateLabel clientmodel.LabelName = "alertstate" alertStateLabel clientmodel.LabelName = "alertstate"
) )
// AlertState denotes the state of an active alert. // AlertState denotes the state of an active alert.
@ -41,11 +40,11 @@ type AlertState int
func (s AlertState) String() string { func (s AlertState) String() string {
switch s { switch s {
case Inactive: case StateInactive:
return "inactive" return "inactive"
case Pending: case StatePending:
return "pending" return "pending"
case Firing: case StateFiring:
return "firing" return "firing"
default: default:
panic("undefined") panic("undefined")
@ -54,13 +53,13 @@ func (s AlertState) String() string {
const ( const (
// Inactive alerts are neither firing nor pending. // Inactive alerts are neither firing nor pending.
Inactive AlertState = iota StateInactive AlertState = iota
// Pending alerts have been active for less than the configured // Pending alerts have been active for less than the configured
// threshold duration. // threshold duration.
Pending StatePending
// Firing alerts have been active for longer than the configured // Firing alerts have been active for longer than the configured
// threshold duration. // threshold duration.
Firing StateFiring
) )
// Alert is used to track active (pending/firing) alerts over time. // Alert is used to track active (pending/firing) alerts over time.
@ -84,9 +83,9 @@ func (a Alert) sample(timestamp clientmodel.Timestamp, value clientmodel.SampleV
recordedMetric[label] = value recordedMetric[label] = value
} }
recordedMetric[clientmodel.MetricNameLabel] = AlertMetricName recordedMetric[clientmodel.MetricNameLabel] = alertMetricName
recordedMetric[AlertNameLabel] = clientmodel.LabelValue(a.Name) recordedMetric[alertNameLabel] = clientmodel.LabelValue(a.Name)
recordedMetric[AlertStateLabel] = clientmodel.LabelValue(a.State.String()) recordedMetric[alertStateLabel] = clientmodel.LabelValue(a.State.String())
return &promql.Sample{ return &promql.Sample{
Metric: clientmodel.COWMetric{ Metric: clientmodel.COWMetric{
@ -103,16 +102,16 @@ type AlertingRule struct {
// The name of the alert. // The name of the alert.
name string name string
// The vector expression from which to generate alerts. // The vector expression from which to generate alerts.
Vector promql.Expr vector promql.Expr
// The duration for which a labelset needs to persist in the expression // The duration for which a labelset needs to persist in the expression
// output vector before an alert transitions from Pending to Firing state. // output vector before an alert transitions from Pending to Firing state.
holdDuration time.Duration holdDuration time.Duration
// Extra labels to attach to the resulting alert sample vectors. // Extra labels to attach to the resulting alert sample vectors.
Labels clientmodel.LabelSet labels clientmodel.LabelSet
// Short alert summary, suitable for email subjects. // Short alert summary, suitable for email subjects.
Summary string summary string
// More detailed alert description. // More detailed alert description.
Description string description string
// Protects the below. // Protects the below.
mutex sync.Mutex mutex sync.Mutex
@ -121,24 +120,40 @@ type AlertingRule struct {
activeAlerts map[clientmodel.Fingerprint]*Alert activeAlerts map[clientmodel.Fingerprint]*Alert
} }
// NewAlertingRule constructs a new AlertingRule.
func NewAlertingRule(
name string,
vector promql.Expr,
holdDuration time.Duration,
labels clientmodel.LabelSet,
summary string,
description string,
) *AlertingRule {
return &AlertingRule{
name: name,
vector: vector,
holdDuration: holdDuration,
labels: labels,
summary: summary,
description: description,
activeAlerts: map[clientmodel.Fingerprint]*Alert{},
}
}
// Name returns the name of the alert. // Name returns the name of the alert.
func (rule *AlertingRule) Name() string { func (rule *AlertingRule) Name() string {
return rule.name return rule.name
} }
// EvalRaw returns the raw value of the rule expression, without creating alerts. // eval evaluates the rule expression and then creates pending alerts and fires
func (rule *AlertingRule) EvalRaw(timestamp clientmodel.Timestamp, engine *promql.Engine) (promql.Vector, error) { // or removes previously pending alerts accordingly.
query, err := engine.NewInstantQuery(rule.Vector.String(), timestamp) func (rule *AlertingRule) eval(timestamp clientmodel.Timestamp, engine *promql.Engine) (promql.Vector, error) {
query, err := engine.NewInstantQuery(rule.vector.String(), timestamp)
if err != nil { if err != nil {
return nil, err return nil, err
} }
return query.Exec().Vector() exprResult, err := query.Exec().Vector()
}
// Eval evaluates the rule expression and then creates pending alerts and fires
// or removes previously pending alerts accordingly.
func (rule *AlertingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.Engine) (promql.Vector, error) {
exprResult, err := rule.EvalRaw(timestamp, engine)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -156,14 +171,14 @@ func (rule *AlertingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.E
if alert, ok := rule.activeAlerts[fp]; !ok { if alert, ok := rule.activeAlerts[fp]; !ok {
labels := clientmodel.LabelSet{} labels := clientmodel.LabelSet{}
labels.MergeFromMetric(sample.Metric.Metric) labels.MergeFromMetric(sample.Metric.Metric)
labels = labels.Merge(rule.Labels) labels = labels.Merge(rule.labels)
if _, ok := labels[clientmodel.MetricNameLabel]; ok { if _, ok := labels[clientmodel.MetricNameLabel]; ok {
delete(labels, clientmodel.MetricNameLabel) delete(labels, clientmodel.MetricNameLabel)
} }
rule.activeAlerts[fp] = &Alert{ rule.activeAlerts[fp] = &Alert{
Name: rule.name, Name: rule.name,
Labels: labels, Labels: labels,
State: Pending, State: StatePending,
ActiveSince: timestamp, ActiveSince: timestamp,
Value: sample.Value, Value: sample.Value,
} }
@ -182,9 +197,9 @@ func (rule *AlertingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.E
continue continue
} }
if activeAlert.State == Pending && timestamp.Sub(activeAlert.ActiveSince) >= rule.holdDuration { if activeAlert.State == StatePending && timestamp.Sub(activeAlert.ActiveSince) >= rule.holdDuration {
vector = append(vector, activeAlert.sample(timestamp, 0)) vector = append(vector, activeAlert.sample(timestamp, 0))
activeAlert.State = Firing activeAlert.State = StateFiring
} }
vector = append(vector, activeAlert.sample(timestamp, 1)) vector = append(vector, activeAlert.sample(timestamp, 1))
@ -193,39 +208,24 @@ func (rule *AlertingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.E
return vector, nil return vector, nil
} }
// DotGraph returns the text representation of a dot graph.
func (rule *AlertingRule) DotGraph() string {
graph := fmt.Sprintf(
`digraph "Rules" {
%#p[shape="box",label="ALERT %s IF FOR %s"];
%#p -> %x;
%s
}`,
&rule, rule.name, strutil.DurationToString(rule.holdDuration),
&rule, reflect.ValueOf(rule.Vector).Pointer(),
rule.Vector.DotGraph(),
)
return graph
}
func (rule *AlertingRule) String() string { func (rule *AlertingRule) String() string {
return fmt.Sprintf("ALERT %s IF %s FOR %s WITH %s", rule.name, rule.Vector, strutil.DurationToString(rule.holdDuration), rule.Labels) return fmt.Sprintf("ALERT %s IF %s FOR %s WITH %s", rule.name, rule.vector, strutil.DurationToString(rule.holdDuration), rule.labels)
} }
// HTMLSnippet returns an HTML snippet representing this alerting rule. // HTMLSnippet returns an HTML snippet representing this alerting rule.
func (rule *AlertingRule) HTMLSnippet(pathPrefix string) template.HTML { func (rule *AlertingRule) HTMLSnippet(pathPrefix string) template.HTML {
alertMetric := clientmodel.Metric{ alertMetric := clientmodel.Metric{
clientmodel.MetricNameLabel: AlertMetricName, clientmodel.MetricNameLabel: alertMetricName,
AlertNameLabel: clientmodel.LabelValue(rule.name), alertNameLabel: clientmodel.LabelValue(rule.name),
} }
return template.HTML(fmt.Sprintf( return template.HTML(fmt.Sprintf(
`ALERT <a href="%s">%s</a> IF <a href="%s">%s</a> FOR %s WITH %s`, `ALERT <a href="%s">%s</a> IF <a href="%s">%s</a> FOR %s WITH %s`,
pathPrefix+strutil.GraphLinkForExpression(alertMetric.String()), pathPrefix+strutil.GraphLinkForExpression(alertMetric.String()),
rule.name, rule.name,
pathPrefix+strutil.GraphLinkForExpression(rule.Vector.String()), pathPrefix+strutil.GraphLinkForExpression(rule.vector.String()),
rule.Vector, rule.vector,
strutil.DurationToString(rule.holdDuration), strutil.DurationToString(rule.holdDuration),
rule.Labels)) rule.labels))
} }
// State returns the "maximum" state: firing > pending > inactive. // State returns the "maximum" state: firing > pending > inactive.
@ -233,7 +233,7 @@ func (rule *AlertingRule) State() AlertState {
rule.mutex.Lock() rule.mutex.Lock()
defer rule.mutex.Unlock() defer rule.mutex.Unlock()
maxState := Inactive maxState := StateInactive
for _, activeAlert := range rule.activeAlerts { for _, activeAlert := range rule.activeAlerts {
if activeAlert.State > maxState { if activeAlert.State > maxState {
maxState = activeAlert.State maxState = activeAlert.State
@ -253,17 +253,3 @@ func (rule *AlertingRule) ActiveAlerts() []Alert {
} }
return alerts return alerts
} }
// NewAlertingRule constructs a new AlertingRule.
func NewAlertingRule(name string, vector promql.Expr, holdDuration time.Duration, labels clientmodel.LabelSet, summary string, description string) *AlertingRule {
return &AlertingRule{
name: name,
Vector: vector,
holdDuration: holdDuration,
Labels: labels,
Summary: summary,
Description: description,
activeAlerts: map[clientmodel.Fingerprint]*Alert{},
}
}

View file

@ -39,8 +39,8 @@ const (
namespace = "prometheus" namespace = "prometheus"
ruleTypeLabel = "rule_type" ruleTypeLabel = "rule_type"
alertingRuleType = "alerting" ruleTypeAlerting = "alerting"
recordingRuleType = "recording" ruleTypeRecording = "recording"
) )
var ( var (
@ -173,7 +173,7 @@ func (m *Manager) queueAlertNotifications(rule *AlertingRule, timestamp clientmo
notifications := make(notification.NotificationReqs, 0, len(activeAlerts)) notifications := make(notification.NotificationReqs, 0, len(activeAlerts))
for _, aa := range activeAlerts { for _, aa := range activeAlerts {
if aa.State != Firing { if aa.State != StateFiring {
// BUG: In the future, make AlertManager support pending alerts? // BUG: In the future, make AlertManager support pending alerts?
continue continue
} }
@ -205,15 +205,15 @@ func (m *Manager) queueAlertNotifications(rule *AlertingRule, timestamp clientmo
} }
notifications = append(notifications, &notification.NotificationReq{ notifications = append(notifications, &notification.NotificationReq{
Summary: expand(rule.Summary), Summary: expand(rule.summary),
Description: expand(rule.Description), Description: expand(rule.description),
Labels: aa.Labels.Merge(clientmodel.LabelSet{ Labels: aa.Labels.Merge(clientmodel.LabelSet{
AlertNameLabel: clientmodel.LabelValue(rule.Name()), alertNameLabel: clientmodel.LabelValue(rule.Name()),
}), }),
Value: aa.Value, Value: aa.Value,
ActiveSince: aa.ActiveSince.Time(), ActiveSince: aa.ActiveSince.Time(),
RuleString: rule.String(), RuleString: rule.String(),
GeneratorURL: m.prometheusURL + strings.TrimLeft(strutil.GraphLinkForExpression(rule.Vector.String()), "/"), GeneratorURL: m.prometheusURL + strings.TrimLeft(strutil.GraphLinkForExpression(rule.vector.String()), "/"),
}) })
} }
m.notificationHandler.SubmitReqs(notifications) m.notificationHandler.SubmitReqs(notifications)
@ -235,7 +235,7 @@ func (m *Manager) runIteration() {
defer wg.Done() defer wg.Done()
start := time.Now() start := time.Now()
vector, err := rule.Eval(now, m.queryEngine) vector, err := rule.eval(now, m.queryEngine)
duration := time.Since(start) duration := time.Since(start)
if err != nil { if err != nil {
@ -247,11 +247,11 @@ func (m *Manager) runIteration() {
switch r := rule.(type) { switch r := rule.(type) {
case *AlertingRule: case *AlertingRule:
m.queueAlertNotifications(r, now) m.queueAlertNotifications(r, now)
evalDuration.WithLabelValues(alertingRuleType).Observe( evalDuration.WithLabelValues(ruleTypeAlerting).Observe(
float64(duration / time.Millisecond), float64(duration / time.Millisecond),
) )
case *RecordingRule: case *RecordingRule:
evalDuration.WithLabelValues(recordingRuleType).Observe( evalDuration.WithLabelValues(ruleTypeRecording).Observe(
float64(duration / time.Millisecond), float64(duration / time.Millisecond),
) )
default: default:
@ -319,7 +319,7 @@ func (m *Manager) loadRuleFiles(filenames ...string) error {
rule := NewAlertingRule(r.Name, r.Expr, r.Duration, r.Labels, r.Summary, r.Description) rule := NewAlertingRule(r.Name, r.Expr, r.Duration, r.Labels, r.Summary, r.Description)
m.rules = append(m.rules, rule) m.rules = append(m.rules, rule)
case *promql.RecordStmt: case *promql.RecordStmt:
rule := &RecordingRule{r.Name, r.Expr, r.Labels} rule := NewRecordingRule(r.Name, r.Expr, r.Labels)
m.rules = append(m.rules, rule) m.rules = append(m.rules, rule)
default: default:
panic("retrieval.Manager.LoadRuleFiles: unknown statement type") panic("retrieval.Manager.LoadRuleFiles: unknown statement type")

View file

@ -16,7 +16,6 @@ package rules
import ( import (
"fmt" "fmt"
"html/template" "html/template"
"reflect"
clientmodel "github.com/prometheus/client_golang/model" clientmodel "github.com/prometheus/client_golang/model"
@ -31,21 +30,25 @@ type RecordingRule struct {
labels clientmodel.LabelSet labels clientmodel.LabelSet
} }
// NewRecordingRule returns a new recording rule.
func NewRecordingRule(name string, vector promql.Expr, labels clientmodel.LabelSet) *RecordingRule {
return &RecordingRule{
name: name,
vector: vector,
labels: labels,
}
}
// Name returns the rule name. // Name returns the rule name.
func (rule RecordingRule) Name() string { return rule.name } func (rule RecordingRule) Name() string { return rule.name }
// EvalRaw returns the raw value of the rule expression. // eval evaluates the rule and then overrides the metric names and labels accordingly.
func (rule RecordingRule) EvalRaw(timestamp clientmodel.Timestamp, engine *promql.Engine) (promql.Vector, error) { func (rule RecordingRule) eval(timestamp clientmodel.Timestamp, engine *promql.Engine) (promql.Vector, error) {
query, err := engine.NewInstantQuery(rule.vector.String(), timestamp) query, err := engine.NewInstantQuery(rule.vector.String(), timestamp)
if err != nil { if err != nil {
return nil, err return nil, err
} }
return query.Exec().Vector() vector, err := query.Exec().Vector()
}
// Eval evaluates the rule and then overrides the metric names and labels accordingly.
func (rule RecordingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.Engine) (promql.Vector, error) {
vector, err := rule.EvalRaw(timestamp, engine)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -65,21 +68,6 @@ func (rule RecordingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.E
return vector, nil return vector, nil
} }
// DotGraph returns the text representation of a dot graph.
func (rule RecordingRule) DotGraph() string {
graph := fmt.Sprintf(
`digraph "Rules" {
%#p[shape="box",label="%s = "];
%#p -> %x;
%s
}`,
&rule, rule.name,
&rule, reflect.ValueOf(rule.vector).Pointer(),
rule.vector.DotGraph(),
)
return graph
}
func (rule RecordingRule) String() string { func (rule RecordingRule) String() string {
return fmt.Sprintf("%s%s = %s\n", rule.name, rule.labels, rule.vector) return fmt.Sprintf("%s%s = %s\n", rule.name, rule.labels, rule.vector)
} }

View file

@ -26,13 +26,8 @@ import (
type Rule interface { type Rule interface {
// Name returns the name of the rule. // Name returns the name of the rule.
Name() string Name() string
// EvalRaw evaluates the rule's vector expression without triggering any
// other actions, like recording or alerting.
EvalRaw(clientmodel.Timestamp, *promql.Engine) (promql.Vector, error)
// Eval evaluates the rule, including any associated recording or alerting actions. // Eval evaluates the rule, including any associated recording or alerting actions.
Eval(clientmodel.Timestamp, *promql.Engine) (promql.Vector, error) eval(clientmodel.Timestamp, *promql.Engine) (promql.Vector, error)
// DotGraph returns a Graphviz dot graph of the rule.
DotGraph() string
// String returns a human-readable string representation of the rule. // String returns a human-readable string representation of the rule.
String() string String() string
// HTMLSnippet returns a human-readable string representation of the rule, // HTMLSnippet returns a human-readable string representation of the rule,

View file

@ -186,7 +186,7 @@ func TestAlertingRule(t *testing.T) {
for i, expectedLines := range evalOutputs { for i, expectedLines := range evalOutputs {
evalTime := testStartTime.Add(testSampleInterval * time.Duration(i)) evalTime := testStartTime.Add(testSampleInterval * time.Duration(i))
res, err := rule.Eval(evalTime, engine) res, err := rule.eval(evalTime, engine)
if err != nil { if err != nil {
t.Fatalf("Error during alerting rule evaluation: %s", err) t.Fatalf("Error during alerting rule evaluation: %s", err)
} }

View file

@ -63,9 +63,9 @@ func (h *AlertsHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
alertStatus := AlertStatus{ alertStatus := AlertStatus{
AlertingRules: alertsSorter.alerts, AlertingRules: alertsSorter.alerts,
AlertStateToRowClass: map[rules.AlertState]string{ AlertStateToRowClass: map[rules.AlertState]string{
rules.Inactive: "success", rules.StateInactive: "success",
rules.Pending: "warning", rules.StatePending: "warning",
rules.Firing: "danger", rules.StateFiring: "danger",
}, },
} }
executeTemplate(w, "alerts", alertStatus, h.PathPrefix) executeTemplate(w, "alerts", alertStatus, h.PathPrefix)