General cleanup of rules.

This commit is contained in:
Fabian Reinartz 2015-05-25 21:16:32 +02:00
parent 75c920c95e
commit 5e13880201
6 changed files with 78 additions and 64 deletions

View file

@ -16,7 +16,6 @@ package rules
import (
"fmt"
"html/template"
"strings"
"sync"
"time"
@ -28,12 +27,12 @@ import (
const (
// alertMetricName is the metric name for synthetic alert timeseries.
AlertMetricName clientmodel.LabelValue = "ALERTS"
alertMetricName clientmodel.LabelValue = "ALERTS"
// alertNameLabel is the label name indicating the name of an alert.
AlertNameLabel clientmodel.LabelName = "alertname"
alertNameLabel clientmodel.LabelName = "alertname"
// alertStateLabel is the label name indicating the state of an alert.
AlertStateLabel clientmodel.LabelName = "alertstate"
alertStateLabel clientmodel.LabelName = "alertstate"
)
// AlertState denotes the state of an active alert.
@ -41,11 +40,11 @@ type AlertState int
func (s AlertState) String() string {
switch s {
case Inactive:
case StateInactive:
return "inactive"
case Pending:
case StatePending:
return "pending"
case Firing:
case StateFiring:
return "firing"
default:
panic("undefined")
@ -54,13 +53,13 @@ func (s AlertState) String() string {
const (
// Inactive alerts are neither firing nor pending.
Inactive AlertState = iota
StateInactive AlertState = iota
// Pending alerts have been active for less than the configured
// threshold duration.
Pending
StatePending
// Firing alerts have been active for longer than the configured
// threshold duration.
Firing
StateFiring
)
// Alert is used to track active (pending/firing) alerts over time.
@ -84,9 +83,9 @@ func (a Alert) sample(timestamp clientmodel.Timestamp, value clientmodel.SampleV
recordedMetric[label] = value
}
recordedMetric[clientmodel.MetricNameLabel] = AlertMetricName
recordedMetric[AlertNameLabel] = clientmodel.LabelValue(a.Name)
recordedMetric[AlertStateLabel] = clientmodel.LabelValue(a.State.String())
recordedMetric[clientmodel.MetricNameLabel] = alertMetricName
recordedMetric[alertNameLabel] = clientmodel.LabelValue(a.Name)
recordedMetric[alertStateLabel] = clientmodel.LabelValue(a.State.String())
return &promql.Sample{
Metric: clientmodel.COWMetric{
@ -103,16 +102,16 @@ type AlertingRule struct {
// The name of the alert.
name string
// The vector expression from which to generate alerts.
Vector promql.Expr
vector promql.Expr
// The duration for which a labelset needs to persist in the expression
// output vector before an alert transitions from Pending to Firing state.
holdDuration time.Duration
// Extra labels to attach to the resulting alert sample vectors.
Labels clientmodel.LabelSet
labels clientmodel.LabelSet
// Short alert summary, suitable for email subjects.
Summary string
summary string
// More detailed alert description.
Description string
description string
// Protects the below.
mutex sync.Mutex
@ -121,15 +120,36 @@ type AlertingRule struct {
activeAlerts map[clientmodel.Fingerprint]*Alert
}
// NewAlertingRule returns an AlertingRule populated from the given name,
// vector expression, hold duration, extra labels, summary and description.
// The returned rule starts out with an empty set of active alerts.
func NewAlertingRule(
	name string,
	vector promql.Expr,
	holdDuration time.Duration,
	labels clientmodel.LabelSet,
	summary string,
	description string,
) *AlertingRule {
	rule := new(AlertingRule)
	rule.name = name
	rule.vector = vector
	rule.holdDuration = holdDuration
	rule.labels = labels
	rule.summary = summary
	rule.description = description
	// Allocate the map up front: evaluation inserts into it, and writing to
	// a nil map would panic.
	rule.activeAlerts = make(map[clientmodel.Fingerprint]*Alert)
	return rule
}
// Name returns the name of the alerting rule, which is also the name given
// to the alerts it creates.
func (rule *AlertingRule) Name() string {
return rule.name
}
// Eval evaluates the rule expression and then creates pending alerts and fires
// eval evaluates the rule expression and then creates pending alerts and fires
// or removes previously pending alerts accordingly.
func (rule *AlertingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.Engine) (promql.Vector, error) {
query, err := engine.NewInstantQuery(rule.Vector.String(), timestamp)
func (rule *AlertingRule) eval(timestamp clientmodel.Timestamp, engine *promql.Engine) (promql.Vector, error) {
query, err := engine.NewInstantQuery(rule.vector.String(), timestamp)
if err != nil {
return nil, err
}
@ -151,14 +171,14 @@ func (rule *AlertingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.E
if alert, ok := rule.activeAlerts[fp]; !ok {
labels := clientmodel.LabelSet{}
labels.MergeFromMetric(sample.Metric.Metric)
labels = labels.Merge(rule.Labels)
labels = labels.Merge(rule.labels)
if _, ok := labels[clientmodel.MetricNameLabel]; ok {
delete(labels, clientmodel.MetricNameLabel)
}
rule.activeAlerts[fp] = &Alert{
Name: rule.name,
Labels: labels,
State: Pending,
State: StatePending,
ActiveSince: timestamp,
Value: sample.Value,
}
@ -177,9 +197,9 @@ func (rule *AlertingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.E
continue
}
if activeAlert.State == Pending && timestamp.Sub(activeAlert.ActiveSince) >= rule.holdDuration {
if activeAlert.State == StatePending && timestamp.Sub(activeAlert.ActiveSince) >= rule.holdDuration {
vector = append(vector, activeAlert.sample(timestamp, 0))
activeAlert.State = Firing
activeAlert.State = StateFiring
}
vector = append(vector, activeAlert.sample(timestamp, 1))
@ -189,23 +209,23 @@ func (rule *AlertingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.E
}
func (rule *AlertingRule) String() string {
return fmt.Sprintf("ALERT %s IF %s FOR %s WITH %s", rule.name, rule.Vector, strutil.DurationToString(rule.holdDuration), rule.Labels)
return fmt.Sprintf("ALERT %s IF %s FOR %s WITH %s", rule.name, rule.vector, strutil.DurationToString(rule.holdDuration), rule.labels)
}
// HTMLSnippet returns an HTML snippet representing this alerting rule.
func (rule *AlertingRule) HTMLSnippet(pathPrefix string) template.HTML {
alertMetric := clientmodel.Metric{
clientmodel.MetricNameLabel: AlertMetricName,
AlertNameLabel: clientmodel.LabelValue(rule.name),
clientmodel.MetricNameLabel: alertMetricName,
alertNameLabel: clientmodel.LabelValue(rule.name),
}
return template.HTML(fmt.Sprintf(
`ALERT <a href="%s">%s</a> IF <a href="%s">%s</a> FOR %s WITH %s`,
pathPrefix+strutil.GraphLinkForExpression(alertMetric.String()),
rule.name,
pathPrefix+strutil.GraphLinkForExpression(rule.Vector.String()),
rule.Vector,
pathPrefix+strutil.GraphLinkForExpression(rule.vector.String()),
rule.vector,
strutil.DurationToString(rule.holdDuration),
rule.Labels))
rule.labels))
}
// State returns the "maximum" state: firing > pending > inactive.
@ -213,7 +233,7 @@ func (rule *AlertingRule) State() AlertState {
rule.mutex.Lock()
defer rule.mutex.Unlock()
maxState := Inactive
maxState := StateInactive
for _, activeAlert := range rule.activeAlerts {
if activeAlert.State > maxState {
maxState = activeAlert.State
@ -233,17 +253,3 @@ func (rule *AlertingRule) ActiveAlerts() []Alert {
}
return alerts
}
// NewAlertingRule constructs a new AlertingRule from the given name, vector
// expression, hold duration, labels, summary and description. The returned
// rule starts with an empty activeAlerts map.
func NewAlertingRule(name string, vector promql.Expr, holdDuration time.Duration, labels clientmodel.LabelSet, summary string, description string) *AlertingRule {
return &AlertingRule{
name: name,
Vector: vector,
holdDuration: holdDuration,
Labels: labels,
Summary: summary,
Description: description,
activeAlerts: map[clientmodel.Fingerprint]*Alert{},
}
}

View file

@ -39,8 +39,8 @@ const (
namespace = "prometheus"
ruleTypeLabel = "rule_type"
alertingRuleType = "alerting"
recordingRuleType = "recording"
ruleTypeAlerting = "alerting"
ruleTypeRecording = "recording"
)
var (
@ -173,7 +173,7 @@ func (m *Manager) queueAlertNotifications(rule *AlertingRule, timestamp clientmo
notifications := make(notification.NotificationReqs, 0, len(activeAlerts))
for _, aa := range activeAlerts {
if aa.State != Firing {
if aa.State != StateFiring {
// BUG: In the future, make AlertManager support pending alerts?
continue
}
@ -205,15 +205,15 @@ func (m *Manager) queueAlertNotifications(rule *AlertingRule, timestamp clientmo
}
notifications = append(notifications, &notification.NotificationReq{
Summary: expand(rule.Summary),
Description: expand(rule.Description),
Summary: expand(rule.summary),
Description: expand(rule.description),
Labels: aa.Labels.Merge(clientmodel.LabelSet{
AlertNameLabel: clientmodel.LabelValue(rule.Name()),
alertNameLabel: clientmodel.LabelValue(rule.Name()),
}),
Value: aa.Value,
ActiveSince: aa.ActiveSince.Time(),
RuleString: rule.String(),
GeneratorURL: m.prometheusURL + strings.TrimLeft(strutil.GraphLinkForExpression(rule.Vector.String()), "/"),
GeneratorURL: m.prometheusURL + strings.TrimLeft(strutil.GraphLinkForExpression(rule.vector.String()), "/"),
})
}
m.notificationHandler.SubmitReqs(notifications)
@ -235,7 +235,7 @@ func (m *Manager) runIteration() {
defer wg.Done()
start := time.Now()
vector, err := rule.Eval(now, m.queryEngine)
vector, err := rule.eval(now, m.queryEngine)
duration := time.Since(start)
if err != nil {
@ -247,11 +247,11 @@ func (m *Manager) runIteration() {
switch r := rule.(type) {
case *AlertingRule:
m.queueAlertNotifications(r, now)
evalDuration.WithLabelValues(alertingRuleType).Observe(
evalDuration.WithLabelValues(ruleTypeAlerting).Observe(
float64(duration / time.Millisecond),
)
case *RecordingRule:
evalDuration.WithLabelValues(recordingRuleType).Observe(
evalDuration.WithLabelValues(ruleTypeRecording).Observe(
float64(duration / time.Millisecond),
)
default:
@ -319,7 +319,7 @@ func (m *Manager) loadRuleFiles(filenames ...string) error {
rule := NewAlertingRule(r.Name, r.Expr, r.Duration, r.Labels, r.Summary, r.Description)
m.rules = append(m.rules, rule)
case *promql.RecordStmt:
rule := &RecordingRule{r.Name, r.Expr, r.Labels}
rule := NewRecordingRule(r.Name, r.Expr, r.Labels)
m.rules = append(m.rules, rule)
default:
panic("retrieval.Manager.LoadRuleFiles: unknown statement type")

View file

@ -16,7 +16,6 @@ package rules
import (
"fmt"
"html/template"
"strings"
clientmodel "github.com/prometheus/client_golang/model"
@ -31,11 +30,20 @@ type RecordingRule struct {
labels clientmodel.LabelSet
}
// NewRecordingRule builds a RecordingRule from the given rule name, vector
// expression and labels.
func NewRecordingRule(name string, vector promql.Expr, labels clientmodel.LabelSet) *RecordingRule {
	rule := new(RecordingRule)
	rule.name = name
	rule.vector = vector
	rule.labels = labels
	return rule
}
// Name returns the name of the recording rule.
func (rule RecordingRule) Name() string { return rule.name }
// Eval evaluates the rule and then overrides the metric names and labels accordingly.
func (rule RecordingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.Engine) (promql.Vector, error) {
// eval evaluates the rule and then overrides the metric names and labels accordingly.
func (rule RecordingRule) eval(timestamp clientmodel.Timestamp, engine *promql.Engine) (promql.Vector, error) {
query, err := engine.NewInstantQuery(rule.vector.String(), timestamp)
if err != nil {
return nil, err

View file

@ -27,7 +27,7 @@ type Rule interface {
// Name returns the name of the rule.
Name() string
// eval evaluates the rule, including any associated recording or alerting actions.
Eval(clientmodel.Timestamp, *promql.Engine) (promql.Vector, error)
eval(clientmodel.Timestamp, *promql.Engine) (promql.Vector, error)
// String returns a human-readable string representation of the rule.
String() string
// HTMLSnippet returns a human-readable string representation of the rule,

View file

@ -186,7 +186,7 @@ func TestAlertingRule(t *testing.T) {
for i, expectedLines := range evalOutputs {
evalTime := testStartTime.Add(testSampleInterval * time.Duration(i))
res, err := rule.Eval(evalTime, engine)
res, err := rule.eval(evalTime, engine)
if err != nil {
t.Fatalf("Error during alerting rule evaluation: %s", err)
}

View file

@ -63,9 +63,9 @@ func (h *AlertsHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
alertStatus := AlertStatus{
AlertingRules: alertsSorter.alerts,
AlertStateToRowClass: map[rules.AlertState]string{
rules.Inactive: "success",
rules.Pending: "warning",
rules.Firing: "danger",
rules.StateInactive: "success",
rules.StatePending: "warning",
rules.StateFiring: "danger",
},
}
executeTemplate(w, "alerts", alertStatus, h.PathPrefix)