2015-01-21 11:07:45 -08:00
// Copyright 2013 The Prometheus Authors
2013-04-24 02:51:40 -07:00
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package rules
import (
2013-04-26 07:02:52 -07:00
"fmt"
2013-06-13 07:10:05 -07:00
"sync"
"time"
2016-07-12 09:11:31 -07:00
html_template "html/template"
"github.com/prometheus/common/log"
2015-08-20 08:18:46 -07:00
"github.com/prometheus/common/model"
2013-06-25 05:02:27 -07:00
2015-03-30 10:43:19 -07:00
"github.com/prometheus/prometheus/promql"
2016-07-12 09:11:31 -07:00
"github.com/prometheus/prometheus/template"
2015-05-29 04:30:30 -07:00
"github.com/prometheus/prometheus/util/strutil"
2013-04-24 02:51:40 -07:00
)
2013-06-25 05:02:27 -07:00
const (
2014-12-10 07:16:49 -08:00
// AlertMetricName is the metric name for synthetic alert timeseries.
2015-08-20 08:18:46 -07:00
alertMetricName model . LabelValue = "ALERTS"
2013-06-25 05:02:27 -07:00
2014-12-10 07:16:49 -08:00
// AlertNameLabel is the label name indicating the name of an alert.
2015-08-20 08:18:46 -07:00
alertNameLabel model . LabelName = "alertname"
2014-12-10 07:16:49 -08:00
// AlertStateLabel is the label name indicating the state of an alert.
2015-08-20 08:18:46 -07:00
alertStateLabel model . LabelName = "alertstate"
2013-06-25 05:02:27 -07:00
)
2014-12-10 07:16:49 -08:00
// AlertState denotes the state of an active alert.
type AlertState int

const (
	// StateInactive is the state of an alert that is neither firing nor pending.
	StateInactive AlertState = iota
	// StatePending is the state of an alert that has been active for less than
	// the configured threshold duration.
	StatePending
	// StateFiring is the state of an alert that has been active for longer than
	// the configured threshold duration.
	StateFiring
)

// String returns the lower-case name of the state: "inactive", "pending" or
// "firing". It panics with an error value if s is not one of the declared
// states.
func (s AlertState) String() string {
	switch s {
	case StateInactive:
		return "inactive"
	case StatePending:
		return "pending"
	case StateFiring:
		return "firing"
	}
	// Format the raw integer. Formatting s itself (or calling s.String())
	// would re-enter this method and recurse until the stack overflows,
	// which is a fatal crash rather than a recoverable panic.
	panic(fmt.Errorf("unknown alert state: %d", int(s)))
}
2015-12-15 10:46:03 -08:00
// Alert is the user-level representation of a single instance of an alerting rule.
type Alert struct {
2016-07-12 09:11:31 -07:00
State AlertState
Labels model . LabelSet
Annotations model . LabelSet
2015-12-17 02:46:10 -08:00
// The value at the last evaluation of the alerting expression.
Value model . SampleValue
// The interval during which the condition of this alert held true.
// ResolvedAt will be 0 to indicate a still active alert.
ActiveAt , ResolvedAt model . Time
2013-04-24 02:51:40 -07:00
}
2014-12-10 07:16:49 -08:00
// An AlertingRule generates alerts from its vector expression.
2013-04-24 02:51:40 -07:00
type AlertingRule struct {
// The name of the alert.
2013-04-05 09:03:45 -07:00
name string
// The vector expression from which to generate alerts.
2015-05-25 12:16:32 -07:00
vector promql . Expr
2013-04-24 02:51:40 -07:00
// The duration for which a labelset needs to persist in the expression
2014-12-10 07:16:49 -08:00
// output vector before an alert transitions from Pending to Firing state.
2013-04-24 02:51:40 -07:00
holdDuration time . Duration
// Extra labels to attach to the resulting alert sample vectors.
2015-08-20 08:18:46 -07:00
labels model . LabelSet
2015-12-11 08:12:34 -08:00
// Non-identifying key/value pairs.
annotations model . LabelSet
2013-06-13 07:10:05 -07:00
// Protects the below.
2015-12-14 08:40:40 -08:00
mtx sync . Mutex
2014-12-10 07:16:49 -08:00
// A map of alerts which are currently active (Pending or Firing), keyed by
2013-04-24 02:51:40 -07:00
// the fingerprint of the labelset they correspond to.
2015-12-15 10:46:03 -08:00
active map [ model . Fingerprint ] * Alert
2013-04-24 02:51:40 -07:00
}
2015-05-25 12:16:32 -07:00
// NewAlertingRule constructs a new AlertingRule.
2015-12-11 08:12:34 -08:00
func NewAlertingRule ( name string , vec promql . Expr , hold time . Duration , lbls , anns model . LabelSet ) * AlertingRule {
2015-05-25 12:16:32 -07:00
return & AlertingRule {
name : name ,
2015-12-11 08:12:34 -08:00
vector : vec ,
holdDuration : hold ,
labels : lbls ,
annotations : anns ,
2015-12-15 10:46:03 -08:00
active : map [ model . Fingerprint ] * Alert { } ,
2015-05-25 12:16:32 -07:00
}
}
2014-12-10 07:16:49 -08:00
// Name returns the name of the alert.
2016-05-19 07:59:53 -07:00
func ( r * AlertingRule ) Name ( ) string {
return r . name
2013-07-30 08:18:07 -07:00
}
2013-04-24 02:51:40 -07:00
2016-03-02 02:54:37 -08:00
// equal reports whether r and o have the same name and the same label set.
// The expression, hold duration and annotations are not compared.
func (r *AlertingRule) equal(o *AlertingRule) bool {
	if r.name != o.name {
		return false
	}
	return r.labels.Equal(o.labels)
}
2015-12-15 10:46:03 -08:00
func ( r * AlertingRule ) sample ( alert * Alert , ts model . Time , set bool ) * model . Sample {
2015-12-14 08:40:40 -08:00
metric := model . Metric ( r . labels . Clone ( ) )
2015-12-15 10:46:03 -08:00
for ln , lv := range alert . Labels {
2015-12-14 08:40:40 -08:00
metric [ ln ] = lv
}
metric [ model . MetricNameLabel ] = alertMetricName
metric [ model . AlertNameLabel ] = model . LabelValue ( r . name )
2015-12-15 10:46:03 -08:00
metric [ alertStateLabel ] = model . LabelValue ( alert . State . String ( ) )
2015-12-14 08:40:40 -08:00
s := & model . Sample {
Metric : metric ,
Timestamp : ts ,
Value : 0 ,
}
if set {
s . Value = 1
}
return s
}
2015-12-15 10:46:03 -08:00
// resolvedRetention is the duration for which a resolved alert instance
// is kept in memory state and consequently repeatedly sent to the AlertManager.
const resolvedRetention = 15 * time . Minute
2015-05-25 12:16:32 -07:00
// eval evaluates the rule expression and then creates pending alerts and fires
2015-05-25 11:43:24 -07:00
// or removes previously pending alerts accordingly.
2016-07-12 09:11:31 -07:00
func ( r * AlertingRule ) eval ( ts model . Time , engine * promql . Engine , externalURLPath string ) ( model . Vector , error ) {
2015-12-14 08:40:40 -08:00
query , err := engine . NewInstantQuery ( r . vector . String ( ) , ts )
2015-03-30 10:43:19 -07:00
if err != nil {
return nil , err
}
2015-12-14 08:40:40 -08:00
res , err := query . Exec ( ) . Vector ( )
2013-04-24 02:51:40 -07:00
if err != nil {
2013-05-15 22:38:31 -07:00
return nil , err
2013-04-24 02:51:40 -07:00
}
2015-12-14 08:40:40 -08:00
r . mtx . Lock ( )
defer r . mtx . Unlock ( )
2013-06-13 07:10:05 -07:00
2013-06-14 04:03:19 -07:00
// Create pending alerts for any new vector elements in the alert expression
// or update the expression value for existing elements.
2015-08-20 08:18:46 -07:00
resultFPs := map [ model . Fingerprint ] struct { } { }
2015-12-14 08:40:40 -08:00
for _ , smpl := range res {
2016-07-12 09:11:31 -07:00
// Provide the alert information to the template.
l := make ( map [ string ] string , len ( smpl . Metric ) )
for k , v := range smpl . Metric {
l [ string ( k ) ] = string ( v )
}
tmplData := struct {
Labels map [ string ] string
Value float64
} {
Labels : l ,
Value : float64 ( smpl . Value ) ,
}
// Inject some convenience variables that are easier to remember for users
// who are not used to Go's templating system.
defs := "{{$labels := .Labels}}{{$value := .Value}}"
expand := func ( text model . LabelValue ) model . LabelValue {
tmpl := template . NewTemplateExpander (
defs + string ( text ) ,
"__alert_" + r . Name ( ) ,
tmplData ,
ts ,
engine ,
externalURLPath ,
)
result , err := tmpl . Expand ( )
if err != nil {
result = fmt . Sprintf ( "<error expanding template: %s>" , err )
log . Warnf ( "Error expanding alert template %v with data '%v': %s" , r . Name ( ) , tmplData , err )
}
return model . LabelValue ( result )
}
2016-08-01 15:32:01 -07:00
delete ( smpl . Metric , model . MetricNameLabel )
2016-07-12 09:11:31 -07:00
labels := make ( model . LabelSet , len ( smpl . Metric ) + len ( r . labels ) + 1 )
for ln , lv := range smpl . Metric {
labels [ ln ] = lv
}
for ln , lv := range r . labels {
labels [ ln ] = expand ( lv )
}
labels [ model . AlertNameLabel ] = model . LabelValue ( r . Name ( ) )
annotations := make ( model . LabelSet , len ( r . annotations ) )
for an , av := range r . annotations {
annotations [ an ] = expand ( av )
}
2015-12-14 08:40:40 -08:00
fp := smpl . Metric . Fingerprint ( )
2015-05-28 12:51:44 -07:00
resultFPs [ fp ] = struct { } { }
2013-06-25 05:02:27 -07:00
2016-02-04 20:42:55 -08:00
if alert , ok := r . active [ fp ] ; ok && alert . State != StateInactive {
2015-12-15 10:46:03 -08:00
alert . Value = smpl . Value
2015-12-14 08:40:40 -08:00
continue
}
2015-12-15 10:46:03 -08:00
r . active [ fp ] = & Alert {
2016-07-12 09:11:31 -07:00
Labels : labels ,
Annotations : annotations ,
ActiveAt : ts ,
State : StatePending ,
Value : smpl . Value ,
2013-04-24 02:51:40 -07:00
}
}
2015-12-14 08:40:40 -08:00
var vec model . Vector
2013-04-24 02:51:40 -07:00
// Check if any pending alerts should be removed or fire now. Write out alert timeseries.
2015-12-15 10:46:03 -08:00
for fp , a := range r . active {
2015-05-28 12:51:44 -07:00
if _ , ok := resultFPs [ fp ] ; ! ok {
2015-12-15 10:46:03 -08:00
if a . State != StateInactive {
vec = append ( vec , r . sample ( a , ts , false ) )
}
2015-12-17 02:46:10 -08:00
// If the alert was previously firing, keep it around for a given
2015-12-15 10:46:03 -08:00
// retention time so it is reported as resolved to the AlertManager.
if a . State == StatePending || ( a . ResolvedAt != 0 && ts . Sub ( a . ResolvedAt ) > resolvedRetention ) {
delete ( r . active , fp )
}
if a . State != StateInactive {
a . State = StateInactive
a . ResolvedAt = ts
}
2013-04-24 02:51:40 -07:00
continue
}
2015-12-15 10:46:03 -08:00
if a . State == StatePending && ts . Sub ( a . ActiveAt ) >= r . holdDuration {
vec = append ( vec , r . sample ( a , ts , false ) )
a . State = StateFiring
2013-04-24 02:51:40 -07:00
}
2015-12-15 10:46:03 -08:00
vec = append ( vec , r . sample ( a , ts , true ) )
2013-04-24 02:51:40 -07:00
}
2013-05-15 22:38:31 -07:00
2015-12-14 08:40:40 -08:00
return vec , nil
}
2015-12-17 02:46:10 -08:00
// State returns the maximum state of alert instances for this rule.
// StateFiring > StatePending > StateInactive
2015-12-14 08:40:40 -08:00
func ( r * AlertingRule ) State ( ) AlertState {
r . mtx . Lock ( )
defer r . mtx . Unlock ( )
maxState := StateInactive
2015-12-15 10:46:03 -08:00
for _ , a := range r . active {
if a . State > maxState {
maxState = a . State
2015-12-14 08:40:40 -08:00
}
}
return maxState
}
// ActiveAlerts returns a slice of active alerts.
func ( r * AlertingRule ) ActiveAlerts ( ) [ ] * Alert {
2015-12-15 10:46:03 -08:00
var res [ ] * Alert
2015-12-17 02:46:10 -08:00
for _ , a := range r . currentAlerts ( ) {
2015-12-15 10:46:03 -08:00
if a . ResolvedAt == 0 {
res = append ( res , a )
}
}
return res
}
2015-12-17 02:46:10 -08:00
// currentAlerts returns all instances of alerts for this rule. This may include
// inactive alerts that were previously firing.
func ( r * AlertingRule ) currentAlerts ( ) [ ] * Alert {
2015-12-14 08:40:40 -08:00
r . mtx . Lock ( )
defer r . mtx . Unlock ( )
alerts := make ( [ ] * Alert , 0 , len ( r . active ) )
2015-12-15 10:46:03 -08:00
for _ , a := range r . active {
anew := * a
alerts = append ( alerts , & anew )
2015-12-14 08:40:40 -08:00
}
return alerts
2013-04-24 02:51:40 -07:00
}
2016-05-19 07:59:53 -07:00
func ( r * AlertingRule ) String ( ) string {
s := fmt . Sprintf ( "ALERT %s" , r . name )
s += fmt . Sprintf ( "\n\tIF %s" , r . vector )
if r . holdDuration > 0 {
s += fmt . Sprintf ( "\n\tFOR %s" , model . Duration ( r . holdDuration ) )
2015-06-23 08:46:57 -07:00
}
2016-05-19 07:59:53 -07:00
if len ( r . labels ) > 0 {
s += fmt . Sprintf ( "\n\tLABELS %s" , r . labels )
2015-06-23 08:46:57 -07:00
}
2016-05-19 07:59:53 -07:00
if len ( r . annotations ) > 0 {
s += fmt . Sprintf ( "\n\tANNOTATIONS %s" , r . annotations )
2015-12-11 08:12:34 -08:00
}
2015-06-23 08:46:57 -07:00
return s
2013-06-13 07:10:05 -07:00
}
2015-06-23 08:46:57 -07:00
// HTMLSnippet returns an HTML snippet representing this alerting rule. The
// resulting snippet is expected to be presented in a <pre> element, so that
// line breaks and other returned whitespace is respected.
2016-07-12 09:11:31 -07:00
func ( r * AlertingRule ) HTMLSnippet ( pathPrefix string ) html_template . HTML {
2015-08-20 08:18:46 -07:00
alertMetric := model . Metric {
model . MetricNameLabel : alertMetricName ,
2016-05-19 07:59:53 -07:00
alertNameLabel : model . LabelValue ( r . name ) ,
2013-06-13 07:10:05 -07:00
}
2016-05-19 07:59:53 -07:00
s := fmt . Sprintf ( "ALERT <a href=%q>%s</a>" , pathPrefix + strutil . GraphLinkForExpression ( alertMetric . String ( ) ) , r . name )
2016-08-11 17:52:59 -07:00
s += fmt . Sprintf ( "\n IF <a href=%q>%s</a>" , pathPrefix + strutil . GraphLinkForExpression ( r . vector . String ( ) ) , html_template . HTMLEscapeString ( r . vector . String ( ) ) )
2016-05-19 07:59:53 -07:00
if r . holdDuration > 0 {
s += fmt . Sprintf ( "\n FOR %s" , model . Duration ( r . holdDuration ) )
2015-06-23 08:46:57 -07:00
}
2016-05-19 07:59:53 -07:00
if len ( r . labels ) > 0 {
2016-08-11 17:52:59 -07:00
s += fmt . Sprintf ( "\n LABELS %s" , html_template . HTMLEscapeString ( r . labels . String ( ) ) )
2015-06-23 08:46:57 -07:00
}
2016-05-19 07:59:53 -07:00
if len ( r . annotations ) > 0 {
2016-08-11 17:52:59 -07:00
s += fmt . Sprintf ( "\n ANNOTATIONS %s" , html_template . HTMLEscapeString ( r . annotations . String ( ) ) )
2015-12-11 08:12:34 -08:00
}
2016-07-12 09:11:31 -07:00
return html_template . HTML ( s )
2013-06-13 07:10:05 -07:00
}