2015-01-21 11:07:45 -08:00
// Copyright 2013 The Prometheus Authors
2013-04-24 02:51:40 -07:00
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package rules
import (
2013-04-26 07:02:52 -07:00
"fmt"
2013-06-13 07:10:05 -07:00
"sync"
"time"
promql: Allow per-query contexts.
For Weaveworks' Frankenstein, we need to support multitenancy. In
Frankenstein, we initially solved this without modifying the promql
package at all: we constructed a new promql.Engine for every
query and injected a storage implementation into that engine which would
be primed to only collect data for a given user.
This is problematic to upstream, however. Prometheus assumes that there
is only one engine: the query concurrency gate is part of the engine,
and the engine contains one central cancellable context to shut down all
queries. Also, creating a new engine for every query seems like overkill.
Thus, we want to be able to pass per-query contexts into a single engine.
This change gets rid of the promql.Engine's built-in base context and
allows passing in a per-query context instead. Central cancellation of
all queries is still possible by deriving all passed-in contexts from
one central one, but this is now the responsibility of the caller. The
central query context is now created in main() and passed into the
relevant components (web handler / API, rule manager).
In a next step, the per-query context would have to be passed to the
storage implementation, so that the storage can implement multi-tenancy
or other features based on the contextual information.
2016-09-15 04:52:50 -07:00
"golang.org/x/net/context"
2016-07-12 09:11:31 -07:00
html_template "html/template"
"github.com/prometheus/common/log"
2015-08-20 08:18:46 -07:00
"github.com/prometheus/common/model"
2013-06-25 05:02:27 -07:00
2015-03-30 10:43:19 -07:00
"github.com/prometheus/prometheus/promql"
2016-07-12 09:11:31 -07:00
"github.com/prometheus/prometheus/template"
2015-05-29 04:30:30 -07:00
"github.com/prometheus/prometheus/util/strutil"
2013-04-24 02:51:40 -07:00
)
2013-06-25 05:02:27 -07:00
const (
2014-12-10 07:16:49 -08:00
// AlertMetricName is the metric name for synthetic alert timeseries.
2015-08-20 08:18:46 -07:00
alertMetricName model . LabelValue = "ALERTS"
2013-06-25 05:02:27 -07:00
2014-12-10 07:16:49 -08:00
// AlertNameLabel is the label name indicating the name of an alert.
2015-08-20 08:18:46 -07:00
alertNameLabel model . LabelName = "alertname"
2014-12-10 07:16:49 -08:00
// AlertStateLabel is the label name indicating the state of an alert.
2015-08-20 08:18:46 -07:00
alertStateLabel model . LabelName = "alertstate"
2013-06-25 05:02:27 -07:00
)
2014-12-10 07:16:49 -08:00
// AlertState denotes the state of an active alert.
type AlertState int

const (
	// StateInactive is the state of an alert that is neither firing nor pending.
	StateInactive AlertState = iota
	// StatePending is the state of an alert that has been active for less than
	// the configured threshold duration.
	StatePending
	// StateFiring is the state of an alert that has been active for longer than
	// the configured threshold duration.
	StateFiring
)

// String returns the lower-case name of the alert state ("inactive",
// "pending" or "firing"). It panics with an error describing the numeric
// value if s is not one of the declared states.
func (s AlertState) String() string {
	switch s {
	case StateInactive:
		return "inactive"
	case StatePending:
		return "pending"
	case StateFiring:
		return "firing"
	}
	// Format the raw integer: calling s.String() (directly, or implicitly via
	// %v/%s on s) would re-enter this method and recurse without bound.
	panic(fmt.Errorf("unknown alert state: %d", int(s)))
}
2015-12-15 10:46:03 -08:00
// Alert is the user-level representation of a single instance of an alerting rule.
type Alert struct {
2016-07-12 09:11:31 -07:00
State AlertState
Labels model . LabelSet
Annotations model . LabelSet
2015-12-17 02:46:10 -08:00
// The value at the last evaluation of the alerting expression.
Value model . SampleValue
// The interval during which the condition of this alert held true.
// ResolvedAt will be 0 to indicate a still active alert.
ActiveAt , ResolvedAt model . Time
2013-04-24 02:51:40 -07:00
}
2014-12-10 07:16:49 -08:00
// An AlertingRule generates alerts from its vector expression.
2013-04-24 02:51:40 -07:00
type AlertingRule struct {
// The name of the alert.
2013-04-05 09:03:45 -07:00
name string
// The vector expression from which to generate alerts.
2015-05-25 12:16:32 -07:00
vector promql . Expr
2013-04-24 02:51:40 -07:00
// The duration for which a labelset needs to persist in the expression
2014-12-10 07:16:49 -08:00
// output vector before an alert transitions from Pending to Firing state.
2013-04-24 02:51:40 -07:00
holdDuration time . Duration
// Extra labels to attach to the resulting alert sample vectors.
2015-08-20 08:18:46 -07:00
labels model . LabelSet
2015-12-11 08:12:34 -08:00
// Non-identifying key/value pairs.
annotations model . LabelSet
2013-06-13 07:10:05 -07:00
// Protects the below.
2015-12-14 08:40:40 -08:00
mtx sync . Mutex
2014-12-10 07:16:49 -08:00
// A map of alerts which are currently active (Pending or Firing), keyed by
2013-04-24 02:51:40 -07:00
// the fingerprint of the labelset they correspond to.
2015-12-15 10:46:03 -08:00
active map [ model . Fingerprint ] * Alert
2013-04-24 02:51:40 -07:00
}
2015-05-25 12:16:32 -07:00
// NewAlertingRule constructs a new AlertingRule.
2015-12-11 08:12:34 -08:00
func NewAlertingRule ( name string , vec promql . Expr , hold time . Duration , lbls , anns model . LabelSet ) * AlertingRule {
2015-05-25 12:16:32 -07:00
return & AlertingRule {
name : name ,
2015-12-11 08:12:34 -08:00
vector : vec ,
holdDuration : hold ,
labels : lbls ,
annotations : anns ,
2015-12-15 10:46:03 -08:00
active : map [ model . Fingerprint ] * Alert { } ,
2015-05-25 12:16:32 -07:00
}
}
2014-12-10 07:16:49 -08:00
// Name returns the name of the alert.
2016-05-19 07:59:53 -07:00
func ( r * AlertingRule ) Name ( ) string {
return r . name
2013-07-30 08:18:07 -07:00
}
2013-04-24 02:51:40 -07:00
2016-03-02 02:54:37 -08:00
// equal reports whether r and o have the same name and the same rule labels.
// The expression, hold duration and annotations are not compared.
func (r *AlertingRule) equal(o *AlertingRule) bool {
	if r.name != o.name {
		return false
	}
	return r.labels.Equal(o.labels)
}
2015-12-15 10:46:03 -08:00
func ( r * AlertingRule ) sample ( alert * Alert , ts model . Time , set bool ) * model . Sample {
2015-12-14 08:40:40 -08:00
metric := model . Metric ( r . labels . Clone ( ) )
2015-12-15 10:46:03 -08:00
for ln , lv := range alert . Labels {
2015-12-14 08:40:40 -08:00
metric [ ln ] = lv
}
metric [ model . MetricNameLabel ] = alertMetricName
metric [ model . AlertNameLabel ] = model . LabelValue ( r . name )
2015-12-15 10:46:03 -08:00
metric [ alertStateLabel ] = model . LabelValue ( alert . State . String ( ) )
2015-12-14 08:40:40 -08:00
s := & model . Sample {
Metric : metric ,
Timestamp : ts ,
Value : 0 ,
}
if set {
s . Value = 1
}
return s
}
2015-12-15 10:46:03 -08:00
// resolvedRetention is the duration for which a resolved alert instance
// is kept in memory state and consequently repeatedly sent to the AlertManager.
const resolvedRetention = 15 * time . Minute
2015-05-25 12:16:32 -07:00
// eval evaluates the rule expression and then creates pending alerts and fires
2015-05-25 11:43:24 -07:00
// or removes previously pending alerts accordingly.
2016-09-15 15:58:06 -07:00
func ( r * AlertingRule ) eval ( ctx context . Context , ts model . Time , engine * promql . Engine , externalURLPath string ) ( model . Vector , error ) {
2015-12-14 08:40:40 -08:00
query , err := engine . NewInstantQuery ( r . vector . String ( ) , ts )
2015-03-30 10:43:19 -07:00
if err != nil {
return nil , err
}
2016-09-15 15:58:06 -07:00
res , err := query . Exec ( ctx ) . Vector ( )
2013-04-24 02:51:40 -07:00
if err != nil {
2013-05-15 22:38:31 -07:00
return nil , err
2013-04-24 02:51:40 -07:00
}
2015-12-14 08:40:40 -08:00
r . mtx . Lock ( )
defer r . mtx . Unlock ( )
2013-06-13 07:10:05 -07:00
2013-06-14 04:03:19 -07:00
// Create pending alerts for any new vector elements in the alert expression
// or update the expression value for existing elements.
2015-08-20 08:18:46 -07:00
resultFPs := map [ model . Fingerprint ] struct { } { }
2015-12-14 08:40:40 -08:00
for _ , smpl := range res {
2016-07-12 09:11:31 -07:00
// Provide the alert information to the template.
l := make ( map [ string ] string , len ( smpl . Metric ) )
for k , v := range smpl . Metric {
l [ string ( k ) ] = string ( v )
}
tmplData := struct {
Labels map [ string ] string
Value float64
} {
Labels : l ,
Value : float64 ( smpl . Value ) ,
}
// Inject some convenience variables that are easier to remember for users
// who are not used to Go's templating system.
defs := "{{$labels := .Labels}}{{$value := .Value}}"
expand := func ( text model . LabelValue ) model . LabelValue {
tmpl := template . NewTemplateExpander (
2016-09-15 15:58:06 -07:00
ctx ,
2016-07-12 09:11:31 -07:00
defs + string ( text ) ,
"__alert_" + r . Name ( ) ,
tmplData ,
ts ,
engine ,
externalURLPath ,
)
result , err := tmpl . Expand ( )
if err != nil {
result = fmt . Sprintf ( "<error expanding template: %s>" , err )
log . Warnf ( "Error expanding alert template %v with data '%v': %s" , r . Name ( ) , tmplData , err )
}
return model . LabelValue ( result )
}
2016-08-01 15:32:01 -07:00
delete ( smpl . Metric , model . MetricNameLabel )
2016-07-12 09:11:31 -07:00
labels := make ( model . LabelSet , len ( smpl . Metric ) + len ( r . labels ) + 1 )
for ln , lv := range smpl . Metric {
labels [ ln ] = lv
}
for ln , lv := range r . labels {
labels [ ln ] = expand ( lv )
}
labels [ model . AlertNameLabel ] = model . LabelValue ( r . Name ( ) )
annotations := make ( model . LabelSet , len ( r . annotations ) )
for an , av := range r . annotations {
annotations [ an ] = expand ( av )
}
2015-12-14 08:40:40 -08:00
fp := smpl . Metric . Fingerprint ( )
2015-05-28 12:51:44 -07:00
resultFPs [ fp ] = struct { } { }
2013-06-25 05:02:27 -07:00
2016-02-04 20:42:55 -08:00
if alert , ok := r . active [ fp ] ; ok && alert . State != StateInactive {
2015-12-15 10:46:03 -08:00
alert . Value = smpl . Value
2015-12-14 08:40:40 -08:00
continue
}
2015-12-15 10:46:03 -08:00
r . active [ fp ] = & Alert {
2016-07-12 09:11:31 -07:00
Labels : labels ,
Annotations : annotations ,
ActiveAt : ts ,
State : StatePending ,
Value : smpl . Value ,
2013-04-24 02:51:40 -07:00
}
}
2015-12-14 08:40:40 -08:00
var vec model . Vector
2013-04-24 02:51:40 -07:00
// Check if any pending alerts should be removed or fire now. Write out alert timeseries.
2015-12-15 10:46:03 -08:00
for fp , a := range r . active {
2015-05-28 12:51:44 -07:00
if _ , ok := resultFPs [ fp ] ; ! ok {
2015-12-15 10:46:03 -08:00
if a . State != StateInactive {
vec = append ( vec , r . sample ( a , ts , false ) )
}
2015-12-17 02:46:10 -08:00
// If the alert was previously firing, keep it around for a given
2015-12-15 10:46:03 -08:00
// retention time so it is reported as resolved to the AlertManager.
if a . State == StatePending || ( a . ResolvedAt != 0 && ts . Sub ( a . ResolvedAt ) > resolvedRetention ) {
delete ( r . active , fp )
}
if a . State != StateInactive {
a . State = StateInactive
a . ResolvedAt = ts
}
2013-04-24 02:51:40 -07:00
continue
}
2015-12-15 10:46:03 -08:00
if a . State == StatePending && ts . Sub ( a . ActiveAt ) >= r . holdDuration {
vec = append ( vec , r . sample ( a , ts , false ) )
a . State = StateFiring
2013-04-24 02:51:40 -07:00
}
2015-12-15 10:46:03 -08:00
vec = append ( vec , r . sample ( a , ts , true ) )
2013-04-24 02:51:40 -07:00
}
2013-05-15 22:38:31 -07:00
2015-12-14 08:40:40 -08:00
return vec , nil
}
2015-12-17 02:46:10 -08:00
// State returns the maximum state of alert instances for this rule.
// StateFiring > StatePending > StateInactive
2015-12-14 08:40:40 -08:00
func ( r * AlertingRule ) State ( ) AlertState {
r . mtx . Lock ( )
defer r . mtx . Unlock ( )
maxState := StateInactive
2015-12-15 10:46:03 -08:00
for _ , a := range r . active {
if a . State > maxState {
maxState = a . State
2015-12-14 08:40:40 -08:00
}
}
return maxState
}
// ActiveAlerts returns a slice of active alerts.
func ( r * AlertingRule ) ActiveAlerts ( ) [ ] * Alert {
2015-12-15 10:46:03 -08:00
var res [ ] * Alert
2015-12-17 02:46:10 -08:00
for _ , a := range r . currentAlerts ( ) {
2015-12-15 10:46:03 -08:00
if a . ResolvedAt == 0 {
res = append ( res , a )
}
}
return res
}
2015-12-17 02:46:10 -08:00
// currentAlerts returns all instances of alerts for this rule. This may include
// inactive alerts that were previously firing.
func ( r * AlertingRule ) currentAlerts ( ) [ ] * Alert {
2015-12-14 08:40:40 -08:00
r . mtx . Lock ( )
defer r . mtx . Unlock ( )
alerts := make ( [ ] * Alert , 0 , len ( r . active ) )
2015-12-15 10:46:03 -08:00
for _ , a := range r . active {
anew := * a
alerts = append ( alerts , & anew )
2015-12-14 08:40:40 -08:00
}
return alerts
2013-04-24 02:51:40 -07:00
}
2016-05-19 07:59:53 -07:00
func ( r * AlertingRule ) String ( ) string {
s := fmt . Sprintf ( "ALERT %s" , r . name )
s += fmt . Sprintf ( "\n\tIF %s" , r . vector )
if r . holdDuration > 0 {
s += fmt . Sprintf ( "\n\tFOR %s" , model . Duration ( r . holdDuration ) )
2015-06-23 08:46:57 -07:00
}
2016-05-19 07:59:53 -07:00
if len ( r . labels ) > 0 {
s += fmt . Sprintf ( "\n\tLABELS %s" , r . labels )
2015-06-23 08:46:57 -07:00
}
2016-05-19 07:59:53 -07:00
if len ( r . annotations ) > 0 {
s += fmt . Sprintf ( "\n\tANNOTATIONS %s" , r . annotations )
2015-12-11 08:12:34 -08:00
}
2015-06-23 08:46:57 -07:00
return s
2013-06-13 07:10:05 -07:00
}
2015-06-23 08:46:57 -07:00
// HTMLSnippet returns an HTML snippet representing this alerting rule. The
// resulting snippet is expected to be presented in a <pre> element, so that
// line breaks and other returned whitespace is respected.
2016-07-12 09:11:31 -07:00
func ( r * AlertingRule ) HTMLSnippet ( pathPrefix string ) html_template . HTML {
2015-08-20 08:18:46 -07:00
alertMetric := model . Metric {
model . MetricNameLabel : alertMetricName ,
2016-05-19 07:59:53 -07:00
alertNameLabel : model . LabelValue ( r . name ) ,
2013-06-13 07:10:05 -07:00
}
2016-05-19 07:59:53 -07:00
s := fmt . Sprintf ( "ALERT <a href=%q>%s</a>" , pathPrefix + strutil . GraphLinkForExpression ( alertMetric . String ( ) ) , r . name )
2016-08-11 17:52:59 -07:00
s += fmt . Sprintf ( "\n IF <a href=%q>%s</a>" , pathPrefix + strutil . GraphLinkForExpression ( r . vector . String ( ) ) , html_template . HTMLEscapeString ( r . vector . String ( ) ) )
2016-05-19 07:59:53 -07:00
if r . holdDuration > 0 {
s += fmt . Sprintf ( "\n FOR %s" , model . Duration ( r . holdDuration ) )
2015-06-23 08:46:57 -07:00
}
2016-05-19 07:59:53 -07:00
if len ( r . labels ) > 0 {
2016-08-11 17:52:59 -07:00
s += fmt . Sprintf ( "\n LABELS %s" , html_template . HTMLEscapeString ( r . labels . String ( ) ) )
2015-06-23 08:46:57 -07:00
}
2016-05-19 07:59:53 -07:00
if len ( r . annotations ) > 0 {
2016-08-11 17:52:59 -07:00
s += fmt . Sprintf ( "\n ANNOTATIONS %s" , html_template . HTMLEscapeString ( r . annotations . String ( ) ) )
2015-12-11 08:12:34 -08:00
}
2016-07-12 09:11:31 -07:00
return html_template . HTML ( s )
2013-06-13 07:10:05 -07:00
}