2013-04-24 02:51:40 -07:00
|
|
|
// Copyright 2013 Prometheus Team
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
package rules
|
|
|
|
|
|
|
|
import (
|
2013-04-26 07:02:52 -07:00
|
|
|
"fmt"
|
2013-06-13 07:10:05 -07:00
|
|
|
"html/template"
|
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
2013-06-25 05:02:27 -07:00
|
|
|
clientmodel "github.com/prometheus/client_golang/model"
|
|
|
|
|
2013-04-24 02:51:40 -07:00
|
|
|
"github.com/prometheus/prometheus/rules/ast"
|
2013-06-03 08:07:03 -07:00
|
|
|
"github.com/prometheus/prometheus/stats"
|
2013-05-07 04:15:10 -07:00
|
|
|
"github.com/prometheus/prometheus/storage/metric"
|
2013-04-24 02:51:40 -07:00
|
|
|
"github.com/prometheus/prometheus/utility"
|
|
|
|
)
|
|
|
|
|
2013-06-25 05:02:27 -07:00
|
|
|
const (
|
|
|
|
// The metric name for synthetic alert timeseries.
|
|
|
|
AlertMetricName clientmodel.LabelValue = "ALERTS"
|
|
|
|
|
|
|
|
// The label name indicating the name of an alert.
|
|
|
|
AlertNameLabel clientmodel.LabelName = "alertname"
|
|
|
|
// The label name indicating the state of an alert.
|
|
|
|
AlertStateLabel clientmodel.LabelName = "alertstate"
|
|
|
|
)
|
|
|
|
|
2013-04-24 02:51:40 -07:00
|
|
|
// AlertState enumerates the states that an alert can be in.
type AlertState int

// The declaration order is significant: AlertingRule.State picks the
// greatest state among a rule's active alerts, so a later constant outranks
// an earlier one.
const (
	// INACTIVE is the zero value of AlertState.
	INACTIVE AlertState = iota
	// PENDING is the state a newly created alert starts in.
	PENDING
	// FIRING is the state a pending alert moves to once it has been active
	// for at least the rule's hold duration.
	FIRING
)

// String returns the lower-case name of the state ("inactive", "pending" or
// "firing").
//
// It panics if s is not one of the declared states. The panic message
// includes the numeric value so that the offending state can be identified.
func (s AlertState) String() string {
	switch s {
	case INACTIVE:
		return "inactive"
	case PENDING:
		return "pending"
	case FIRING:
		return "firing"
	}
	// Convert to int explicitly: formatting s itself would call String
	// again and recurse.
	panic(fmt.Sprintf("undefined alert state: %d", int(s)))
}
|
|
|
|
|
2013-06-13 07:10:05 -07:00
|
|
|
// Alert is used to track active (pending/firing) alerts over time.
|
|
|
|
type Alert struct {
|
2013-04-24 02:51:40 -07:00
|
|
|
// The name of the alert.
|
2013-06-13 07:10:05 -07:00
|
|
|
Name string
|
2013-04-24 02:51:40 -07:00
|
|
|
// The vector element labelset triggering this alert.
|
2013-06-25 05:02:27 -07:00
|
|
|
Labels clientmodel.LabelSet
|
2013-04-24 02:51:40 -07:00
|
|
|
// The state of the alert (PENDING or FIRING).
|
2013-06-13 07:10:05 -07:00
|
|
|
State AlertState
|
2013-04-24 02:51:40 -07:00
|
|
|
// The time when the alert first transitioned into PENDING state.
|
Use custom timestamp type for sample timestamps and related code.
So far we've been using Go's native time.Time for anything related to sample
timestamps. Since the range of time.Time is much bigger than what we need, this
has created two problems:
- there could be time.Time values which were out of the range/precision of the
time type that we persist to disk, therefore causing incorrectly ordered keys.
One bug caused by this was:
https://github.com/prometheus/prometheus/issues/367
It would be good to use a timestamp type that's more closely aligned with
what the underlying storage supports.
- sizeof(time.Time) is 192, while Prometheus should be ok with a single 64-bit
Unix timestamp (possibly even a 32-bit one). Since we store samples in large
numbers, this seriously affects memory usage. Furthermore, copying/working
with the data will be faster if it's smaller.
*MEMORY USAGE RESULTS*
Initial memory usage comparisons for a running Prometheus with 1 timeseries and
100,000 samples show roughly a 13% decrease in total (VIRT) memory usage. In my
tests, this advantage for some reason decreased a bit the more samples the
timeseries had (to 5-7% for millions of samples). This I can't fully explain,
but perhaps garbage collection issues were involved.
*WHEN TO USE THE NEW TIMESTAMP TYPE*
The new clientmodel.Timestamp type should be used whenever time
calculations are either directly or indirectly related to sample
timestamps.
For example:
- the timestamp of a sample itself
- all kinds of watermarks
- anything that may become or is compared to a sample timestamp (like the timestamp
passed into Target.Scrape()).
When to still use time.Time:
- for measuring durations/times not related to sample timestamps, like duration
telemetry exporting, timers that indicate how frequently to execute some
action, etc.
*NOTE ON OPERATOR OPTIMIZATION TESTS*
We don't use operator optimization code anymore, but it still lives in
the code as dead code. It still has tests, but I couldn't get all of them to
pass with the new timestamp format. I commented out the failing cases for now,
but we should probably remove the dead code soon. I just didn't want to do that
in the same change as this.
Change-Id: I821787414b0debe85c9fffaeb57abd453727af0f
2013-10-28 06:35:02 -07:00
|
|
|
ActiveSince clientmodel.Timestamp
|
2013-06-13 07:10:05 -07:00
|
|
|
// The value of the alert expression for this vector element.
|
2013-06-25 05:02:27 -07:00
|
|
|
Value clientmodel.SampleValue
|
2013-04-24 02:51:40 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// sample returns a Sample suitable for recording the alert.
|
Use custom timestamp type for sample timestamps and related code.
So far we've been using Go's native time.Time for anything related to sample
timestamps. Since the range of time.Time is much bigger than what we need, this
has created two problems:
- there could be time.Time values which were out of the range/precision of the
time type that we persist to disk, therefore causing incorrectly ordered keys.
One bug caused by this was:
https://github.com/prometheus/prometheus/issues/367
It would be good to use a timestamp type that's more closely aligned with
what the underlying storage supports.
- sizeof(time.Time) is 192, while Prometheus should be ok with a single 64-bit
Unix timestamp (possibly even a 32-bit one). Since we store samples in large
numbers, this seriously affects memory usage. Furthermore, copying/working
with the data will be faster if it's smaller.
*MEMORY USAGE RESULTS*
Initial memory usage comparisons for a running Prometheus with 1 timeseries and
100,000 samples show roughly a 13% decrease in total (VIRT) memory usage. In my
tests, this advantage for some reason decreased a bit the more samples the
timeseries had (to 5-7% for millions of samples). This I can't fully explain,
but perhaps garbage collection issues were involved.
*WHEN TO USE THE NEW TIMESTAMP TYPE*
The new clientmodel.Timestamp type should be used whenever time
calculations are either directly or indirectly related to sample
timestamps.
For example:
- the timestamp of a sample itself
- all kinds of watermarks
- anything that may become or is compared to a sample timestamp (like the timestamp
passed into Target.Scrape()).
When to still use time.Time:
- for measuring durations/times not related to sample timestamps, like duration
telemetry exporting, timers that indicate how frequently to execute some
action, etc.
*NOTE ON OPERATOR OPTIMIZATION TESTS*
We don't use operator optimization code anymore, but it still lives in
the code as dead code. It still has tests, but I couldn't get all of them to
pass with the new timestamp format. I commented out the failing cases for now,
but we should probably remove the dead code soon. I just didn't want to do that
in the same change as this.
Change-Id: I821787414b0debe85c9fffaeb57abd453727af0f
2013-10-28 06:35:02 -07:00
|
|
|
func (a Alert) sample(timestamp clientmodel.Timestamp, value clientmodel.SampleValue) *clientmodel.Sample {
|
2013-06-25 05:02:27 -07:00
|
|
|
recordedMetric := clientmodel.Metric{}
|
2013-06-13 07:10:05 -07:00
|
|
|
for label, value := range a.Labels {
|
2013-04-24 02:51:40 -07:00
|
|
|
recordedMetric[label] = value
|
|
|
|
}
|
|
|
|
|
2013-06-25 05:02:27 -07:00
|
|
|
recordedMetric[clientmodel.MetricNameLabel] = AlertMetricName
|
|
|
|
recordedMetric[AlertNameLabel] = clientmodel.LabelValue(a.Name)
|
|
|
|
recordedMetric[AlertStateLabel] = clientmodel.LabelValue(a.State.String())
|
2013-04-24 02:51:40 -07:00
|
|
|
|
2013-06-25 05:02:27 -07:00
|
|
|
return &clientmodel.Sample{
|
2013-04-24 02:51:40 -07:00
|
|
|
Metric: recordedMetric,
|
|
|
|
Value: value,
|
|
|
|
Timestamp: timestamp,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// An alerting rule generates alerts from its vector expression.
|
|
|
|
type AlertingRule struct {
|
|
|
|
// The name of the alert.
|
2013-04-05 09:03:45 -07:00
|
|
|
name string
|
|
|
|
// The vector expression from which to generate alerts.
|
|
|
|
vector ast.VectorNode
|
2013-04-24 02:51:40 -07:00
|
|
|
// The duration for which a labelset needs to persist in the expression
|
|
|
|
// output vector before an alert transitions from PENDING to FIRING state.
|
|
|
|
holdDuration time.Duration
|
|
|
|
// Extra labels to attach to the resulting alert sample vectors.
|
2013-07-30 08:18:07 -07:00
|
|
|
Labels clientmodel.LabelSet
|
|
|
|
// Short alert summary, suitable for email subjects.
|
|
|
|
Summary string
|
|
|
|
// More detailed alert description.
|
|
|
|
Description string
|
2013-06-13 07:10:05 -07:00
|
|
|
|
|
|
|
// Protects the below.
|
|
|
|
mutex sync.Mutex
|
2013-04-24 02:51:40 -07:00
|
|
|
// A map of alerts which are currently active (PENDING or FIRING), keyed by
|
|
|
|
// the fingerprint of the labelset they correspond to.
|
2013-06-25 05:02:27 -07:00
|
|
|
activeAlerts map[clientmodel.Fingerprint]*Alert
|
2013-04-24 02:51:40 -07:00
|
|
|
}
|
|
|
|
|
2013-07-30 08:18:07 -07:00
|
|
|
func (rule *AlertingRule) Name() string {
|
|
|
|
return rule.name
|
|
|
|
}
|
2013-04-24 02:51:40 -07:00
|
|
|
|
2014-04-11 00:27:05 -07:00
|
|
|
func (rule *AlertingRule) EvalRaw(timestamp clientmodel.Timestamp, storage metric.PreloadingPersistence) (ast.Vector, error) {
|
2013-06-03 08:07:03 -07:00
|
|
|
return ast.EvalVectorInstant(rule.vector, timestamp, storage, stats.NewTimerGroup())
|
2013-04-24 02:51:40 -07:00
|
|
|
}
|
|
|
|
|
2014-04-11 00:27:05 -07:00
|
|
|
func (rule *AlertingRule) Eval(timestamp clientmodel.Timestamp, storage metric.PreloadingPersistence) (ast.Vector, error) {
|
2013-04-24 02:51:40 -07:00
|
|
|
// Get the raw value of the rule expression.
|
2013-05-07 04:15:10 -07:00
|
|
|
exprResult, err := rule.EvalRaw(timestamp, storage)
|
2013-04-24 02:51:40 -07:00
|
|
|
if err != nil {
|
2013-05-15 22:38:31 -07:00
|
|
|
return nil, err
|
2013-04-24 02:51:40 -07:00
|
|
|
}
|
|
|
|
|
2013-06-13 07:10:05 -07:00
|
|
|
rule.mutex.Lock()
|
|
|
|
defer rule.mutex.Unlock()
|
|
|
|
|
2013-06-14 04:03:19 -07:00
|
|
|
// Create pending alerts for any new vector elements in the alert expression
|
|
|
|
// or update the expression value for existing elements.
|
2013-04-24 02:51:40 -07:00
|
|
|
resultFingerprints := utility.Set{}
|
|
|
|
for _, sample := range exprResult {
|
2013-06-25 05:02:27 -07:00
|
|
|
fp := new(clientmodel.Fingerprint)
|
|
|
|
fp.LoadFromMetric(sample.Metric)
|
|
|
|
resultFingerprints.Add(*fp)
|
|
|
|
|
|
|
|
if alert, ok := rule.activeAlerts[*fp]; !ok {
|
|
|
|
labels := clientmodel.LabelSet{}
|
|
|
|
labels.MergeFromMetric(sample.Metric)
|
2013-07-30 08:18:07 -07:00
|
|
|
labels = labels.Merge(rule.Labels)
|
2013-06-25 05:02:27 -07:00
|
|
|
if _, ok := labels[clientmodel.MetricNameLabel]; ok {
|
|
|
|
delete(labels, clientmodel.MetricNameLabel)
|
2013-06-13 07:10:05 -07:00
|
|
|
}
|
2013-06-25 05:02:27 -07:00
|
|
|
rule.activeAlerts[*fp] = &Alert{
|
2013-06-13 07:10:05 -07:00
|
|
|
Name: rule.name,
|
2013-06-14 04:03:19 -07:00
|
|
|
Labels: labels,
|
2013-06-13 07:10:05 -07:00
|
|
|
State: PENDING,
|
|
|
|
ActiveSince: timestamp,
|
|
|
|
Value: sample.Value,
|
2013-04-24 02:51:40 -07:00
|
|
|
}
|
2013-06-13 07:10:05 -07:00
|
|
|
} else {
|
|
|
|
alert.Value = sample.Value
|
2013-04-24 02:51:40 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-05-15 22:38:31 -07:00
|
|
|
vector := ast.Vector{}
|
|
|
|
|
2013-04-24 02:51:40 -07:00
|
|
|
// Check if any pending alerts should be removed or fire now. Write out alert timeseries.
|
|
|
|
for fp, activeAlert := range rule.activeAlerts {
|
|
|
|
if !resultFingerprints.Has(fp) {
|
|
|
|
vector = append(vector, activeAlert.sample(timestamp, 0))
|
|
|
|
delete(rule.activeAlerts, fp)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2013-06-13 07:10:05 -07:00
|
|
|
if activeAlert.State == PENDING && timestamp.Sub(activeAlert.ActiveSince) >= rule.holdDuration {
|
2013-04-24 02:51:40 -07:00
|
|
|
vector = append(vector, activeAlert.sample(timestamp, 0))
|
2013-06-13 07:10:05 -07:00
|
|
|
activeAlert.State = FIRING
|
2013-04-24 02:51:40 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
vector = append(vector, activeAlert.sample(timestamp, 1))
|
|
|
|
}
|
2013-05-15 22:38:31 -07:00
|
|
|
|
|
|
|
return vector, nil
|
2013-04-24 02:51:40 -07:00
|
|
|
}
|
|
|
|
|
2013-06-13 07:10:05 -07:00
|
|
|
func (rule *AlertingRule) ToDotGraph() string {
|
2013-04-26 07:02:52 -07:00
|
|
|
graph := fmt.Sprintf(`digraph "Rules" {
|
|
|
|
%#p[shape="box",label="ALERT %s IF FOR %s"];
|
|
|
|
%#p -> %#p;
|
|
|
|
%s
|
|
|
|
}`, &rule, rule.name, utility.DurationToString(rule.holdDuration), &rule, rule.vector, rule.vector.NodeTreeToDotGraph())
|
|
|
|
return graph
|
|
|
|
}
|
|
|
|
|
2013-06-13 07:10:05 -07:00
|
|
|
func (rule *AlertingRule) String() string {
|
2013-07-30 08:18:07 -07:00
|
|
|
return fmt.Sprintf("ALERT %s IF %s FOR %s WITH %s", rule.name, rule.vector, utility.DurationToString(rule.holdDuration), rule.Labels)
|
2013-06-13 07:10:05 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
func (rule *AlertingRule) HTMLSnippet() template.HTML {
|
2013-06-25 05:02:27 -07:00
|
|
|
alertMetric := clientmodel.Metric{
|
|
|
|
clientmodel.MetricNameLabel: AlertMetricName,
|
|
|
|
AlertNameLabel: clientmodel.LabelValue(rule.name),
|
2013-06-13 07:10:05 -07:00
|
|
|
}
|
|
|
|
return template.HTML(fmt.Sprintf(
|
|
|
|
`ALERT <a href="%s">%s</a> IF <a href="%s">%s</a> FOR %s WITH %s`,
|
|
|
|
ConsoleLinkForExpression(alertMetric.String()),
|
|
|
|
rule.name,
|
|
|
|
ConsoleLinkForExpression(rule.vector.String()),
|
|
|
|
rule.vector,
|
|
|
|
utility.DurationToString(rule.holdDuration),
|
2013-07-30 08:18:07 -07:00
|
|
|
rule.Labels))
|
2013-06-13 07:10:05 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
func (rule *AlertingRule) State() AlertState {
|
|
|
|
rule.mutex.Lock()
|
|
|
|
defer rule.mutex.Unlock()
|
|
|
|
|
|
|
|
maxState := INACTIVE
|
|
|
|
for _, activeAlert := range rule.activeAlerts {
|
|
|
|
if activeAlert.State > maxState {
|
|
|
|
maxState = activeAlert.State
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return maxState
|
|
|
|
}
|
|
|
|
|
|
|
|
func (rule *AlertingRule) ActiveAlerts() []Alert {
|
|
|
|
rule.mutex.Lock()
|
|
|
|
defer rule.mutex.Unlock()
|
|
|
|
|
|
|
|
alerts := make([]Alert, 0, len(rule.activeAlerts))
|
|
|
|
for _, alert := range rule.activeAlerts {
|
|
|
|
alerts = append(alerts, *alert)
|
|
|
|
}
|
|
|
|
return alerts
|
2013-06-06 06:12:37 -07:00
|
|
|
}
|
|
|
|
|
2013-04-24 02:51:40 -07:00
|
|
|
// Construct a new AlertingRule.
|
2013-07-30 08:18:07 -07:00
|
|
|
func NewAlertingRule(name string, vector ast.VectorNode, holdDuration time.Duration, labels clientmodel.LabelSet, summary string, description string) *AlertingRule {
|
2013-04-24 02:51:40 -07:00
|
|
|
return &AlertingRule{
|
|
|
|
name: name,
|
|
|
|
vector: vector,
|
|
|
|
holdDuration: holdDuration,
|
2013-07-30 08:18:07 -07:00
|
|
|
Labels: labels,
|
|
|
|
Summary: summary,
|
|
|
|
Description: description,
|
|
|
|
|
2013-06-25 05:02:27 -07:00
|
|
|
activeAlerts: map[clientmodel.Fingerprint]*Alert{},
|
2013-04-24 02:51:40 -07:00
|
|
|
}
|
|
|
|
}
|