rules: cleanup alerting test

Repository: https://github.com/prometheus/prometheus.git
commit f06cf664e1
parent 9bd4f6d017
The commit factors the alert-state carry-over out of ApplyConfig into a new transferAlertState method on the rule manager, and rewrites the alerting test on top of the promql test suite, adding a dedicated TestTransferAlertState.

Changes to the rule manager:

@@ -21,6 +21,8 @@ import (
 	"sync"
 	"time"
 
+	html_template "html/template"
+
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/log"
 
@@ -84,7 +86,7 @@ type Rule interface {
 	String() string
 	// HTMLSnippet returns a human-readable string representation of the rule,
 	// decorated with HTML elements for use the web frontend.
-	HTMLSnippet(pathPrefix string) template.HTML
+	HTMLSnippet(pathPrefix string) html_template.HTML
 }
 
 // The Manager manages recording and alerting rules.
@@ -285,14 +287,9 @@ func (m *Manager) runIteration() {
 	wg.Wait()
 }
 
-// ApplyConfig updates the rule manager's state as the config requires. If
-// loading the new rules failed the old rule set is restored. Returns true on success.
-func (m *Manager) ApplyConfig(conf *config.Config) bool {
-	m.Lock()
-	defer m.Unlock()
-
-	success := true
-	m.interval = time.Duration(conf.GlobalConfig.EvaluationInterval)
-
+// transferAlertState makes a copy of the state of alerting rules and returns a function
+// that restores them in the current state.
+func (m *Manager) transferAlertState() func() {
+
 	alertingRules := map[string]*AlertingRule{}
 	for _, r := range m.rules {
@@ -301,6 +298,31 @@ func (m *Manager) ApplyConfig(conf *config.Config) bool {
 		}
 	}
 
+	return func() {
+		// Restore alerting rule state.
+		for _, r := range m.rules {
+			ar, ok := r.(*AlertingRule)
+			if !ok {
+				continue
+			}
+			if old, ok := alertingRules[ar.name]; ok {
+				ar.activeAlerts = old.activeAlerts
+			}
+		}
+	}
+}
+
+// ApplyConfig updates the rule manager's state as the config requires. If
+// loading the new rules failed the old rule set is restored. Returns true on success.
+func (m *Manager) ApplyConfig(conf *config.Config) bool {
+	m.Lock()
+	defer m.Unlock()
+
+	defer m.transferAlertState()()
+
+	success := true
+	m.interval = time.Duration(conf.GlobalConfig.EvaluationInterval)
+
 	rulesSnapshot := make([]Rule, len(m.rules))
 	copy(rulesSnapshot, m.rules)
 	m.rules = m.rules[:0]
@@ -321,16 +343,6 @@ func (m *Manager) ApplyConfig(conf *config.Config) bool {
 	log.Errorf("Error loading rules, previous rule set restored: %s", err)
 	success = false
 	}
-	// Restore alerting rule state.
-	for _, r := range m.rules {
-		ar, ok := r.(*AlertingRule)
-		if !ok {
-			continue
-		}
-		if old, ok := alertingRules[ar.name]; ok {
-			ar.activeAlerts = old.activeAlerts
-		}
-	}
 
 	return success
 }
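The `defer m.transferAlertState()()` call added to `ApplyConfig` leans on how `defer` treats a function that returns a function: the outer call runs immediately, so the alerting state is copied while the old rule set is still loaded, and only the returned closure is deferred, so the restore runs once `ApplyConfig` returns with the new rules in place. A minimal sketch of that idiom, independent of the Prometheus types (the `snapshot` and `reload` names below are illustrative, not from the codebase):

```go
package main

import "fmt"

// snapshot copies the current state and returns a closure that puts it back,
// mirroring the shape of Manager.transferAlertState above.
func snapshot(state *map[string]int) func() {
	saved := map[string]int{}
	for k, v := range *state {
		saved[k] = v
	}
	return func() { *state = saved }
}

func reload(state *map[string]int) {
	// defer f()(): f() runs now (take the copy), the returned func runs on return (restore).
	defer snapshot(state)()

	// Simulate a config reload that wipes the state.
	*state = map[string]int{}
}

func main() {
	state := map[string]int{"HTTPRequestRateLow": 2}
	reload(&state)
	fmt.Println(state["HTTPRequestRateLow"]) // 2: restored by the deferred closure
}
```

In the manager itself the saved state is keyed by rule name, so whichever freshly loaded rules share a name with an old alerting rule pick up its `activeAlerts` map.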
Changes to the rules tests:

@@ -15,6 +15,7 @@ package rules
 
 import (
 	"fmt"
+	"reflect"
 	"strings"
 	"testing"
 	"time"
@@ -22,67 +23,112 @@ import (
 	clientmodel "github.com/prometheus/client_golang/model"
 
 	"github.com/prometheus/prometheus/promql"
-	"github.com/prometheus/prometheus/storage/local"
-	"github.com/prometheus/prometheus/storage/metric"
 )
 
-var (
-	testSampleInterval = time.Duration(5) * time.Minute
-	testStartTime      = clientmodel.Timestamp(0)
-)
-
-func getTestValueStream(startVal clientmodel.SampleValue, endVal clientmodel.SampleValue, stepVal clientmodel.SampleValue, startTime clientmodel.Timestamp) (resultValues metric.Values) {
-	currentTime := startTime
-	for currentVal := startVal; currentVal <= endVal; currentVal += stepVal {
-		sample := metric.SamplePair{
-			Value:     currentVal,
-			Timestamp: currentTime,
-		}
-		resultValues = append(resultValues, sample)
-		currentTime = currentTime.Add(testSampleInterval)
-	}
-	return resultValues
-}
-
-func getTestVectorFromTestMatrix(matrix promql.Matrix) promql.Vector {
-	vector := promql.Vector{}
-	for _, sampleStream := range matrix {
-		lastSample := sampleStream.Values[len(sampleStream.Values)-1]
-		vector = append(vector, &promql.Sample{
-			Metric:    sampleStream.Metric,
-			Value:     lastSample.Value,
-			Timestamp: lastSample.Timestamp,
-		})
-	}
-	return vector
-}
-
-func storeMatrix(storage local.Storage, matrix promql.Matrix) {
-	pendingSamples := clientmodel.Samples{}
-	for _, sampleStream := range matrix {
-		for _, sample := range sampleStream.Values {
-			pendingSamples = append(pendingSamples, &clientmodel.Sample{
-				Metric:    sampleStream.Metric.Metric,
-				Value:     sample.Value,
-				Timestamp: sample.Timestamp,
-			})
-		}
-	}
-	for _, s := range pendingSamples {
-		storage.Append(s)
-	}
-	storage.WaitForIndexing()
-}
-
-func vectorComparisonString(expected []string, actual []string) string {
-	separator := "\n--------------\n"
-	return fmt.Sprintf("Expected:%v%v%v\nActual:%v%v%v ",
-		separator,
-		strings.Join(expected, "\n"),
-		separator,
-		separator,
-		strings.Join(actual, "\n"),
-		separator)
-}
+func TestAlertingRule(t *testing.T) {
+	suite, err := promql.NewTest(t, `
+		load 5m
+			http_requests{job="api-server", instance="0", group="production"}	0+10x10
+			http_requests{job="api-server", instance="1", group="production"}	0+20x10
+			http_requests{job="api-server", instance="0", group="canary"}		0+30x10
+			http_requests{job="api-server", instance="1", group="canary"}		0+40x10
+			http_requests{job="app-server", instance="0", group="production"}	0+50x10
+			http_requests{job="app-server", instance="1", group="production"}	0+60x10
+			http_requests{job="app-server", instance="0", group="canary"}		0+70x10
+			http_requests{job="app-server", instance="1", group="canary"}		0+80x10
+	`)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer suite.Close()
+
+	if err := suite.Run(); err != nil {
+		t.Fatal(err)
+	}
+
+	expr, err := promql.ParseExpr(`http_requests{group="canary", job="app-server"} < 100`)
+	if err != nil {
+		t.Fatalf("Unable to parse alert expression: %s", err)
+	}
+
+	rule := NewAlertingRule(
+		"HTTPRequestRateLow",
+		expr,
+		time.Minute,
+		clientmodel.LabelSet{"severity": "critical"},
+		"summary", "description", "runbook",
+	)
+
+	var tests = []struct {
+		time   time.Duration
+		result []string
+	}{
+		{
+			time: 0,
+			result: []string{
+				`ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
+				`ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
+			},
+		}, {
+			time: 5 * time.Minute,
+			result: []string{
+				`ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
+				`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
+				`ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
+				`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
+			},
+		}, {
+			time: 10 * time.Minute,
+			result: []string{
+				`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
+				`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
+			},
+		},
+		{
+			time:   15 * time.Minute,
+			result: nil,
+		},
+		{
+			time:   20 * time.Minute,
+			result: nil,
+		},
+	}
+
+	for i, test := range tests {
+		evalTime := clientmodel.Timestamp(0).Add(test.time)
+
+		res, err := rule.eval(evalTime, suite.QueryEngine())
+		if err != nil {
+			t.Fatalf("Error during alerting rule evaluation: %s", err)
+		}
+
+		actual := strings.Split(res.String(), "\n")
+		expected := annotateWithTime(test.result, evalTime)
+		if actual[0] == "" {
+			actual = []string{}
+		}
+
+		if len(actual) != len(expected) {
+			t.Errorf("%d. Number of samples in expected and actual output don't match (%d vs. %d)", i, len(expected), len(actual))
+		}
+
+		for j, expectedSample := range expected {
+			found := false
+			for _, actualSample := range actual {
+				if actualSample == expectedSample {
+					found = true
+				}
+			}
+			if !found {
+				t.Errorf("%d.%d. Couldn't find expected sample in output: '%v'", i, j, expectedSample)
+			}
+		}
+
+		if t.Failed() {
+			t.Errorf("%d. Expected and actual outputs don't match:", i)
+			t.Fatalf("Expected:\n%v\n----\nActual:\n%v", strings.Join(expected, "\n"), strings.Join(actual, "\n"))
+		}
+	}
+}
 
 func annotateWithTime(lines []string, timestamp clientmodel.Timestamp) []string {
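The expected outputs in the test table follow directly from the data loaded into the promql test suite. Reading the series notation as a start value plus a fixed increment per 5m load step (my reading of `0+70x10` and `0+80x10`; the exact expansion is defined by the promql test language, not restated here), the two app-server canary series are below the `< 100` threshold at 0m and 5m and above it by 10m, which is why the alerts show up as pending, then firing, then drop to 0. A small sketch of that arithmetic, with an illustrative `valueAt` helper:

```go
package main

import (
	"fmt"
	"time"
)

// valueAt models the series notation "start+stepxN" as: start plus step per
// elapsed load interval (5m in the test above). This is an illustration of
// how the loaded values line up with the alert threshold, not promql API.
func valueAt(start, step float64, loadInterval, at time.Duration) float64 {
	return start + step*float64(at/loadInterval)
}

func main() {
	load := 5 * time.Minute
	series := []struct {
		labels      string
		start, step float64
	}{
		{`{job="app-server", instance="0", group="canary"}`, 0, 70}, // 0+70x10
		{`{job="app-server", instance="1", group="canary"}`, 0, 80}, // 0+80x10
	}
	for _, at := range []time.Duration{0, 5 * time.Minute, 10 * time.Minute} {
		for _, s := range series {
			v := valueAt(s.start, s.step, load, at)
			// The rule's condition is http_requests{...} < 100, with a 1m "for" clause.
			fmt.Printf("t=%-4v %s = %v (below threshold: %t)\n", at, s.labels, v, v < 100)
		}
	}
}
```

At 0m both series are at 0 (condition true, alerts pending), at 5m they are at 70 and 80 (still true, and the one-minute `for` duration has passed, so the alerts fire), and at 10m they are at 140 and 160, so the firing series go to 0 in the expected output.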
@@ -93,131 +139,45 @@ func annotateWithTime(lines []string, timestamp clientmodel.Timestamp) []string
 	return annotatedLines
 }
 
-var testMatrix = promql.Matrix{
-	{
-		Metric: clientmodel.COWMetric{
-			Metric: clientmodel.Metric{
-				clientmodel.MetricNameLabel: "http_requests",
-				clientmodel.JobLabel:        "api-server",
-				"instance":                  "0",
-				"group":                     "canary",
-			},
-		},
-		Values: getTestValueStream(0, 300, 30, testStartTime),
-	},
-	{
-		Metric: clientmodel.COWMetric{
-			Metric: clientmodel.Metric{
-				clientmodel.MetricNameLabel: "http_requests",
-				clientmodel.JobLabel:        "api-server",
-				"instance":                  "1",
-				"group":                     "canary",
-			},
-		},
-		Values: getTestValueStream(0, 400, 40, testStartTime),
-	},
-	{
-		Metric: clientmodel.COWMetric{
-			Metric: clientmodel.Metric{
-				clientmodel.MetricNameLabel: "http_requests",
-				clientmodel.JobLabel:        "app-server",
-				"instance":                  "0",
-				"group":                     "canary",
-			},
-		},
-		Values: getTestValueStream(0, 700, 70, testStartTime),
-	},
-	{
-		Metric: clientmodel.COWMetric{
-			Metric: clientmodel.Metric{
-				clientmodel.MetricNameLabel: "http_requests",
-				clientmodel.JobLabel:        "app-server",
-				"instance":                  "1",
-				"group":                     "canary",
-			},
-		},
-		Values: getTestValueStream(0, 800, 80, testStartTime),
-	},
-}
-
-func TestAlertingRule(t *testing.T) {
-	// Labels in expected output need to be alphabetically sorted.
-	var evalOutputs = [][]string{
-		{
-			`ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
-			`ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
-		},
-		{
-			`ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
-			`ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
-			`ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
-			`ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
-		},
-		{
-			`ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
-			`ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
-		},
-		{
-			/* empty */
-		},
-		{
-			/* empty */
-		},
-	}
-
-	storage, closer := local.NewTestStorage(t, 1)
-	defer closer.Close()
-
-	storeMatrix(storage, testMatrix)
-
-	engine := promql.NewEngine(storage, nil)
-	defer engine.Stop()
-
-	expr, err := promql.ParseExpr(`http_requests{group="canary", job="app-server"} < 100`)
-	if err != nil {
-		t.Fatalf("Unable to parse alert expression: %s", err)
-	}
-
-	alertLabels := clientmodel.LabelSet{
-		"severity": "critical",
-	}
-	rule := NewAlertingRule("HttpRequestRateLow", expr, time.Minute, alertLabels, "summary", "description", "runbook")
-
-	for i, expectedLines := range evalOutputs {
-		evalTime := testStartTime.Add(testSampleInterval * time.Duration(i))
-
-		res, err := rule.eval(evalTime, engine)
-		if err != nil {
-			t.Fatalf("Error during alerting rule evaluation: %s", err)
-		}
-
-		actualLines := strings.Split(res.String(), "\n")
-		expectedLines := annotateWithTime(expectedLines, evalTime)
-		if actualLines[0] == "" {
-			actualLines = []string{}
-		}
-
-		failed := false
-		if len(actualLines) != len(expectedLines) {
-			t.Errorf("%d. Number of samples in expected and actual output don't match (%d vs. %d)", i, len(expectedLines), len(actualLines))
-			failed = true
-		}
-
-		for j, expectedSample := range expectedLines {
-			found := false
-			for _, actualSample := range actualLines {
-				if actualSample == expectedSample {
-					found = true
-				}
-			}
-			if !found {
-				t.Errorf("%d.%d. Couldn't find expected sample in output: '%v'", i, j, expectedSample)
-				failed = true
-			}
-		}
-
-		if failed {
-			t.Fatalf("%d. Expected and actual outputs don't match:\n%v", i, vectorComparisonString(expectedLines, actualLines))
-		}
-	}
-}
+func TestTransferAlertState(t *testing.T) {
+	m := NewManager(&ManagerOptions{})
+
+	alert := &Alert{
+		Name:  "testalert",
+		State: StateFiring,
+	}
+
+	arule := AlertingRule{
+		name:         "test",
+		activeAlerts: map[clientmodel.Fingerprint]*Alert{},
+	}
+	aruleCopy := arule
+
+	m.rules = append(m.rules, &arule)
+
+	// Set an alert.
+	arule.activeAlerts[0] = alert
+
+	// Save state and get the restore function.
+	restore := m.transferAlertState()
+
+	// Remove arule from the rule list and add an unrelated rule and the
+	// stateless copy of arule.
+	m.rules = []Rule{
+		&AlertingRule{
+			name:         "test_other",
+			activeAlerts: map[clientmodel.Fingerprint]*Alert{},
+		},
+		&aruleCopy,
+	}
+
+	// Apply the restore function.
+	restore()
+
+	if ar := m.rules[0].(*AlertingRule); len(ar.activeAlerts) != 0 {
+		t.Fatalf("unexpected alert for unrelated alerting rule")
+	}
+	if ar := m.rules[1].(*AlertingRule); !reflect.DeepEqual(ar.activeAlerts[0], alert) {
+		t.Fatalf("alert state was not restored")
+	}
+}
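What `TestTransferAlertState` pins down is that active alerts are carried across a rule-set swap by rule name rather than by object identity: the rules placed in `m.rules` after the swap are not the ones `transferAlertState` saw when it took its snapshot, yet after `restore()` the rule named `test` carries the alert and the unrelated `test_other` rule stays empty. A stripped-down sketch of that name-keyed hand-over, using illustrative `rule` and `swapRules` names rather than the real `Manager` API:

```go
package main

import "fmt"

type rule struct {
	name   string
	active map[int]string // stands in for activeAlerts
}

// swapRules replaces the rule set but lets same-named rules inherit the old
// state, in the spirit of Manager.transferAlertState in the diff above.
func swapRules(old, next []*rule) []*rule {
	saved := map[string]map[int]string{}
	for _, r := range old {
		saved[r.name] = r.active
	}
	for _, r := range next {
		if s, ok := saved[r.name]; ok {
			r.active = s
		}
	}
	return next
}

func main() {
	rules := []*rule{{name: "HTTPRequestRateLow", active: map[int]string{0: "firing"}}}

	// Reload with brand-new rule objects; only the name matches.
	rules = swapRules(rules, []*rule{
		{name: "SomethingElse", active: map[int]string{}},
		{name: "HTTPRequestRateLow", active: map[int]string{}},
	})

	fmt.Println(rules[1].active[0]) // "firing": the state followed the name across the swap
}
```

Keying the saved state by name is what lets a configuration reload replace every rule object without dropping alerts that are already pending or firing.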