Merge pull request #4318 from mxinden/expose-alerts-and-rules

api/v1: Expose rules and alerts
This commit is contained in:
Max Inden 2018-07-31 13:50:55 +02:00 committed by GitHub
commit 41b0580e7e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 438 additions and 22 deletions

View file

@ -363,6 +363,103 @@ $ curl http://localhost:9090/api/v1/targets
}
```
## Rules
The `/rules` API endpoint returns a list of alerting and recording rules that
are currently loaded. In addition, it returns the currently active alerts fired
by the Prometheus instance for each alerting rule.
As the `/rules` endpoint is fairly new, it does not have the same stability
guarantees as the overarching API v1.
```
GET /api/v1/rules
```
```json
$ curl http://localhost:9090/api/v1/rules
{
"data": {
"groups": [
{
"rules": [
{
"alerts": [
{
"activeAt": "2018-07-04T20:27:12.60602144+02:00",
"annotations": {
"summary": "High request latency"
},
"labels": {
"alertname": "HighRequestLatency",
"severity": "page"
},
"state": "firing",
"value": 1
}
],
"annotations": {
"summary": "High request latency"
},
"duration": 600,
"labels": {
"severity": "page"
},
"name": "HighRequestLatency",
"query": "job:request_latency_seconds:mean5m{job=\"myjob\"} > 0.5",
"type": "alerting"
},
{
"name": "job:http_inprogress_requests:sum",
"query": "sum(http_inprogress_requests) by (job)",
"type": "recording"
}
],
"file": "/rules.yaml",
"interval": 60,
"name": "example"
}
]
},
"status": "success"
}
```
## Alerts
The `/alerts` endpoint returns a list of all active alerts.
As the `/alerts` endpoint is fairly new, it does not have the same stability
guarantees as the overarching API v1.
```
GET /api/v1/alerts
```
```json
$ curl http://localhost:9090/api/v1/alerts
{
"data": {
"alerts": [
{
"activeAt": "2018-07-04T20:27:12.60602144+02:00",
"annotations": {},
"labels": {
"alertname": "my-alert"
},
"state": "firing",
"value": 1
}
]
},
"status": "success"
}
```
## Querying target metadata
The following endpoint returns metadata about metrics currently scraped by targets.

View file

@ -126,11 +126,31 @@ func NewAlertingRule(name string, vec promql.Expr, hold time.Duration, lbls, ann
}
}
// Name returns the name of the alert.
// Name returns the name of the alerting rule.
// The name is one of the two properties (with the labels) compared by equal.
func (r *AlertingRule) Name() string {
return r.name
}
// Query returns the query expression of the alerting rule, i.e. the
// promql.Expr held in the rule's vector field.
func (r *AlertingRule) Query() promql.Expr {
return r.vector
}
// Duration returns the hold duration of the alerting rule (the rule's
// holdDuration field) as a time.Duration.
func (r *AlertingRule) Duration() time.Duration {
return r.holdDuration
}
// Labels returns the labels of the alerting rule.
// The rule's labels field is returned as-is; no defensive copy is made here.
func (r *AlertingRule) Labels() labels.Labels {
return r.labels
}
// Annotations returns the annotations of the alerting rule.
// The rule's annotations field is returned as-is; no defensive copy is made here.
func (r *AlertingRule) Annotations() labels.Labels {
return r.annotations
}
// equal reports whether r and o share the same name and have labels that
// labels.Equal considers equal. No other property of the rules (query, hold
// duration, annotations) takes part in the comparison.
func (r *AlertingRule) equal(o *AlertingRule) bool {
	if r.name != o.name {
		return false
	}
	return labels.Equal(r.labels, o.labels)
}

View file

@ -188,6 +188,9 @@ func (g *Group) File() string { return g.file }
// Rules returns the group's rules.
// The group's own rules slice is returned directly, not a copy.
func (g *Group) Rules() []Rule { return g.rules }
// Interval returns the group's interval as a time.Duration.
func (g *Group) Interval() time.Duration { return g.interval }
func (g *Group) run(ctx context.Context) {
defer close(g.terminated)

View file

@ -52,6 +52,16 @@ func (rule *RecordingRule) Name() string {
return rule.name
}
// Query returns the rule query expression, i.e. the promql.Expr held in the
// rule's vector field.
func (rule *RecordingRule) Query() promql.Expr {
return rule.vector
}
// Labels returns the rule labels.
// The rule's labels field is returned as-is; no defensive copy is made here.
func (rule *RecordingRule) Labels() labels.Labels {
return rule.labels
}
// Eval evaluates the rule and then overrides the metric names and labels accordingly.
func (rule *RecordingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, _ *url.URL) (promql.Vector, error) {
vector, err := query(ctx, rule.vector.String(), ts)

View file

@ -41,6 +41,7 @@ import (
"github.com/prometheus/prometheus/pkg/timestamp"
"github.com/prometheus/prometheus/prompb"
"github.com/prometheus/prometheus/promql"
"github.com/prometheus/prometheus/rules"
"github.com/prometheus/prometheus/scrape"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/storage/remote"
@ -95,6 +96,11 @@ type alertmanagerRetriever interface {
DroppedAlertmanagers() []*url.URL
}
// rulesRetriever provides the rule groups and alerting rules that the
// /rules and /alerts handlers expose.
type rulesRetriever interface {
// RuleGroups returns the rule groups to list under /rules.
RuleGroups() []*rules.Group
// AlertingRules returns the alerting rules whose active alerts are
// listed under /alerts.
AlertingRules() []*rules.AlertingRule
}
type response struct {
Status status `json:"status"`
Data interface{} `json:"data,omitempty"`
@ -119,11 +125,11 @@ type API struct {
targetRetriever targetRetriever
alertmanagerRetriever alertmanagerRetriever
now func() time.Time
config func() config.Config
flagsMap map[string]string
ready func(http.HandlerFunc) http.HandlerFunc
rulesRetriever rulesRetriever
now func() time.Time
config func() config.Config
flagsMap map[string]string
ready func(http.HandlerFunc) http.HandlerFunc
db func() *tsdb.DB
enableAdmin bool
@ -142,18 +148,20 @@ func NewAPI(
db func() *tsdb.DB,
enableAdmin bool,
logger log.Logger,
rr rulesRetriever,
) *API {
return &API{
QueryEngine: qe,
Queryable: q,
targetRetriever: tr,
alertmanagerRetriever: ar,
now: time.Now,
config: configFunc,
flagsMap: flagsMap,
ready: readyFunc,
db: db,
enableAdmin: enableAdmin,
now: time.Now,
config: configFunc,
flagsMap: flagsMap,
ready: readyFunc,
db: db,
enableAdmin: enableAdmin,
rulesRetriever: rr,
}
}
@ -199,6 +207,9 @@ func (api *API) Register(r *route.Router) {
r.Get("/status/flags", wrap(api.serveFlags))
r.Post("/read", api.ready(http.HandlerFunc(api.remoteRead)))
r.Get("/alerts", wrap(api.alerts))
r.Get("/rules", wrap(api.rules))
// Admin APIs
r.Post("/admin/tsdb/delete_series", wrap(api.deleteSeries))
r.Post("/admin/tsdb/clean_tombstones", wrap(api.cleanTombstones))
@ -578,6 +589,132 @@ func (api *API) alertmanagers(r *http.Request) (interface{}, *apiError, func())
return ams, nil, nil
}
// AlertDiscovery has info for all active alerts.
// It is the response payload of the /alerts handler.
type AlertDiscovery struct {
// Alerts is encoded under the "alerts" key.
Alerts []*Alert `json:"alerts"`
}
// Alert has info for an alert.
// It is the JSON representation built from a rules.Alert by
// rulesAlertsToAPIAlerts.
type Alert struct {
Labels labels.Labels `json:"labels"`
Annotations labels.Labels `json:"annotations"`
// State is the string form of the rule alert's state.
State string `json:"state"`
// ActiveAt is a pointer so that omitempty can leave the field out when it is nil.
ActiveAt *time.Time `json:"activeAt,omitempty"`
Value float64 `json:"value"`
}
// alerts is the handler behind GET /alerts. It gathers the active alerts of
// every alerting rule known to the rules retriever and returns them wrapped
// in an AlertDiscovery. It never returns an error.
func (api *API) alerts(r *http.Request) (interface{}, *apiError, func()) {
	// Start from a non-nil slice so that "no alerts" is reported as an empty
	// list rather than a nil one.
	active := []*Alert{}
	for _, ar := range api.rulesRetriever.AlertingRules() {
		active = append(active, rulesAlertsToAPIAlerts(ar.ActiveAlerts())...)
	}
	return &AlertDiscovery{Alerts: active}, nil, nil
}
// rulesAlertsToAPIAlerts converts alerts from the rules package into their
// API representation, preserving order. The result is always non-nil, also
// for empty input. ActiveAt points at the ActiveAt field of the source alert
// rather than at a copy of it.
func rulesAlertsToAPIAlerts(rulesAlerts []*rules.Alert) []*Alert {
	converted := make([]*Alert, 0, len(rulesAlerts))
	for _, src := range rulesAlerts {
		converted = append(converted, &Alert{
			Labels:      src.Labels,
			Annotations: src.Annotations,
			State:       src.State.String(),
			ActiveAt:    &src.ActiveAt,
			Value:       src.Value,
		})
	}
	return converted
}
// RuleDiscovery has info for all rules.
// It is the response payload of the /rules handler.
type RuleDiscovery struct {
// RuleGroups is encoded under the "groups" key.
RuleGroups []*RuleGroup `json:"groups"`
}
// RuleGroup has info for rules which are part of a group.
type RuleGroup struct {
Name string `json:"name"`
File string `json:"file"`
// In order to preserve rule ordering, while exposing type (alerting or recording)
// specific properties, both alerting and recording rules are exposed in the
// same array.
Rules []rule `json:"rules"`
// Interval is the group's interval expressed in seconds.
Interval float64 `json:"interval"`
}
// rule is an empty interface; the /rules handler stores either an
// alertingRule or a recordingRule value in it.
type rule interface{}
// alertingRule is the JSON representation of an alerting rule in the /rules
// response.
type alertingRule struct {
Name string `json:"name"`
// Query is the string form of the rule's query expression.
Query string `json:"query"`
// Duration is the rule's hold duration expressed in seconds.
Duration float64 `json:"duration"`
Labels labels.Labels `json:"labels"`
Annotations labels.Labels `json:"annotations"`
// Alerts holds the rule's currently active alerts.
Alerts []*Alert `json:"alerts"`
// Type of an alertingRule is always "alerting".
Type string `json:"type"`
}
// recordingRule is the JSON representation of a recording rule in the /rules
// response.
type recordingRule struct {
Name string `json:"name"`
// Query is the string form of the rule's query expression.
Query string `json:"query"`
// Labels is tagged omitempty, unlike the labels of an alertingRule.
Labels labels.Labels `json:"labels,omitempty"`
// Type of a recordingRule is always "recording".
Type string `json:"type"`
}
// rules is the handler behind GET /rules. For every rule group known to the
// rules retriever it emits a RuleGroup whose Rules list keeps the group's
// rule order and mixes alertingRule and recordingRule values. Durations are
// reported in seconds.
//
// It returns an internal error, and no data, if a group contains a rule that
// is neither a *rules.AlertingRule nor a *rules.RecordingRule.
func (api *API) rules(r *http.Request) (interface{}, *apiError, func()) {
	groups := api.rulesRetriever.RuleGroups()
	discovery := &RuleDiscovery{RuleGroups: make([]*RuleGroup, 0, len(groups))}

	for _, grp := range groups {
		apiGroup := &RuleGroup{
			Name:     grp.Name(),
			File:     grp.File(),
			Interval: grp.Interval().Seconds(),
			// Non-nil so that a group without rules is reported with an
			// empty list rather than a nil one.
			Rules: []rule{},
		}

		for _, groupRule := range grp.Rules() {
			var converted rule

			switch typed := groupRule.(type) {
			case *rules.AlertingRule:
				converted = alertingRule{
					Name:        typed.Name(),
					Query:       typed.Query().String(),
					Duration:    typed.Duration().Seconds(),
					Labels:      typed.Labels(),
					Annotations: typed.Annotations(),
					Alerts:      rulesAlertsToAPIAlerts(typed.ActiveAlerts()),
					Type:        "alerting",
				}
			case *rules.RecordingRule:
				converted = recordingRule{
					Name:   typed.Name(),
					Query:  typed.Query().String(),
					Labels: typed.Labels(),
					Type:   "recording",
				}
			default:
				// In the default branch typed still has the static type of
				// groupRule, so only the generic rule interface is available.
				err := fmt.Errorf("failed to assert type of rule '%v'", typed.Name())
				return nil, &apiError{errorInternal, err}, nil
			}

			apiGroup.Rules = append(apiGroup.Rules, converted)
		}

		discovery.RuleGroups = append(discovery.RuleGroups, apiGroup)
	}

	return discovery, nil, nil
}
// prometheusConfig carries a configuration as a single YAML string, encoded
// under the "yaml" JSON key.
// NOTE(review): presumably the currently loaded Prometheus configuration;
// the code that fills it is outside this view — confirm.
type prometheusConfig struct {
YAML string `json:"yaml"`
}

View file

@ -19,6 +19,7 @@ import (
"encoding/json"
"errors"
"fmt"
"github.com/go-kit/kit/log"
"io/ioutil"
"math"
"net/http"
@ -41,9 +42,11 @@ import (
"github.com/prometheus/prometheus/pkg/timestamp"
"github.com/prometheus/prometheus/prompb"
"github.com/prometheus/prometheus/promql"
"github.com/prometheus/prometheus/rules"
"github.com/prometheus/prometheus/scrape"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/storage/remote"
"github.com/prometheus/prometheus/util/testutil"
)
type testTargetRetriever struct{}
@ -98,6 +101,73 @@ func (t testAlertmanagerRetriever) DroppedAlertmanagers() []*url.URL {
}
}
// rulesRetrieverMock is a test double that provides a fixed set of alerting
// rules and one rule group through its AlertingRules and RuleGroups methods.
type rulesRetrieverMock struct {
// testing is used to abort the test when a fixture cannot be built.
testing *testing.T
}
// AlertingRules returns two freshly built alerting rules, "test_metric3" and
// "test_metric4", each with a one second hold duration, empty labels and
// annotations, and a no-op logger. The test is aborted via m.testing if
// either expression fails to parse.
func (m rulesRetrieverMock) AlertingRules() []*rules.AlertingRule {
	absentExpr, err := promql.ParseExpr(`absent(test_metric3) != 1`)
	if err != nil {
		m.testing.Fatalf("unable to parse alert expression: %s", err)
	}

	upExpr, err := promql.ParseExpr(`up == 1`)
	if err != nil {
		m.testing.Fatalf("Unable to parse alert expression: %s", err)
	}

	return []*rules.AlertingRule{
		rules.NewAlertingRule(
			"test_metric3",
			absentExpr,
			time.Second,
			labels.Labels{},
			labels.Labels{},
			log.NewNopLogger(),
		),
		rules.NewAlertingRule(
			"test_metric4",
			upExpr,
			time.Second,
			labels.Labels{},
			labels.Labels{},
			log.NewNopLogger(),
		),
	}
}
// RuleGroups returns a single group named "grp" (file "/path/to/file",
// one second interval) that contains the rules from AlertingRules followed by
// one recording rule, "recording-rule-1". The test is aborted via m.testing
// if a fixture cannot be built.
func (m rulesRetrieverMock) RuleGroups() []*rules.Group {
	// Call AlertingRules on m itself: a zero-value mock has a nil testing
	// field, so a parse failure there would panic with a nil dereference
	// instead of failing the test.
	alertingRules := m.AlertingRules()

	// Named testStorage to avoid shadowing the imported storage package.
	testStorage := testutil.NewStorage(m.testing)
	// NOTE(review): the storage is closed when this method returns, i.e.
	// before the group is handed to the caller. This is fine as long as the
	// returned group is never evaluated — confirm.
	defer testStorage.Close()

	engine := promql.NewEngine(nil, nil, 10, 10*time.Second)
	opts := &rules.ManagerOptions{
		QueryFunc:  rules.EngineQueryFunc(engine, testStorage),
		Appendable: testStorage,
		Context:    context.Background(),
		Logger:     log.NewNopLogger(),
	}

	// One extra slot for the recording rule appended below.
	groupRules := make([]rules.Rule, 0, len(alertingRules)+1)
	for _, alertingRule := range alertingRules {
		groupRules = append(groupRules, alertingRule)
	}

	recordingExpr, err := promql.ParseExpr(`vector(1)`)
	if err != nil {
		m.testing.Fatalf("unable to parse recording expression: %s", err)
	}
	groupRules = append(groupRules, rules.NewRecordingRule("recording-rule-1", recordingExpr, labels.Labels{}))

	group := rules.NewGroup("grp", "/path/to/file", time.Second, groupRules, opts)
	return []*rules.Group{group}
}
var samplePrometheusCfg = config.Config{
GlobalConfig: config.GlobalConfig{},
AlertingConfig: config.AlertingConfig{},
@ -130,16 +200,29 @@ func TestEndpoints(t *testing.T) {
now := time.Now()
var algr rulesRetrieverMock
algr.testing = t
algr.AlertingRules()
algr.RuleGroups()
t.Run("local", func(t *testing.T) {
var algr rulesRetrieverMock
algr.testing = t
algr.AlertingRules()
algr.RuleGroups()
api := &API{
Queryable: suite.Storage(),
QueryEngine: suite.QueryEngine(),
targetRetriever: testTargetRetriever{},
alertmanagerRetriever: testAlertmanagerRetriever{},
now: func() time.Time { return now },
config: func() config.Config { return samplePrometheusCfg },
flagsMap: sampleFlagMap,
ready: func(f http.HandlerFunc) http.HandlerFunc { return f },
now: func() time.Time { return now },
config: func() config.Config { return samplePrometheusCfg },
flagsMap: sampleFlagMap,
ready: func(f http.HandlerFunc) http.HandlerFunc { return f },
rulesRetriever: algr,
}
testEndpoints(t, api, true)
@ -176,15 +259,23 @@ func TestEndpoints(t *testing.T) {
t.Fatal(err)
}
var algr rulesRetrieverMock
algr.testing = t
algr.AlertingRules()
algr.RuleGroups()
api := &API{
Queryable: remote,
QueryEngine: suite.QueryEngine(),
targetRetriever: testTargetRetriever{},
alertmanagerRetriever: testAlertmanagerRetriever{},
now: func() time.Time { return now },
config: func() config.Config { return samplePrometheusCfg },
flagsMap: sampleFlagMap,
ready: func(f http.HandlerFunc) http.HandlerFunc { return f },
now: func() time.Time { return now },
config: func() config.Config { return samplePrometheusCfg },
flagsMap: sampleFlagMap,
ready: func(f http.HandlerFunc) http.HandlerFunc { return f },
rulesRetriever: algr,
}
testEndpoints(t, api, false)
@ -237,7 +328,6 @@ func setupRemote(s storage.Storage) *httptest.Server {
}
func testEndpoints(t *testing.T, api *API, testLabelAPI bool) {
start := time.Unix(0, 0)
type test struct {
@ -567,6 +657,50 @@ func testEndpoints(t *testing.T, api *API, testLabelAPI bool) {
endpoint: api.serveFlags,
response: sampleFlagMap,
},
{
endpoint: api.alerts,
response: &AlertDiscovery{
Alerts: []*Alert{},
},
},
{
endpoint: api.rules,
response: &RuleDiscovery{
RuleGroups: []*RuleGroup{
{
Name: "grp",
File: "/path/to/file",
Interval: 1,
Rules: []rule{
alertingRule{
Name: "test_metric3",
Query: "absent(test_metric3) != 1",
Duration: 1,
Labels: labels.Labels{},
Annotations: labels.Labels{},
Alerts: []*Alert{},
Type: "alerting",
},
alertingRule{
Name: "test_metric4",
Query: "up == 1",
Duration: 1,
Labels: labels.Labels{},
Annotations: labels.Labels{},
Alerts: []*Alert{},
Type: "alerting",
},
recordingRule{
Name: "recording-rule-1",
Query: "vector(1)",
Labels: labels.Labels{},
Type: "recording",
},
},
},
},
},
},
}
if testLabelAPI {
@ -646,7 +780,21 @@ func testEndpoints(t *testing.T, api *API, testLabelAPI bool) {
t.Fatalf("Expected error of type %q but got none", test.errType)
}
if !reflect.DeepEqual(resp, test.response) {
t.Fatalf("Response does not match, expected:\n%+v\ngot:\n%+v", test.response, resp)
respJSON, err := json.Marshal(resp)
if err != nil {
t.Fatalf("failed to marshal response as JSON: %v", err.Error())
}
expectedRespJSON, err := json.Marshal(test.response)
if err != nil {
t.Fatalf("failed to marshal expected response as JSON: %v", err.Error())
}
t.Fatalf(
"Response does not match, expected:\n%+v\ngot:\n%+v",
string(expectedRespJSON),
string(respJSON),
)
}
}
}

View file

@ -228,6 +228,7 @@ func New(logger log.Logger, o *Options) *Handler {
h.options.TSDB,
h.options.EnableAdminAPI,
logger,
h.ruleManager,
)
if o.RoutePrefix != "/" {