mirror of
https://github.com/prometheus/prometheus.git
synced 2024-12-27 22:49:40 -08:00
1909686789
- Always spell out the time unit (e.g. milliseconds instead of ms).
- Remove "_total" from the names of metrics that are not counters.
- Make use of the "Namespace" and "Subsystem" fields in the options.
- Remove the "capacity" facet from all metrics about channels/queues. These are all fixed via command line flags and will never change during the runtime of a process. Also, they should not be part of the same metric family. I have added separate metrics for the capacity of queues as a convenience. (They will never change and are only set once.)
- I left "metric_disk_latency_microseconds" unchanged, although that metric measures the latency of the storage device, even if it is not a spinning disk. "SSD" is read by many as "solid state disk", so it's not too far off. (It should be "solid state drive", of course, but "metric_drive_latency_microseconds" is probably confusing.)
- Brian suggested not mixing "failure" and "success" outcomes in the same metric family (distinguished by labels). For now, I left it as it is. We are touching on a bigger issue here, especially as other parts of the Prometheus ecosystem follow the same principle. We still need to come to an agreement here and then change things consistently everywhere.

Change-Id: If799458b450d18f78500f05990301c12525197d3
204 lines
5.8 KiB
Go
204 lines
5.8 KiB
Go
// Copyright 2013 Prometheus Team
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package notification
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"flag"
|
|
"io"
|
|
"io/ioutil"
|
|
"net/http"
|
|
"time"
|
|
|
|
"github.com/golang/glog"
|
|
|
|
clientmodel "github.com/prometheus/client_golang/model"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
|
|
"github.com/prometheus/prometheus/utility"
|
|
)
|
|
|
|
// alertmanagerApiEventsPath is the path appended to the alert manager URL when
// posting notifications.
const alertmanagerApiEventsPath = "/api/alerts"

// contentTypeJson is the content type sent with notification POST requests.
const contentTypeJson = "application/json"
|
|
|
|
// Components of the fully-qualified metric names used for instrumentation.
const (
	namespace = "prometheus"
	subsystem = "notifications"
)

// Label name and label values classifying the outcome of a notification.
const (
	// result is the label name.
	result = "result"

	// Values for the result label.
	success = "success"
	failure = "failure"
	dropped = "dropped"
)
|
|
|
|
// deadline holds the value of the "alertmanager.httpDeadline" command-line
// flag, which defaults to 10 seconds.
var deadline = flag.Duration("alertmanager.httpDeadline", 10*time.Second, "Alert manager HTTP API timeout.")
|
|
|
|
// A request for sending a notification to the alert manager for a single alert
|
|
// vector element.
|
|
type NotificationReq struct {
|
|
// Short-form alert summary. May contain text/template-style interpolations.
|
|
Summary string
|
|
// Longer alert description. May contain text/template-style interpolations.
|
|
Description string
|
|
// Labels associated with this alert notification, including alert name.
|
|
Labels clientmodel.LabelSet
|
|
// Current value of alert
|
|
Value clientmodel.SampleValue
|
|
// Since when this alert has been active (pending or firing).
|
|
ActiveSince time.Time
|
|
// A textual representation of the rule that triggered the alert.
|
|
RuleString string
|
|
// Prometheus console link to alert expression.
|
|
GeneratorUrl string
|
|
}
|
|
|
|
// NotificationReqs is a list of pointers to NotificationReq values.
type NotificationReqs []*NotificationReq
|
|
|
|
// httpPoster is the minimal HTTP client behavior used to deliver notifications:
// the ability to issue a POST request. *http.Client satisfies it.
type httpPoster interface {
	// Post sends body to url using bodyType as the content type.
	Post(url, bodyType string, body io.Reader) (*http.Response, error)
}
|
|
|
|
// NotificationHandler is responsible for dispatching alert notifications to an
|
|
// alert manager service.
|
|
type NotificationHandler struct {
|
|
// The URL of the alert manager to send notifications to.
|
|
alertmanagerUrl string
|
|
// Buffer of notifications that have not yet been sent.
|
|
pendingNotifications <-chan NotificationReqs
|
|
// HTTP client with custom timeout settings.
|
|
httpClient httpPoster
|
|
|
|
notificationLatency *prometheus.SummaryVec
|
|
notificationsQueueLength prometheus.Gauge
|
|
notificationsQueueCapacity prometheus.Metric
|
|
}
|
|
|
|
// Construct a new NotificationHandler.
|
|
func NewNotificationHandler(alertmanagerUrl string, notificationReqs <-chan NotificationReqs) *NotificationHandler {
|
|
return &NotificationHandler{
|
|
alertmanagerUrl: alertmanagerUrl,
|
|
pendingNotifications: notificationReqs,
|
|
httpClient: utility.NewDeadlineClient(*deadline),
|
|
|
|
notificationLatency: prometheus.NewSummaryVec(
|
|
prometheus.SummaryOpts{
|
|
Namespace: namespace,
|
|
Subsystem: subsystem,
|
|
Name: "latency_milliseconds",
|
|
Help: "Latency quantiles for sending alert notifications.",
|
|
},
|
|
[]string{result},
|
|
),
|
|
notificationsQueueLength: prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Namespace: namespace,
|
|
Subsystem: subsystem,
|
|
Name: "queue_length",
|
|
Help: "The number of alert notifications in the queue.",
|
|
}),
|
|
notificationsQueueCapacity: prometheus.MustNewConstMetric(
|
|
prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
|
|
"The capacity of the alert notifications queue.",
|
|
nil, nil,
|
|
),
|
|
prometheus.GaugeValue,
|
|
float64(cap(notificationReqs)),
|
|
),
|
|
}
|
|
}
|
|
|
|
// Send a list of notifications to the configured alert manager.
|
|
func (n *NotificationHandler) sendNotifications(reqs NotificationReqs) error {
|
|
alerts := make([]map[string]interface{}, 0, len(reqs))
|
|
for _, req := range reqs {
|
|
alerts = append(alerts, map[string]interface{}{
|
|
"Summary": req.Summary,
|
|
"Description": req.Description,
|
|
"Labels": req.Labels,
|
|
"Payload": map[string]interface{}{
|
|
"Value": req.Value,
|
|
"ActiveSince": req.ActiveSince,
|
|
"GeneratorUrl": req.GeneratorUrl,
|
|
"AlertingRule": req.RuleString,
|
|
},
|
|
})
|
|
}
|
|
buf, err := json.Marshal(alerts)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
glog.V(1).Infoln("Sending notifications to alertmanager:", string(buf))
|
|
resp, err := n.httpClient.Post(
|
|
n.alertmanagerUrl+alertmanagerApiEventsPath,
|
|
contentTypeJson,
|
|
bytes.NewBuffer(buf),
|
|
)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
_, err = ioutil.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// BUG: Do we need to check the response code?
|
|
return nil
|
|
}
|
|
|
|
// Continuously dispatch notifications.
|
|
func (n *NotificationHandler) Run() {
|
|
for reqs := range n.pendingNotifications {
|
|
if n.alertmanagerUrl == "" {
|
|
glog.Warning("No alert manager configured, not dispatching notification")
|
|
n.notificationLatency.WithLabelValues(dropped).Observe(0)
|
|
continue
|
|
}
|
|
|
|
begin := time.Now()
|
|
err := n.sendNotifications(reqs)
|
|
labelValue := success
|
|
|
|
if err != nil {
|
|
glog.Error("Error sending notification: ", err)
|
|
labelValue = failure
|
|
}
|
|
|
|
n.notificationLatency.WithLabelValues(labelValue).Observe(
|
|
float64(time.Since(begin) / time.Millisecond),
|
|
)
|
|
}
|
|
}
|
|
|
|
// Describe implements prometheus.Collector.
|
|
func (n *NotificationHandler) Describe(ch chan<- *prometheus.Desc) {
|
|
n.notificationLatency.Describe(ch)
|
|
ch <- n.notificationsQueueLength.Desc()
|
|
ch <- n.notificationsQueueCapacity.Desc()
|
|
}
|
|
|
|
// Collect implements prometheus.Collector.
|
|
func (n *NotificationHandler) Collect(ch chan<- prometheus.Metric) {
|
|
n.notificationLatency.Collect(ch)
|
|
n.notificationsQueueLength.Set(float64(len(n.pendingNotifications)))
|
|
ch <- n.notificationsQueueLength
|
|
ch <- n.notificationsQueueCapacity
|
|
}
|