Better Metrics For Alerts

* Closes prometheus/prometheus#2429
* Moved metrics to top of file for easier access
* Initialised CounterVecs
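Pre-creating the per-Alertmanager label values is what the "Initialised CounterVecs" bullet refers to: a CounterVec exports a child series only after that series has been accessed, so touching errors_total and sent_total with WithLabelValues makes them appear at 0 for every configured Alertmanager rather than only after the first error or send. Below is a minimal sketch of that pattern, assuming the standard client_golang API; the "example" namespace, subsystem and URLs are placeholders, not the notifier's real values.

// Minimal sketch (not the notifier code itself): pre-initialising a CounterVec.
package main

import "github.com/prometheus/client_golang/prometheus"

var sent = prometheus.NewCounterVec(
    prometheus.CounterOpts{
        Namespace: "example",
        Subsystem: "notifications",
        Name:      "sent_total",
        Help:      "Total number of alerts sent.",
    },
    []string{"alertmanager"},
)

func main() {
    prometheus.MustRegister(sent)

    // WithLabelValues creates the child series if it does not exist yet,
    // so each configured Alertmanager shows up on /metrics at 0 before
    // any alert has been sent to it.
    for _, url := range []string{"http://am-0:9093", "http://am-1:9093"} {
        sent.WithLabelValues(url)
    }
}

The diff below applies the same idea in ApplyConfig and alertmanagerSet.Sync for every discovered Alertmanager URL.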
Author: Goutham Veeramachaneni
Date:   2017-03-02 23:58:15 +05:30
Parent: 7e14533b32
Commit: 41da5c4ef2

@@ -50,6 +50,49 @@ const (
     alertmanagerLabel = "alertmanager"
 )
 
+var (
+    alertLatency = prometheus.NewSummaryVec(prometheus.SummaryOpts{
+        Namespace: namespace,
+        Subsystem: subsystem,
+        Name:      "latency_seconds",
+        Help:      "Latency quantiles for sending alert notifications (not including dropped notifications).",
+    },
+        []string{alertmanagerLabel},
+    )
+
+    alertErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
+        Namespace: namespace,
+        Subsystem: subsystem,
+        Name:      "errors_total",
+        Help:      "Total number of errors sending alert notifications.",
+    },
+        []string{alertmanagerLabel},
+    )
+
+    alertSent = prometheus.NewCounterVec(prometheus.CounterOpts{
+        Namespace: namespace,
+        Subsystem: subsystem,
+        Name:      "sent_total",
+        Help:      "Total number of alerts sent.",
+    },
+        []string{alertmanagerLabel},
+    )
+
+    alertDropped = prometheus.NewCounter(prometheus.CounterOpts{
+        Namespace: namespace,
+        Subsystem: subsystem,
+        Name:      "dropped_total",
+        Help:      "Total number of alerts dropped due to errors when sending to Alertmanager.",
+    })
+
+    alertQueueLength = prometheus.NewGauge(prometheus.GaugeOpts{
+        Namespace: namespace,
+        Subsystem: subsystem,
+        Name:      "queue_length",
+        Help:      "The number of alert notifications in the queue.",
+    })
+)
+
 // Notifier is responsible for dispatching alert notifications to an
 // alert manager service.
 type Notifier struct {
@@ -61,11 +104,6 @@ type Notifier struct {
     ctx    context.Context
     cancel func()
 
-    latency       *prometheus.SummaryVec
-    errors        *prometheus.CounterVec
-    sent          *prometheus.CounterVec
-    dropped       prometheus.Counter
-    queueLength   prometheus.Gauge
     queueCapacity prometheus.Metric
 
     alertmanagers []*alertmanagerSet
@@ -96,42 +134,6 @@ func New(o *Options) *Notifier {
         more: make(chan struct{}, 1),
         opts: o,
 
-        latency: prometheus.NewSummaryVec(prometheus.SummaryOpts{
-            Namespace: namespace,
-            Subsystem: subsystem,
-            Name:      "latency_seconds",
-            Help:      "Latency quantiles for sending alert notifications (not including dropped notifications).",
-        },
-            []string{alertmanagerLabel},
-        ),
-        errors: prometheus.NewCounterVec(prometheus.CounterOpts{
-            Namespace: namespace,
-            Subsystem: subsystem,
-            Name:      "errors_total",
-            Help:      "Total number of errors sending alert notifications.",
-        },
-            []string{alertmanagerLabel},
-        ),
-        sent: prometheus.NewCounterVec(prometheus.CounterOpts{
-            Namespace: namespace,
-            Subsystem: subsystem,
-            Name:      "sent_total",
-            Help:      "Total number of alerts successfully sent.",
-        },
-            []string{alertmanagerLabel},
-        ),
-        dropped: prometheus.NewCounter(prometheus.CounterOpts{
-            Namespace: namespace,
-            Subsystem: subsystem,
-            Name:      "dropped_total",
-            Help:      "Total number of alerts dropped due to errors when sending to Alertmanager.",
-        }),
-        queueLength: prometheus.NewGauge(prometheus.GaugeOpts{
-            Namespace: namespace,
-            Subsystem: subsystem,
-            Name:      "queue_length",
-            Help:      "The number of alert notifications in the queue.",
-        }),
         queueCapacity: prometheus.MustNewConstMetric(
             prometheus.NewDesc(
                 prometheus.BuildFQName(namespace, subsystem, "queue_capacity"),
@@ -160,6 +162,12 @@ func (n *Notifier) ApplyConfig(conf *config.Config) error {
         if err != nil {
             return err
         }
+
+        for _, am := range ams.ams {
+            alertErrors.WithLabelValues(am.url())
+            alertSent.WithLabelValues(am.url())
+        }
+
         amSets = append(amSets, ams)
     }
@@ -216,7 +224,7 @@ func (n *Notifier) Run() {
         alerts := n.nextBatch()
 
         if !n.sendAll(alerts...) {
-            n.dropped.Add(float64(len(alerts)))
+            alertDropped.Add(float64(len(alerts)))
         }
         // If the queue still has items left, kick off the next iteration.
         if n.queueLen() > 0 {
@@ -248,7 +256,7 @@ func (n *Notifier) Send(alerts ...*model.Alert) {
         alerts = alerts[d:]
 
         log.Warnf("Alert batch larger than queue capacity, dropping %d alerts", d)
-        n.dropped.Add(float64(d))
+        alertDropped.Add(float64(d))
     }
 
     // If the queue is full, remove the oldest alerts in favor
@@ -257,7 +265,7 @@ func (n *Notifier) Send(alerts ...*model.Alert) {
         n.queue = n.queue[d:]
 
         log.Warnf("Alert notification queue full, dropping %d alerts", d)
-        n.dropped.Add(float64(d))
+        alertDropped.Add(float64(d))
     }
 
     n.queue = append(n.queue, alerts...)
@@ -339,12 +347,12 @@ func (n *Notifier) sendAll(alerts ...*model.Alert) bool {
             if err := n.sendOne(ctx, ams.client, u, b); err != nil {
                 log.With("alertmanager", u).With("count", len(alerts)).Errorf("Error sending alerts: %s", err)
-                n.errors.WithLabelValues(u).Inc()
+                alertErrors.WithLabelValues(u).Inc()
             } else {
                 atomic.AddUint64(&numSuccess, 1)
             }
-            n.latency.WithLabelValues(u).Observe(time.Since(begin).Seconds())
-            n.sent.WithLabelValues(u).Add(float64(len(alerts)))
+            alertLatency.WithLabelValues(u).Observe(time.Since(begin).Seconds())
+            alertSent.WithLabelValues(u).Add(float64(len(alerts)))
 
             wg.Done()
         }(am)
@ -383,25 +391,25 @@ func (n *Notifier) Stop() {
// Describe implements prometheus.Collector. // Describe implements prometheus.Collector.
func (n *Notifier) Describe(ch chan<- *prometheus.Desc) { func (n *Notifier) Describe(ch chan<- *prometheus.Desc) {
n.latency.Describe(ch) alertLatency.Describe(ch)
n.errors.Describe(ch) alertErrors.Describe(ch)
n.sent.Describe(ch) alertSent.Describe(ch)
ch <- n.dropped.Desc() ch <- alertDropped.Desc()
ch <- n.queueLength.Desc() ch <- alertQueueLength.Desc()
ch <- n.queueCapacity.Desc() ch <- n.queueCapacity.Desc()
} }
// Collect implements prometheus.Collector. // Collect implements prometheus.Collector.
func (n *Notifier) Collect(ch chan<- prometheus.Metric) { func (n *Notifier) Collect(ch chan<- prometheus.Metric) {
n.queueLength.Set(float64(n.queueLen())) alertQueueLength.Set(float64(n.queueLen()))
n.latency.Collect(ch) alertLatency.Collect(ch)
n.errors.Collect(ch) alertErrors.Collect(ch)
n.sent.Collect(ch) alertSent.Collect(ch)
ch <- n.dropped ch <- alertDropped
ch <- n.queueLength ch <- alertQueueLength
ch <- n.queueCapacity ch <- n.queueCapacity
} }
@@ -474,6 +482,9 @@ func (s *alertmanagerSet) Sync(tgs []*config.TargetGroup) {
             continue
         }
 
+        alertSent.WithLabelValues(us)
+        alertErrors.WithLabelValues(us)
+
         seen[us] = struct{}{}
         s.ams = append(s.ams, am)
     }