mirror of
https://github.com/prometheus/prometheus.git
synced 2025-01-11 22:07:27 -08:00
throttle resends of alerts to 1 minute by default (#4538)
Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>
This commit is contained in:
parent
53691ae261
commit
87f1dad16d
|
@ -89,6 +89,7 @@ func main() {
|
||||||
notifierTimeout model.Duration
|
notifierTimeout model.Duration
|
||||||
forGracePeriod model.Duration
|
forGracePeriod model.Duration
|
||||||
outageTolerance model.Duration
|
outageTolerance model.Duration
|
||||||
|
resendDelay model.Duration
|
||||||
web web.Options
|
web web.Options
|
||||||
tsdb tsdb.Options
|
tsdb tsdb.Options
|
||||||
lookbackDelta model.Duration
|
lookbackDelta model.Duration
|
||||||
|
@ -173,6 +174,9 @@ func main() {
|
||||||
a.Flag("rules.alert.for-grace-period", "Minimum duration between alert and restored 'for' state. This is maintained only for alerts with configured 'for' time greater than grace period.").
|
a.Flag("rules.alert.for-grace-period", "Minimum duration between alert and restored 'for' state. This is maintained only for alerts with configured 'for' time greater than grace period.").
|
||||||
Default("10m").SetValue(&cfg.forGracePeriod)
|
Default("10m").SetValue(&cfg.forGracePeriod)
|
||||||
|
|
||||||
|
a.Flag("rules.alert.resend-delay", "Minimum amount of time to wait before resending an alert to Alertmanager. Must be lower than resolve_timeout in Alertmanager").
|
||||||
|
Default("1m").SetValue(&cfg.resendDelay)
|
||||||
|
|
||||||
a.Flag("alertmanager.notification-queue-capacity", "The capacity of the queue for pending Alertmanager notifications.").
|
a.Flag("alertmanager.notification-queue-capacity", "The capacity of the queue for pending Alertmanager notifications.").
|
||||||
Default("10000").IntVar(&cfg.notifier.QueueCapacity)
|
Default("10000").IntVar(&cfg.notifier.QueueCapacity)
|
||||||
|
|
||||||
|
@ -272,6 +276,7 @@ func main() {
|
||||||
Logger: log.With(logger, "component", "rule manager"),
|
Logger: log.With(logger, "component", "rule manager"),
|
||||||
OutageTolerance: time.Duration(cfg.outageTolerance),
|
OutageTolerance: time.Duration(cfg.outageTolerance),
|
||||||
ForGracePeriod: time.Duration(cfg.forGracePeriod),
|
ForGracePeriod: time.Duration(cfg.forGracePeriod),
|
||||||
|
ResendDelay: time.Duration(cfg.resendDelay),
|
||||||
})
|
})
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -682,16 +687,11 @@ func computeExternalURL(u, listenAddr string) (*url.URL, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// sendAlerts implements the rules.NotifyFunc for a Notifier.
|
// sendAlerts implements the rules.NotifyFunc for a Notifier.
|
||||||
// It filters any non-firing alerts from the input.
|
|
||||||
func sendAlerts(n *notifier.Manager, externalURL string) rules.NotifyFunc {
|
func sendAlerts(n *notifier.Manager, externalURL string) rules.NotifyFunc {
|
||||||
return func(ctx context.Context, expr string, alerts ...*rules.Alert) {
|
return func(ctx context.Context, expr string, alerts ...*rules.Alert) {
|
||||||
var res []*notifier.Alert
|
var res []*notifier.Alert
|
||||||
|
|
||||||
for _, alert := range alerts {
|
for _, alert := range alerts {
|
||||||
// Only send actually firing alerts.
|
|
||||||
if alert.State == rules.StatePending {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
a := ¬ifier.Alert{
|
a := ¬ifier.Alert{
|
||||||
StartsAt: alert.FiredAt,
|
StartsAt: alert.FiredAt,
|
||||||
Labels: alert.Labels,
|
Labels: alert.Labels,
|
||||||
|
|
|
@ -88,6 +88,20 @@ type Alert struct {
|
||||||
ActiveAt time.Time
|
ActiveAt time.Time
|
||||||
FiredAt time.Time
|
FiredAt time.Time
|
||||||
ResolvedAt time.Time
|
ResolvedAt time.Time
|
||||||
|
LastSentAt time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *Alert) needsSending(ts time.Time, resendDelay time.Duration) bool {
|
||||||
|
if a.State == StatePending {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// if an alert has been resolved since the last send, resend it
|
||||||
|
if a.ResolvedAt.After(a.LastSentAt) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
return a.LastSentAt.Add(resendDelay).Before(ts)
|
||||||
}
|
}
|
||||||
|
|
||||||
// An AlertingRule generates alerts from its vector expression.
|
// An AlertingRule generates alerts from its vector expression.
|
||||||
|
@ -426,6 +440,18 @@ func (r *AlertingRule) ForEachActiveAlert(f func(*Alert)) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (r *AlertingRule) sendAlerts(ctx context.Context, ts time.Time, resendDelay time.Duration, notifyFunc NotifyFunc) {
|
||||||
|
alerts := make([]*Alert, 0)
|
||||||
|
r.ForEachActiveAlert(func(alert *Alert) {
|
||||||
|
if alert.needsSending(ts, resendDelay) {
|
||||||
|
alert.LastSentAt = ts
|
||||||
|
anew := *alert
|
||||||
|
alerts = append(alerts, &anew)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
notifyFunc(ctx, r.vector.String(), alerts...)
|
||||||
|
}
|
||||||
|
|
||||||
func (r *AlertingRule) String() string {
|
func (r *AlertingRule) String() string {
|
||||||
ar := rulefmt.Rule{
|
ar := rulefmt.Rule{
|
||||||
Alert: r.name,
|
Alert: r.name,
|
||||||
|
|
|
@ -393,7 +393,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if ar, ok := rule.(*AlertingRule); ok {
|
if ar, ok := rule.(*AlertingRule); ok {
|
||||||
g.opts.NotifyFunc(ctx, ar.vector.String(), ar.currentAlerts()...)
|
ar.sendAlerts(ctx, ts, g.opts.ResendDelay, g.opts.NotifyFunc)
|
||||||
}
|
}
|
||||||
var (
|
var (
|
||||||
numOutOfOrder = 0
|
numOutOfOrder = 0
|
||||||
|
@ -607,6 +607,7 @@ type ManagerOptions struct {
|
||||||
Registerer prometheus.Registerer
|
Registerer prometheus.Registerer
|
||||||
OutageTolerance time.Duration
|
OutageTolerance time.Duration
|
||||||
ForGracePeriod time.Duration
|
ForGracePeriod time.Duration
|
||||||
|
ResendDelay time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewManager returns an implementation of Manager, ready to be started
|
// NewManager returns an implementation of Manager, ready to be started
|
||||||
|
|
|
@ -651,3 +651,54 @@ func TestUpdate(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNotify(t *testing.T) {
|
||||||
|
storage := testutil.NewStorage(t)
|
||||||
|
defer storage.Close()
|
||||||
|
engine := promql.NewEngine(nil, nil, 10, 10*time.Second)
|
||||||
|
var lastNotified []*Alert
|
||||||
|
notifyFunc := func(ctx context.Context, expr string, alerts ...*Alert) {
|
||||||
|
lastNotified = alerts
|
||||||
|
}
|
||||||
|
opts := &ManagerOptions{
|
||||||
|
QueryFunc: EngineQueryFunc(engine, storage),
|
||||||
|
Appendable: storage,
|
||||||
|
TSDB: storage,
|
||||||
|
Context: context.Background(),
|
||||||
|
Logger: log.NewNopLogger(),
|
||||||
|
NotifyFunc: notifyFunc,
|
||||||
|
ResendDelay: 2 * time.Second,
|
||||||
|
}
|
||||||
|
|
||||||
|
expr, err := promql.ParseExpr("a > 1")
|
||||||
|
testutil.Ok(t, err)
|
||||||
|
rule := NewAlertingRule("aTooHigh", expr, 0, labels.Labels{}, labels.Labels{}, true, log.NewNopLogger())
|
||||||
|
group := NewGroup("alert", "", time.Second, []Rule{rule}, true, opts)
|
||||||
|
|
||||||
|
app, _ := storage.Appender()
|
||||||
|
app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 1000, 2)
|
||||||
|
app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 2000, 3)
|
||||||
|
app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 5000, 3)
|
||||||
|
app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 6000, 0)
|
||||||
|
|
||||||
|
err = app.Commit()
|
||||||
|
testutil.Ok(t, err)
|
||||||
|
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
// Alert sent right away
|
||||||
|
group.Eval(ctx, time.Unix(1, 0))
|
||||||
|
testutil.Equals(t, 1, len(lastNotified))
|
||||||
|
|
||||||
|
// Alert is not sent 1s later
|
||||||
|
group.Eval(ctx, time.Unix(2, 0))
|
||||||
|
testutil.Equals(t, 0, len(lastNotified))
|
||||||
|
|
||||||
|
// Alert is resent at t=5s
|
||||||
|
group.Eval(ctx, time.Unix(5, 0))
|
||||||
|
testutil.Equals(t, 1, len(lastNotified))
|
||||||
|
|
||||||
|
// Resolution alert sent right away
|
||||||
|
group.Eval(ctx, time.Unix(6, 0))
|
||||||
|
testutil.Equals(t, 1, len(lastNotified))
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue