throttle resends of alerts to 1 minute by default (#4538)

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>
Authored by Chris Marchbanks on 2018-08-27 10:41:42 -06:00; committed by Brian Brazil
parent 53691ae261
commit 87f1dad16d
4 changed files with 84 additions and 6 deletions


@@ -89,6 +89,7 @@ func main() {
notifierTimeout model.Duration
forGracePeriod model.Duration
outageTolerance model.Duration
resendDelay model.Duration
web web.Options
tsdb tsdb.Options
lookbackDelta model.Duration
@@ -173,6 +174,9 @@ func main() {
a.Flag("rules.alert.for-grace-period", "Minimum duration between alert and restored 'for' state. This is maintained only for alerts with configured 'for' time greater than grace period.").
Default("10m").SetValue(&cfg.forGracePeriod)
a.Flag("rules.alert.resend-delay", "Minimum amount of time to wait before resending an alert to Alertmanager. Must be lower than resolve_timeout in Alertmanager.").
Default("1m").SetValue(&cfg.resendDelay)
a.Flag("alertmanager.notification-queue-capacity", "The capacity of the queue for pending Alertmanager notifications.").
Default("10000").IntVar(&cfg.notifier.QueueCapacity)
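The hunk above exposes the new throttle as --rules.alert.resend-delay with a 1m default, alongside the existing rules.alert.* flags. As a hypothetical invocation (the config file name is only a placeholder), the delay could be tightened at startup with:

./prometheus --config.file=prometheus.yml --rules.alert.resend-delay=30s

As the help text warns, the value should stay below Alertmanager's resolve_timeout, otherwise Alertmanager may treat a still-firing alert as resolved between resends.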
@@ -272,6 +276,7 @@ func main() {
Logger: log.With(logger, "component", "rule manager"),
OutageTolerance: time.Duration(cfg.outageTolerance),
ForGracePeriod: time.Duration(cfg.forGracePeriod),
ResendDelay: time.Duration(cfg.resendDelay),
})
)
@@ -682,16 +687,11 @@ func computeExternalURL(u, listenAddr string) (*url.URL, error) {
}
// sendAlerts implements the rules.NotifyFunc for a Notifier.
// It filters any non-firing alerts from the input.
func sendAlerts(n *notifier.Manager, externalURL string) rules.NotifyFunc {
return func(ctx context.Context, expr string, alerts ...*rules.Alert) {
var res []*notifier.Alert
for _, alert := range alerts {
// Only send actually firing alerts.
if alert.State == rules.StatePending {
continue
}
a := &notifier.Alert{
StartsAt: alert.FiredAt,
Labels: alert.Labels,


@@ -88,6 +88,20 @@ type Alert struct {
ActiveAt time.Time
FiredAt time.Time
ResolvedAt time.Time
LastSentAt time.Time
}
func (a *Alert) needsSending(ts time.Time, resendDelay time.Duration) bool {
if a.State == StatePending {
return false
}
// if an alert has been resolved since the last send, resend it
if a.ResolvedAt.After(a.LastSentAt) {
return true
}
return a.LastSentAt.Add(resendDelay).Before(ts)
}
// An AlertingRule generates alerts from its vector expression.
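
To make the throttle rule concrete, here is a minimal, self-contained Go sketch of the decision implemented by needsSending above. The alert struct and state constants are simplified stand-ins for the real rules package types, not the actual API.

package main

import (
    "fmt"
    "time"
)

type state int

const (
    statePending state = iota
    stateFiring
)

// alert is a toy stand-in holding only the fields the decision needs.
type alert struct {
    State      state
    ResolvedAt time.Time
    LastSentAt time.Time
}

func needsSending(a alert, ts time.Time, resendDelay time.Duration) bool {
    if a.State == statePending {
        return false // pending alerts are never sent to the notifier
    }
    if a.ResolvedAt.After(a.LastSentAt) {
        return true // a resolution since the last send is always announced
    }
    return a.LastSentAt.Add(resendDelay).Before(ts) // otherwise throttle on the delay
}

func main() {
    sent := time.Unix(0, 0)
    a := alert{State: stateFiring, LastSentAt: sent}

    fmt.Println(needsSending(a, sent.Add(30*time.Second), time.Minute)) // false: still within the 1m delay
    fmt.Println(needsSending(a, sent.Add(90*time.Second), time.Minute)) // true: the delay has elapsed

    a.ResolvedAt = sent.Add(40 * time.Second)
    fmt.Println(needsSending(a, sent.Add(41*time.Second), time.Minute)) // true: resolved since the last send
}
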
@@ -426,6 +440,18 @@ func (r *AlertingRule) ForEachActiveAlert(f func(*Alert)) {
}
}
func (r *AlertingRule) sendAlerts(ctx context.Context, ts time.Time, resendDelay time.Duration, notifyFunc NotifyFunc) {
alerts := make([]*Alert, 0)
r.ForEachActiveAlert(func(alert *Alert) {
if alert.needsSending(ts, resendDelay) {
alert.LastSentAt = ts
anew := *alert
alerts = append(alerts, &anew)
}
})
notifyFunc(ctx, r.vector.String(), alerts...)
}
func (r *AlertingRule) String() string {
ar := rulefmt.Rule{
Alert: r.name,
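
Two details of the sendAlerts method above are worth noting. LastSentAt is stamped before the NotifyFunc runs, so the throttle clock restarts on every send, and each qualifying alert is appended as a copy (anew := *alert) rather than as the live pointer. One plausible reason for the copy, sketched below as a toy example that is not taken from the change itself, is that the batch handed to the NotifyFunc stays a stable snapshot even if the active alert is mutated on a later evaluation.

package main

import "fmt"

type alertSnapshot struct{ value float64 }

func main() {
    live := &alertSnapshot{value: 3}

    snap := *live                    // copy, as sendAlerts does with anew := *alert
    batch := []*alertSnapshot{&snap} // queue the snapshot, not the live alert

    live.value = 0 // the live alert changes on a later evaluation

    fmt.Println(batch[0].value) // prints 3: the queued snapshot is unaffected
}
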


@@ -393,7 +393,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
}
if ar, ok := rule.(*AlertingRule); ok {
g.opts.NotifyFunc(ctx, ar.vector.String(), ar.currentAlerts()...)
ar.sendAlerts(ctx, ts, g.opts.ResendDelay, g.opts.NotifyFunc)
}
var (
numOutOfOrder = 0
@@ -607,6 +607,7 @@ type ManagerOptions struct {
Registerer prometheus.Registerer
OutageTolerance time.Duration
ForGracePeriod time.Duration
ResendDelay time.Duration
}
// NewManager returns an implementation of Manager, ready to be started
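
One consequence of adding ResendDelay as a plain ManagerOptions field: callers that never set it get the zero value, and with a zero delay needsSending falls through to LastSentAt.Add(0).Before(ts), which is true on every evaluation after a send. Leaving the option unset therefore keeps the old send-on-every-evaluation behaviour for firing alerts.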


@@ -651,3 +651,54 @@ func TestUpdate(t *testing.T) {
}
}
}
func TestNotify(t *testing.T) {
storage := testutil.NewStorage(t)
defer storage.Close()
engine := promql.NewEngine(nil, nil, 10, 10*time.Second)
var lastNotified []*Alert
notifyFunc := func(ctx context.Context, expr string, alerts ...*Alert) {
lastNotified = alerts
}
opts := &ManagerOptions{
QueryFunc: EngineQueryFunc(engine, storage),
Appendable: storage,
TSDB: storage,
Context: context.Background(),
Logger: log.NewNopLogger(),
NotifyFunc: notifyFunc,
ResendDelay: 2 * time.Second,
}
expr, err := promql.ParseExpr("a > 1")
testutil.Ok(t, err)
rule := NewAlertingRule("aTooHigh", expr, 0, labels.Labels{}, labels.Labels{}, true, log.NewNopLogger())
group := NewGroup("alert", "", time.Second, []Rule{rule}, true, opts)
app, _ := storage.Appender()
app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 1000, 2)
app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 2000, 3)
app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 5000, 3)
app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 6000, 0)
err = app.Commit()
testutil.Ok(t, err)
ctx := context.Background()
// Alert sent right away
group.Eval(ctx, time.Unix(1, 0))
testutil.Equals(t, 1, len(lastNotified))
// Alert is not sent 1s later
group.Eval(ctx, time.Unix(2, 0))
testutil.Equals(t, 0, len(lastNotified))
// Alert is resent at t=5s
group.Eval(ctx, time.Unix(5, 0))
testutil.Equals(t, 1, len(lastNotified))
// Resolution alert sent right away
group.Eval(ctx, time.Unix(6, 0))
testutil.Equals(t, 1, len(lastNotified))
}
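
The test's timeline tracks the 2s ResendDelay: the alert fires and is sent at t=1s, the t=2s evaluation is throttled, the t=5s evaluation resends because more than 2s have passed since the last send, and at t=6s the series drops to 0, so the resolution is sent immediately because ResolvedAt is newer than LastSentAt. Note that sendAlerts invokes the NotifyFunc even when no alert qualifies, so the throttled evaluation overwrites lastNotified with an empty batch; that is what the len(lastNotified) == 0 assertion relies on.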