Mirror of https://github.com/prometheus/prometheus.git
throttle resends of alerts to 1 minute by default (#4538)
Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>
This commit is contained in:
parent: 53691ae261
commit: 87f1dad16d
@@ -89,6 +89,7 @@ func main() {
	notifierTimeout model.Duration
	forGracePeriod  model.Duration
	outageTolerance model.Duration
	resendDelay     model.Duration
	web             web.Options
	tsdb            tsdb.Options
	lookbackDelta   model.Duration
@@ -173,6 +174,9 @@ func main() {
	a.Flag("rules.alert.for-grace-period", "Minimum duration between alert and restored 'for' state. This is maintained only for alerts with configured 'for' time greater than grace period.").
		Default("10m").SetValue(&cfg.forGracePeriod)

	a.Flag("rules.alert.resend-delay", "Minimum amount of time to wait before resending an alert to Alertmanager. Must be lower than resolve_timeout in Alertmanager").
		Default("1m").SetValue(&cfg.resendDelay)

	a.Flag("alertmanager.notification-queue-capacity", "The capacity of the queue for pending Alertmanager notifications.").
		Default("10000").IntVar(&cfg.notifier.QueueCapacity)
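Both flags bind a kingpin flag directly to a model.Duration via SetValue, which works because *model.Duration satisfies the Set/String value interface kingpin expects. Below is a minimal standalone sketch of that binding pattern; the application name and the in-code argument list are placeholders for illustration, while the flag name and default match the ones added above:

	package main

	import (
		"fmt"
		"time"

		"github.com/prometheus/common/model"
		"gopkg.in/alecthomas/kingpin.v2"
	)

	func main() {
		var resendDelay model.Duration

		// Bind the flag to a model.Duration, defaulting to 1m as in this commit.
		app := kingpin.New("example", "resend-delay flag sketch")
		app.Flag("rules.alert.resend-delay", "Minimum amount of time to wait before resending an alert to Alertmanager.").
			Default("1m").SetValue(&resendDelay)

		// Parse a fixed argument list instead of os.Args for the sake of the example.
		kingpin.MustParse(app.Parse([]string{"--rules.alert.resend-delay=2m"}))

		fmt.Println(time.Duration(resendDelay)) // prints "2m0s"
	}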
@@ -272,6 +276,7 @@ func main() {
			Logger:          log.With(logger, "component", "rule manager"),
			OutageTolerance: time.Duration(cfg.outageTolerance),
			ForGracePeriod:  time.Duration(cfg.forGracePeriod),
			ResendDelay:     time.Duration(cfg.resendDelay),
		})
	)
@@ -682,16 +687,11 @@ func computeExternalURL(u, listenAddr string) (*url.URL, error) {
	}

	// sendAlerts implements the rules.NotifyFunc for a Notifier.
	// It filters any non-firing alerts from the input.
	func sendAlerts(n *notifier.Manager, externalURL string) rules.NotifyFunc {
		return func(ctx context.Context, expr string, alerts ...*rules.Alert) {
			var res []*notifier.Alert

			for _, alert := range alerts {
				// Only send actually firing alerts.
				if alert.State == rules.StatePending {
					continue
				}
				a := &notifier.Alert{
					StartsAt: alert.FiredAt,
					Labels:   alert.Labels,
@@ -88,6 +88,20 @@ type Alert struct {
	ActiveAt   time.Time
	FiredAt    time.Time
	ResolvedAt time.Time
	LastSentAt time.Time
}

func (a *Alert) needsSending(ts time.Time, resendDelay time.Duration) bool {
	if a.State == StatePending {
		return false
	}

	// if an alert has been resolved since the last send, resend it
	if a.ResolvedAt.After(a.LastSentAt) {
		return true
	}

	return a.LastSentAt.Add(resendDelay).Before(ts)
}

// An AlertingRule generates alerts from its vector expression.
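The effect of needsSending is easiest to see with concrete timestamps. The following standalone sketch mirrors the logic above for illustration only (it is not the package code, and it omits the StatePending short-circuit); it walks the three cases: suppressed because the delay has not elapsed, resent after the delay, and resent immediately on resolution:

	package main

	import (
		"fmt"
		"time"
	)

	// needsSending mirrors the throttling rule above: a resolved alert is resent
	// immediately, otherwise it is resent only once resendDelay has elapsed
	// since the last send.
	func needsSending(lastSentAt, resolvedAt, ts time.Time, resendDelay time.Duration) bool {
		if resolvedAt.After(lastSentAt) {
			return true
		}
		return lastSentAt.Add(resendDelay).Before(ts)
	}

	func main() {
		base := time.Unix(0, 0)
		lastSent := base.Add(1 * time.Second)
		delay := time.Minute

		fmt.Println(needsSending(lastSent, time.Time{}, base.Add(2*time.Second), delay))  // false: only 1s since last send
		fmt.Println(needsSending(lastSent, time.Time{}, base.Add(90*time.Second), delay)) // true: resend delay has elapsed
		fmt.Println(needsSending(lastSent, base.Add(3*time.Second), base.Add(4*time.Second), delay)) // true: resolved after last send
	}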
@@ -426,6 +440,18 @@ func (r *AlertingRule) ForEachActiveAlert(f func(*Alert)) {
	}
}

func (r *AlertingRule) sendAlerts(ctx context.Context, ts time.Time, resendDelay time.Duration, notifyFunc NotifyFunc) {
	alerts := make([]*Alert, 0)
	r.ForEachActiveAlert(func(alert *Alert) {
		if alert.needsSending(ts, resendDelay) {
			alert.LastSentAt = ts
			anew := *alert
			alerts = append(alerts, &anew)
		}
	})
	notifyFunc(ctx, r.vector.String(), alerts...)
}

func (r *AlertingRule) String() string {
	ar := rulefmt.Rule{
		Alert: r.name,
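Note that sendAlerts copies each active alert (anew := *alert) before handing it to the NotifyFunc, rather than appending the pointer that the rule keeps mutating on every evaluation. A small stand-in sketch of why that copy matters (illustrative only, using a hypothetical struct, not the package types):

	package main

	import "fmt"

	type alert struct{ value float64 }

	func main() {
		live := &alert{value: 1}

		// Appending the live pointer would let later mutations leak into the queue.
		queuedPtr := []*alert{live}

		// Copying first freezes the state at send time.
		snapshot := *live
		queuedCopy := []*alert{&snapshot}

		live.value = 99 // the rule keeps updating its own alert

		fmt.Println(queuedPtr[0].value)  // 99: shared with the live alert
		fmt.Println(queuedCopy[0].value) // 1: unaffected by the later update
	}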
@@ -393,7 +393,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
			}

			if ar, ok := rule.(*AlertingRule); ok {
				g.opts.NotifyFunc(ctx, ar.vector.String(), ar.currentAlerts()...)
				ar.sendAlerts(ctx, ts, g.opts.ResendDelay, g.opts.NotifyFunc)
			}
			var (
				numOutOfOrder = 0
@@ -607,6 +607,7 @@ type ManagerOptions struct {
	Registerer      prometheus.Registerer
	OutageTolerance time.Duration
	ForGracePeriod  time.Duration
	ResendDelay     time.Duration
}

// NewManager returns an implementation of Manager, ready to be started
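Putting the pieces together, a rule manager picks up the throttling purely through its options; the NotifyFunc then only sees alerts that passed needsSending. A minimal wiring sketch, assuming the import paths Prometheus used at the time of this commit and eliding the other required options (QueryFunc, Appendable, TSDB, and so on):

	package main

	import (
		"context"
		"time"

		"github.com/go-kit/kit/log"
		"github.com/prometheus/prometheus/rules"
	)

	func main() {
		// The NotifyFunc only ever receives alerts that passed needsSending.
		notify := func(ctx context.Context, expr string, alerts ...*rules.Alert) {
			// forward the already-throttled alerts to the Alertmanager queue here
		}

		mgr := rules.NewManager(&rules.ManagerOptions{
			Context:     context.Background(),
			Logger:      log.NewNopLogger(),
			NotifyFunc:  notify,
			ResendDelay: time.Minute, // matches the new --rules.alert.resend-delay default
			// QueryFunc, Appendable, TSDB, etc. omitted for brevity.
		})
		_ = mgr
	}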
@@ -651,3 +651,54 @@ func TestUpdate(t *testing.T) {
		}
	}
}

func TestNotify(t *testing.T) {
	storage := testutil.NewStorage(t)
	defer storage.Close()
	engine := promql.NewEngine(nil, nil, 10, 10*time.Second)
	var lastNotified []*Alert
	notifyFunc := func(ctx context.Context, expr string, alerts ...*Alert) {
		lastNotified = alerts
	}
	opts := &ManagerOptions{
		QueryFunc:   EngineQueryFunc(engine, storage),
		Appendable:  storage,
		TSDB:        storage,
		Context:     context.Background(),
		Logger:      log.NewNopLogger(),
		NotifyFunc:  notifyFunc,
		ResendDelay: 2 * time.Second,
	}

	expr, err := promql.ParseExpr("a > 1")
	testutil.Ok(t, err)
	rule := NewAlertingRule("aTooHigh", expr, 0, labels.Labels{}, labels.Labels{}, true, log.NewNopLogger())
	group := NewGroup("alert", "", time.Second, []Rule{rule}, true, opts)

	app, _ := storage.Appender()
	app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 1000, 2)
	app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 2000, 3)
	app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 5000, 3)
	app.Add(labels.FromStrings(model.MetricNameLabel, "a"), 6000, 0)

	err = app.Commit()
	testutil.Ok(t, err)

	ctx := context.Background()

	// Alert sent right away
	group.Eval(ctx, time.Unix(1, 0))
	testutil.Equals(t, 1, len(lastNotified))

	// Alert is not sent 1s later
	group.Eval(ctx, time.Unix(2, 0))
	testutil.Equals(t, 0, len(lastNotified))

	// Alert is resent at t=5s
	group.Eval(ctx, time.Unix(5, 0))
	testutil.Equals(t, 1, len(lastNotified))

	// Resolution alert sent right away
	group.Eval(ctx, time.Unix(6, 0))
	testutil.Equals(t, 1, len(lastNotified))
}