chore(notifier): Split 'Run()' into two goroutines: one to receive target updates and trigger reloads, and the other to send notifications.
This is done to prevent the latter operation from blocking/starving the former: previously, the `tsets` channel was consumed by the same goroutine that consumes and feeds the buffered `n.more` channel, so `tsets`, being unbuffered and only fed every `SDManager.updatert` seconds, was far less likely to be ready when the select fired.
See https://github.com/prometheus/prometheus/issues/13676 and https://github.com/prometheus/prometheus/issues/8768
The synchronization with the sendLoop goroutine is managed through the n.mtx mutex.
This uses a similar approach to the scrape manager's efbd6e41c5/scrape/manager.go (L115-L117).
The old TestHangingNotifier was replaced with a new one that more closely reflects reality.
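To make the failure mode concrete, here is a minimal, self-contained Go sketch of the split-goroutine shape this commit adopts: a buffered queue channel that is kept permanently ready can no longer starve the unbuffered updates channel, because each is consumed by its own goroutine. All names here (`manager`, `run`, `sendNext`, the channels) are illustrative stand-ins, not the notifier's actual API:

package main

import (
	"fmt"
	"sync"
	"time"
)

// manager is a stand-in for the notifier Manager: "more" plays the role of
// n.more and "targets" the role of the Alertmanager set guarded by n.mtx.
type manager struct {
	mtx     sync.Mutex
	targets string
	more    chan struct{}
}

func (m *manager) reload(t string) {
	m.mtx.Lock()
	defer m.mtx.Unlock()
	m.targets = t
	fmt.Println("reloaded targets:", t)
}

func (m *manager) sendNext() {
	m.mtx.Lock()
	defer m.mtx.Unlock()
	time.Sleep(50 * time.Millisecond) // simulate a slow Alertmanager
	fmt.Println("sent batch to:", m.targets)
}

// run sketches the post-commit shape: a background loop drains the queue
// while the main loop only receives target updates, so a busy queue can no
// longer delay reloads.
func (m *manager) run(updates <-chan string, done <-chan struct{}) {
	go func() { // plays the role of sendLoop
		for {
			select {
			case <-done:
				return
			case <-m.more:
				m.sendNext()
			}
		}
	}()
	for {
		select {
		case <-done:
			return
		case t := <-updates:
			m.reload(t)
		}
	}
}

func main() {
	m := &manager{targets: "old-am:9093", more: make(chan struct{}, 1)}
	updates := make(chan string) // unbuffered, like the tsets channel
	done := make(chan struct{})
	go m.run(updates, done)

	// Keep the queue permanently "ready" while a single update arrives.
	go func() {
		for {
			select {
			case m.more <- struct{}{}:
			case <-done:
				return
			}
		}
	}()

	updates <- "new-am:9093" // accepted promptly despite the busy queue
	time.Sleep(200 * time.Millisecond)
	close(done)
}

As in the commit, the two loops share state only under a mutex (here `manager.mtx`, standing in for `n.mtx`), so sends and reloads can interleave without observing a half-applied target update.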
Signed-off-by: machine424 <ayoubmrini424@gmail.com>
parent 94d28cd6cf
commit 690de487e2
@@ -298,25 +298,14 @@ func (n *Manager) nextBatch() []*Alert {
 	return alerts
 }
 
-// Run dispatches notifications continuously.
-func (n *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) {
+// sendLoop continuously consumes the notifications queue and sends alerts to
+// the configured Alertmanagers.
+func (n *Manager) sendLoop() {
 	for {
-		// The select is split in two parts, such as we will first try to read
-		// new alertmanager targets if they are available, before sending new
-		// alerts.
 		select {
 		case <-n.ctx.Done():
 			return
-		case ts := <-tsets:
-			n.reload(ts)
-		default:
-			select {
-			case <-n.ctx.Done():
-				return
-			case ts := <-tsets:
-				n.reload(ts)
-			case <-n.more:
-			}
+		case <-n.more:
 		}
 		alerts := n.nextBatch()
 
@@ -330,6 +319,21 @@ func (n *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) {
 	}
 }
 
+// Run receives updates of target groups and triggers a reload.
+// The dispatching of notifications occurs in the background to prevent blocking the receipt of target updates.
+// Refer to https://github.com/prometheus/prometheus/issues/13676 for more details.
+func (n *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) {
+	go n.sendLoop()
+	for {
+		select {
+		case <-n.ctx.Done():
+			return
+		case ts := <-tsets:
+			n.reload(ts)
+		}
+	}
+}
+
 func (n *Manager) reload(tgs map[string][]*targetgroup.Group) {
 	n.mtx.Lock()
 	defer n.mtx.Unlock()
@@ -483,6 +487,7 @@ func (n *Manager) sendAll(alerts ...*Alert) bool {
 		ams.mtx.RLock()
 
 		if len(ams.cfg.AlertRelabelConfigs) > 0 {
 			amAlerts = relabelAlerts(ams.cfg.AlertRelabelConfigs, labels.Labels{}, alerts)
 			if len(amAlerts) == 0 {
@@ -701,125 +701,10 @@ func TestLabelsToOpenAPILabelSet(t *testing.T) {
 	require.Equal(t, models.LabelSet{"aaa": "111", "bbb": "222"}, labelsToOpenAPILabelSet(labels.FromStrings("aaa", "111", "bbb", "222")))
 }
 
-// TestHangingNotifier validates that targets updates happen even when there are
-// queued alerts.
-func TestHangingNotifier(t *testing.T) {
-	// Note: When targets are not updated in time, this test is flaky because go
-	// selects are not deterministic. Therefore we run 10 subtests to run into the issue.
-	for i := 0; i < 10; i++ {
-		t.Run(strconv.Itoa(i), func(t *testing.T) {
-			var (
-				done    = make(chan struct{})
-				changed = make(chan struct{})
-				syncCh  = make(chan map[string][]*targetgroup.Group)
-			)
-
-			defer func() {
-				close(done)
-			}()
-
-			var calledOnce bool
-			// Setting up a bad server. This server hangs for 2 seconds.
-			badServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				if calledOnce {
-					t.Fatal("hanging server called multiple times")
-				}
-				calledOnce = true
-				select {
-				case <-done:
-				case <-time.After(2 * time.Second):
-				}
-			}))
-			badURL, err := url.Parse(badServer.URL)
-			require.NoError(t, err)
-			badAddress := badURL.Host // Used for __name__ label in targets.
-
-			// Setting up a bad server. This server returns fast, signaling requests on
-			// by closing the changed channel.
-			goodServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-				close(changed)
-			}))
-			goodURL, err := url.Parse(goodServer.URL)
-			require.NoError(t, err)
-			goodAddress := goodURL.Host // Used for __name__ label in targets.
-
-			h := NewManager(
-				&Options{
-					QueueCapacity: 20 * maxBatchSize,
-				},
-				nil,
-			)
-
-			h.alertmanagers = make(map[string]*alertmanagerSet)
-
-			am1Cfg := config.DefaultAlertmanagerConfig
-			am1Cfg.Timeout = model.Duration(200 * time.Millisecond)
-
-			h.alertmanagers["config-0"] = &alertmanagerSet{
-				ams:     []alertmanager{},
-				cfg:     &am1Cfg,
-				metrics: h.metrics,
-			}
-			go h.Run(syncCh)
-			defer h.Stop()
-
-			var alerts []*Alert
-			for i := range make([]struct{}, 20*maxBatchSize) {
-				alerts = append(alerts, &Alert{
-					Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
-				})
-			}
-
-			// Injecting the hanging server URL.
-			syncCh <- map[string][]*targetgroup.Group{
-				"config-0": {
-					{
-						Targets: []model.LabelSet{
-							{
-								model.AddressLabel: model.LabelValue(badAddress),
-							},
-						},
-					},
-				},
-			}
-
-			// Queing alerts.
-			h.Send(alerts...)
-
-			// Updating with a working alertmanager target.
-			go func() {
-				select {
-				case syncCh <- map[string][]*targetgroup.Group{
-					"config-0": {
-						{
-							Targets: []model.LabelSet{
-								{
-									model.AddressLabel: model.LabelValue(goodAddress),
-								},
-							},
-						},
-					},
-				}:
-				case <-done:
-				}
-			}()
-
-			select {
-			case <-time.After(1 * time.Second):
-				t.Fatalf("Timeout after 1 second, targets not synced in time.")
-			case <-changed:
-				// The good server has been hit in less than 3 seconds, therefore
-				// targets have been updated before a second call could be made to the
-				// bad server.
-			}
-		})
-	}
-}
-
-// TODO: renameit and even replace TestHangingNotifier with it.
-// TestHangingNotifierXXX ensures that the notifier takes into account SD changes even when there are
-// queued alerts. This test reproduces the issue described in https://github.com/prometheus/prometheus/issues/13676.
-func TestHangingNotifierXXX(t *testing.T) {
+// TestHangingNotifier ensures that the notifier takes into account SD changes even when there are
+// queued alerts. This test reproduces the issue described in https://github.com/prometheus/prometheus/issues/13676.
+// and https://github.com/prometheus/prometheus/issues/8768.
+func TestHangingNotifier(t *testing.T) {
 	const (
 		batches     = 100
 		alertsCount = maxBatchSize * batches
@@ -857,6 +742,8 @@ func TestHangingNotifierXXX(t *testing.T) {
 	require.NoError(t, err)
 
 	// Initialize the discovery manager
+	// This is relevant as the updates aren't sent continually in real life, but only each updatert.
+	// The old implementation of TestHangingNotifier didn't take that into account.
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
 	reg := prometheus.NewRegistry()