discovery: add metrics + send updates from one goroutine only

The added metrics are:

* prometheus_sd_discovered_targets
* prometheus_sd_received_updates_total
* prometheus_sd_updates_delayed_total
* prometheus_sd_updates_total

Signed-off-by: Simon Pasquier <spasquie@redhat.com>
This commit is contained in:
Simon Pasquier 2018-09-12 15:18:55 +02:00
parent f2d43af820
commit 365931ea83

View file

@ -47,10 +47,34 @@ var (
Help: "Total number of service discovery configurations that failed to load.", Help: "Total number of service discovery configurations that failed to load.",
}, },
) )
discoveredTargets = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "prometheus_sd_discovered_targets",
Help: "Current number of discovered targets.",
},
)
receivedUpdates = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "prometheus_sd_received_updates_total",
Help: "Total number of update events received from the SD providers.",
},
)
delayedUpdates = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "prometheus_sd_updates_delayed_total",
Help: "Total number of update events that couldn't be sent immediately.",
},
)
sentUpdates = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "prometheus_sd_updates_total",
Help: "Total number of update events sent to the SD consumers.",
},
)
) )
func init() { func init() {
prometheus.MustRegister(failedConfigs) prometheus.MustRegister(failedConfigs, discoveredTargets, receivedUpdates, delayedUpdates, sentUpdates)
} }
// Discoverer provides information about target groups. It maintains a set // Discoverer provides information about target groups. It maintains a set
@ -62,7 +86,7 @@ func init() {
// //
// Discoverers should initially send a full set of all discoverable TargetGroups. // Discoverers should initially send a full set of all discoverable TargetGroups.
type Discoverer interface { type Discoverer interface {
// Run hands a channel to the discovery provider(consul,dns etc) through which it can send // Run hands a channel to the discovery provider (Consul, DNS etc) through which it can send
// updated target groups. // updated target groups.
// Must return if the context gets canceled. It should not close the update // Must return if the context gets canceled. It should not close the update
// channel on returning. // channel on returning.
@ -94,6 +118,7 @@ func NewManager(ctx context.Context, logger log.Logger) *Manager {
discoverCancel: []context.CancelFunc{}, discoverCancel: []context.CancelFunc{},
ctx: ctx, ctx: ctx,
updatert: 5 * time.Second, updatert: 5 * time.Second,
trigger: make(chan struct{}, 1),
} }
} }
@ -110,16 +135,20 @@ type Manager struct {
targets map[poolKey]map[string]*targetgroup.Group targets map[poolKey]map[string]*targetgroup.Group
// providers keeps track of SD providers. // providers keeps track of SD providers.
providers []*provider providers []*provider
// The sync channels sends the updates in map[targetSetName] where targetSetName is the job value from the scrape config. // The sync channel sends the updates as a map where the key is the job value from the scrape config.
syncCh chan map[string][]*targetgroup.Group syncCh chan map[string][]*targetgroup.Group
// How long to wait before sending updates to the channel. The variable // How long to wait before sending updates to the channel. The variable
// should only be modified in unit tests. // should only be modified in unit tests.
updatert time.Duration updatert time.Duration
// The trigger channel signals to the manager that new updates have been received from providers.
trigger chan struct{}
} }
// Run starts the background processing // Run starts the background processing
func (m *Manager) Run() error { func (m *Manager) Run() error {
go m.sendUpdates()
for range m.ctx.Done() { for range m.ctx.Done() {
m.cancelDiscoverers() m.cancelDiscoverers()
return m.ctx.Err() return m.ctx.Err()
@ -127,7 +156,7 @@ func (m *Manager) Run() error {
return nil return nil
} }
// SyncCh returns a read only channel used by all Discoverers to send target updates. // SyncCh returns a read only channel used by all the clients to receive target updates.
func (m *Manager) SyncCh() <-chan map[string][]*targetgroup.Group { func (m *Manager) SyncCh() <-chan map[string][]*targetgroup.Group {
return m.syncCh return m.syncCh
} }
@ -171,43 +200,48 @@ func (m *Manager) startProvider(ctx context.Context, p *provider) {
} }
func (m *Manager) updater(ctx context.Context, p *provider, updates chan []*targetgroup.Group) { func (m *Manager) updater(ctx context.Context, p *provider, updates chan []*targetgroup.Group) {
ticker := time.NewTicker(m.updatert)
defer ticker.Stop()
triggerUpdate := make(chan struct{}, 1)
for { for {
select { select {
case <-ctx.Done(): case <-ctx.Done():
return return
case tgs, ok := <-updates: case tgs, ok := <-updates:
receivedUpdates.Inc()
if !ok { if !ok {
level.Debug(m.logger).Log("msg", "discoverer channel closed, sending the last update", "provider", p.name) level.Debug(m.logger).Log("msg", "discoverer channel closed", "provider", p.name)
select {
case m.syncCh <- m.allGroups(): // Waiting until the receiver can accept the last update.
level.Debug(m.logger).Log("msg", "discoverer exited", "provider", p.name)
case <-ctx.Done():
}
return return
} }
for _, s := range p.subs { for _, s := range p.subs {
m.updateGroup(poolKey{setName: s, provider: p.name}, tgs) m.updateGroup(poolKey{setName: s, provider: p.name}, tgs)
} }
select { select {
case triggerUpdate <- struct{}{}: case m.trigger <- struct{}{}:
default: default:
} }
}
}
}
func (m *Manager) sendUpdates() {
ticker := time.NewTicker(m.updatert)
defer ticker.Stop()
for {
select {
case <-m.ctx.Done():
return
case <-ticker.C: // Some discoverers send updates too often so we throttle these with the ticker. case <-ticker.C: // Some discoverers send updates too often so we throttle these with the ticker.
select { select {
case <-triggerUpdate: case <-m.trigger:
sentUpdates.Inc()
select { select {
case m.syncCh <- m.allGroups(): case m.syncCh <- m.allGroups():
default: default:
level.Debug(m.logger).Log("msg", "discovery receiver's channel was full so will retry the next cycle", "provider", p.name) delayedUpdates.Inc()
level.Debug(m.logger).Log("msg", "discovery receiver's channel was full so will retry the next cycle")
select { select {
case triggerUpdate <- struct{}{}: case m.trigger <- struct{}{}:
default: default:
} }
} }
@ -245,13 +279,16 @@ func (m *Manager) allGroups() map[string][]*targetgroup.Group {
defer m.mtx.Unlock() defer m.mtx.Unlock()
tSets := map[string][]*targetgroup.Group{} tSets := map[string][]*targetgroup.Group{}
var n int
for pkey, tsets := range m.targets { for pkey, tsets := range m.targets {
for _, tg := range tsets { for _, tg := range tsets {
// Even if the target group 'tg' is empty we still need to send it to the 'Scrape manager' // Even if the target group 'tg' is empty we still need to send it to the 'Scrape manager'
// to signal that it needs to stop all scrape loops for this target set. // to signal that it needs to stop all scrape loops for this target set.
tSets[pkey.setName] = append(tSets[pkey.setName], tg) tSets[pkey.setName] = append(tSets[pkey.setName], tg)
n += len(tg.Targets)
} }
} }
discoveredTargets.Set(float64(n))
return tSets return tSets
} }