mirror of
https://github.com/prometheus/prometheus.git
synced 2025-03-05 20:59:13 -08:00
To experiment with Prometheus pull methond on semi-short lived serverless environments we (Google Cloud) want to attempt init and shutdown scrapes to have some basic data from short lived workloads. This has little sense for Prometheus scrape manager, but it might be useful for external importers like OpenTelemetry and distributions, which can be deployed in various configurations (e.g. sidecar with just single target). It's up to us to merge it or close. I would be fine maintaining on upstream, but we might as well keep it in our fork to experiment on this--and perhaps merge once official contrib Otel decides to use those options. NOTE: Also added high level scrape manager test. I think it was kind of bad we never had test integrating all scrape manager pieces. Can add that in separate PR as well. Alternatives attempted: * Manager Option for scrape on shutdown. This was not trivial due to 3 different context we pass. We would need to disconnect them from parent context (sometimes) for scrapeAndReport. Intrusive and prone to mistaken cancelations. * ForceScrape method. This is not trivia as the scrape would need to be additionally locked. Plus semantics on what to do after (continue interval?) is not clear. We can scope this problem down to stopAndScrape semantics. Signed-off-by: bwplotka <bwplotka@gmail.com>
412 lines
11 KiB
Go
412 lines
11 KiB
Go
// Copyright 2013 The Prometheus Authors
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package scrape
|
|
|
|
import (
|
|
"fmt"
|
|
"hash/fnv"
|
|
"reflect"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/go-kit/log"
|
|
"github.com/go-kit/log/level"
|
|
"github.com/pkg/errors"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
config_util "github.com/prometheus/common/config"
|
|
"github.com/prometheus/common/model"
|
|
|
|
"github.com/prometheus/prometheus/config"
|
|
"github.com/prometheus/prometheus/discovery/targetgroup"
|
|
"github.com/prometheus/prometheus/model/labels"
|
|
"github.com/prometheus/prometheus/storage"
|
|
"github.com/prometheus/prometheus/util/osutil"
|
|
)
|
|
|
|
var targetMetadataCache = newMetadataMetricsCollector()
|
|
|
|
// MetadataMetricsCollector is a Custom Collector for the metadata cache metrics.
|
|
type MetadataMetricsCollector struct {
|
|
CacheEntries *prometheus.Desc
|
|
CacheBytes *prometheus.Desc
|
|
|
|
scrapeManager *Manager
|
|
}
|
|
|
|
func newMetadataMetricsCollector() *MetadataMetricsCollector {
|
|
return &MetadataMetricsCollector{
|
|
CacheEntries: prometheus.NewDesc(
|
|
"prometheus_target_metadata_cache_entries",
|
|
"Total number of metric metadata entries in the cache",
|
|
[]string{"scrape_job"},
|
|
nil,
|
|
),
|
|
CacheBytes: prometheus.NewDesc(
|
|
"prometheus_target_metadata_cache_bytes",
|
|
"The number of bytes that are currently used for storing metric metadata in the cache",
|
|
[]string{"scrape_job"},
|
|
nil,
|
|
),
|
|
}
|
|
}
|
|
|
|
func (mc *MetadataMetricsCollector) registerManager(m *Manager) {
|
|
mc.scrapeManager = m
|
|
}
|
|
|
|
// Describe sends the metrics descriptions to the channel.
|
|
func (mc *MetadataMetricsCollector) Describe(ch chan<- *prometheus.Desc) {
|
|
ch <- mc.CacheEntries
|
|
ch <- mc.CacheBytes
|
|
}
|
|
|
|
// Collect creates and sends the metrics for the metadata cache.
|
|
func (mc *MetadataMetricsCollector) Collect(ch chan<- prometheus.Metric) {
|
|
if mc.scrapeManager == nil {
|
|
return
|
|
}
|
|
|
|
for tset, targets := range mc.scrapeManager.TargetsActive() {
|
|
var size, length int
|
|
for _, t := range targets {
|
|
size += t.MetadataSize()
|
|
length += t.MetadataLength()
|
|
}
|
|
|
|
ch <- prometheus.MustNewConstMetric(
|
|
mc.CacheEntries,
|
|
prometheus.GaugeValue,
|
|
float64(length),
|
|
tset,
|
|
)
|
|
|
|
ch <- prometheus.MustNewConstMetric(
|
|
mc.CacheBytes,
|
|
prometheus.GaugeValue,
|
|
float64(size),
|
|
tset,
|
|
)
|
|
}
|
|
}
|
|
|
|
// NewManager is the Manager constructor
|
|
func NewManager(o *Options, logger log.Logger, app storage.Appendable) *Manager {
|
|
if o == nil {
|
|
o = &Options{}
|
|
}
|
|
if logger == nil {
|
|
logger = log.NewNopLogger()
|
|
}
|
|
m := &Manager{
|
|
append: app,
|
|
opts: o,
|
|
logger: logger,
|
|
scrapeConfigs: make(map[string]*config.ScrapeConfig),
|
|
scrapePools: make(map[string]*scrapePool),
|
|
graceShut: make(chan struct{}),
|
|
triggerReload: make(chan struct{}, 1),
|
|
}
|
|
targetMetadataCache.registerManager(m)
|
|
|
|
return m
|
|
}
|
|
|
|
// Options are the configuration parameters to the scrape manager.
|
|
type Options struct {
|
|
ExtraMetrics bool
|
|
NoDefaultPort bool
|
|
// Option used by downstream scraper users like OpenTelemetry Collector
|
|
// to help lookup metric metadata. Should be false for Prometheus.
|
|
PassMetadataInContext bool
|
|
// Option to enable the experimental in-memory metadata storage and append
|
|
// metadata to the WAL.
|
|
EnableMetadataStorage bool
|
|
// Option to enable protobuf negotiation with the client. Note that the client can already
|
|
// send protobuf without needing to enable this.
|
|
EnableProtobufNegotiation bool
|
|
// Option to increase the interval used by scrape manager to throttle target groups updates.
|
|
DiscoveryReloadInterval model.Duration
|
|
|
|
// Optional HTTP client options to use when scraping.
|
|
HTTPClientOptions []config_util.HTTPClientOption
|
|
|
|
// IgnoreJitter causes all targets managed by this manager to be scraped
|
|
// as soon as they are discovered. By default, all targets have offset,
|
|
// so we spread the scraping load evenly within Prometheus server.
|
|
// NOTE(bwplotka): This option is experimental and not used by Prometheus.
|
|
// It was created for serverless flavors of OpenTelemetry contrib's prometheusreceiver.
|
|
IgnoreJitter bool
|
|
}
|
|
|
|
// Manager maintains a set of scrape pools and manages start/stop cycles
|
|
// when receiving new target groups from the discovery manager.
|
|
type Manager struct {
|
|
opts *Options
|
|
logger log.Logger
|
|
append storage.Appendable
|
|
graceShut chan struct{}
|
|
|
|
offsetSeed uint64 // Global offsetSeed seed is used to spread scrape workload across HA setup.
|
|
mtxScrape sync.Mutex // Guards the fields below.
|
|
scrapeConfigs map[string]*config.ScrapeConfig
|
|
scrapePools map[string]*scrapePool
|
|
targetSets map[string][]*targetgroup.Group
|
|
|
|
triggerReload chan struct{}
|
|
}
|
|
|
|
// Run receives and saves target set updates and triggers the scraping loops reloading.
|
|
// Reloading happens in the background so that it doesn't block receiving targets updates.
|
|
func (m *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) {
|
|
go m.reloader()
|
|
for {
|
|
select {
|
|
case ts := <-tsets:
|
|
m.updateTsets(ts)
|
|
|
|
select {
|
|
case m.triggerReload <- struct{}{}:
|
|
default:
|
|
}
|
|
|
|
case <-m.graceShut:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *Manager) reloader() {
|
|
reloadIntervalDuration := m.opts.DiscoveryReloadInterval
|
|
if reloadIntervalDuration < model.Duration(5*time.Second) {
|
|
reloadIntervalDuration = model.Duration(5 * time.Second)
|
|
}
|
|
|
|
ticker := time.NewTicker(time.Duration(reloadIntervalDuration))
|
|
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-m.graceShut:
|
|
return
|
|
case <-ticker.C:
|
|
select {
|
|
case <-m.triggerReload:
|
|
m.reload()
|
|
case <-m.graceShut:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *Manager) reload() {
|
|
m.mtxScrape.Lock()
|
|
var wg sync.WaitGroup
|
|
for setName, groups := range m.targetSets {
|
|
if _, ok := m.scrapePools[setName]; !ok {
|
|
scrapeConfig, ok := m.scrapeConfigs[setName]
|
|
if !ok {
|
|
level.Error(m.logger).Log("msg", "error reloading target set", "err", "invalid config id:"+setName)
|
|
continue
|
|
}
|
|
sp, err := newScrapePool(scrapeConfig, m.append, m.offsetSeed, log.With(m.logger, "scrape_pool", setName), m.opts)
|
|
if err != nil {
|
|
level.Error(m.logger).Log("msg", "error creating new scrape pool", "err", err, "scrape_pool", setName)
|
|
continue
|
|
}
|
|
m.scrapePools[setName] = sp
|
|
}
|
|
|
|
wg.Add(1)
|
|
// Run the sync in parallel as these take a while and at high load can't catch up.
|
|
go func(sp *scrapePool, groups []*targetgroup.Group) {
|
|
sp.Sync(groups)
|
|
wg.Done()
|
|
}(m.scrapePools[setName], groups)
|
|
|
|
}
|
|
m.mtxScrape.Unlock()
|
|
wg.Wait()
|
|
}
|
|
|
|
// setOffsetSeed calculates a global offsetSeed per server relying on extra label set.
|
|
func (m *Manager) setOffsetSeed(labels labels.Labels) error {
|
|
h := fnv.New64a()
|
|
hostname, err := osutil.GetFQDN()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if _, err := fmt.Fprintf(h, "%s%s", hostname, labels.String()); err != nil {
|
|
return err
|
|
}
|
|
m.offsetSeed = h.Sum64()
|
|
return nil
|
|
}
|
|
|
|
// Stop cancels all running scrape pools and blocks until all have exited.
|
|
func (m *Manager) Stop() {
|
|
m.mtxScrape.Lock()
|
|
defer m.mtxScrape.Unlock()
|
|
|
|
for _, sp := range m.scrapePools {
|
|
sp.stop()
|
|
}
|
|
close(m.graceShut)
|
|
}
|
|
|
|
// StopAfterScrapeAttempt stops manager after ensuring all targets' last scrape
|
|
// attempt happened after minScrapeTime. It cancels all running scrape pools and
|
|
// blocks until all have exited.
|
|
//
|
|
// It is likely that such shutdown scrape will cause irregular scrape interval and
|
|
// sudden efficiency (memory, CPU) spikes. However, it can be a fair tradeoff for
|
|
// short-lived and small number of targets, where at least one or two samples could be
|
|
// preserved (and ideally aggregated further by separate processes).
|
|
//
|
|
// NOTE(bwplotka): This method is experimental and not used by Prometheus.
|
|
// It was created for serverless flavors of OpenTelemetry contrib's prometheusreceiver.
|
|
func (m *Manager) StopAfterScrapeAttempt(minScrapeTime time.Time) {
|
|
m.mtxScrape.Lock()
|
|
defer m.mtxScrape.Unlock()
|
|
|
|
var wg sync.WaitGroup
|
|
for _, p := range m.scrapePools {
|
|
for _, l := range p.loops {
|
|
l := l
|
|
wg.Add(1)
|
|
go func() {
|
|
l.stopAfterScrapeAttempt(minScrapeTime)
|
|
wg.Done()
|
|
}()
|
|
}
|
|
}
|
|
wg.Wait()
|
|
|
|
for _, sp := range m.scrapePools {
|
|
sp.stop()
|
|
}
|
|
close(m.graceShut)
|
|
}
|
|
|
|
func (m *Manager) updateTsets(tsets map[string][]*targetgroup.Group) {
|
|
m.mtxScrape.Lock()
|
|
m.targetSets = tsets
|
|
m.mtxScrape.Unlock()
|
|
}
|
|
|
|
// ApplyConfig resets the manager's target providers and job configurations as defined by the new cfg.
|
|
func (m *Manager) ApplyConfig(cfg *config.Config) error {
|
|
m.mtxScrape.Lock()
|
|
defer m.mtxScrape.Unlock()
|
|
|
|
scfgs, err := cfg.GetScrapeConfigs()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
c := make(map[string]*config.ScrapeConfig)
|
|
for _, scfg := range scfgs {
|
|
c[scfg.JobName] = scfg
|
|
}
|
|
m.scrapeConfigs = c
|
|
|
|
if err := m.setOffsetSeed(cfg.GlobalConfig.ExternalLabels); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Cleanup and reload pool if the configuration has changed.
|
|
var failed bool
|
|
for name, sp := range m.scrapePools {
|
|
switch cfg, ok := m.scrapeConfigs[name]; {
|
|
case !ok:
|
|
sp.stop()
|
|
delete(m.scrapePools, name)
|
|
case !reflect.DeepEqual(sp.config, cfg):
|
|
err := sp.reload(cfg)
|
|
if err != nil {
|
|
level.Error(m.logger).Log("msg", "error reloading scrape pool", "err", err, "scrape_pool", name)
|
|
failed = true
|
|
}
|
|
}
|
|
}
|
|
|
|
if failed {
|
|
return errors.New("failed to apply the new configuration")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// TargetsAll returns active and dropped targets grouped by job_name.
|
|
func (m *Manager) TargetsAll() map[string][]*Target {
|
|
m.mtxScrape.Lock()
|
|
defer m.mtxScrape.Unlock()
|
|
|
|
targets := make(map[string][]*Target, len(m.scrapePools))
|
|
for tset, sp := range m.scrapePools {
|
|
targets[tset] = append(sp.ActiveTargets(), sp.DroppedTargets()...)
|
|
}
|
|
return targets
|
|
}
|
|
|
|
// ScrapePools returns the list of all scrape pool names.
|
|
func (m *Manager) ScrapePools() []string {
|
|
m.mtxScrape.Lock()
|
|
defer m.mtxScrape.Unlock()
|
|
|
|
names := make([]string, 0, len(m.scrapePools))
|
|
for name := range m.scrapePools {
|
|
names = append(names, name)
|
|
}
|
|
return names
|
|
}
|
|
|
|
// TargetsActive returns the active targets currently being scraped.
|
|
func (m *Manager) TargetsActive() map[string][]*Target {
|
|
m.mtxScrape.Lock()
|
|
defer m.mtxScrape.Unlock()
|
|
|
|
var (
|
|
wg sync.WaitGroup
|
|
mtx sync.Mutex
|
|
)
|
|
|
|
targets := make(map[string][]*Target, len(m.scrapePools))
|
|
wg.Add(len(m.scrapePools))
|
|
for tset, sp := range m.scrapePools {
|
|
// Running in parallel limits the blocking time of scrapePool to scrape
|
|
// interval when there's an update from SD.
|
|
go func(tset string, sp *scrapePool) {
|
|
mtx.Lock()
|
|
targets[tset] = sp.ActiveTargets()
|
|
mtx.Unlock()
|
|
wg.Done()
|
|
}(tset, sp)
|
|
}
|
|
wg.Wait()
|
|
return targets
|
|
}
|
|
|
|
// TargetsDropped returns the dropped targets during relabelling.
|
|
func (m *Manager) TargetsDropped() map[string][]*Target {
|
|
m.mtxScrape.Lock()
|
|
defer m.mtxScrape.Unlock()
|
|
|
|
targets := make(map[string][]*Target, len(m.scrapePools))
|
|
for tset, sp := range m.scrapePools {
|
|
targets[tset] = sp.DroppedTargets()
|
|
}
|
|
return targets
|
|
}
|