prometheus/storage/remote/write.go
Đurica Yuri Nikolić 101b1c307f
Some checks failed
buf.build / lint and publish (push) Has been cancelled
CI / Go tests (push) Has been cancelled
CI / More Go tests (push) Has been cancelled
CI / Go tests with previous Go version (push) Has been cancelled
CI / UI tests (push) Has been cancelled
CI / Go tests on Windows (push) Has been cancelled
CI / Mixins tests (push) Has been cancelled
CI / Build Prometheus for common architectures (0) (push) Has been cancelled
CI / Build Prometheus for common architectures (1) (push) Has been cancelled
CI / Build Prometheus for common architectures (2) (push) Has been cancelled
CI / Build Prometheus for all architectures (0) (push) Has been cancelled
CI / Build Prometheus for all architectures (1) (push) Has been cancelled
CI / Build Prometheus for all architectures (10) (push) Has been cancelled
CI / Build Prometheus for all architectures (11) (push) Has been cancelled
CI / Build Prometheus for all architectures (2) (push) Has been cancelled
CI / Build Prometheus for all architectures (3) (push) Has been cancelled
CI / Build Prometheus for all architectures (4) (push) Has been cancelled
CI / Build Prometheus for all architectures (5) (push) Has been cancelled
CI / Build Prometheus for all architectures (6) (push) Has been cancelled
CI / Build Prometheus for all architectures (7) (push) Has been cancelled
CI / Build Prometheus for all architectures (8) (push) Has been cancelled
CI / Build Prometheus for all architectures (9) (push) Has been cancelled
CI / Check generated parser (push) Has been cancelled
CI / golangci-lint (push) Has been cancelled
CI / fuzzing (push) Has been cancelled
CI / codeql (push) Has been cancelled
Scorecards supply-chain security / Scorecards analysis (push) Has been cancelled
CI / Report status of build Prometheus for all architectures (push) Has been cancelled
CI / Publish main branch artifacts (push) Has been cancelled
CI / Publish release artefacts (push) Has been cancelled
CI / Publish UI on npm Registry (push) Has been cancelled
[ENHANCEMEN] Remote-Write: optionally use a DNS resolver that picks a random IP (#15329)
When a remote-write is executed towards a host name that is resolved to multiple IP addresses, this PR introduces a possibility to force creation of new connections used for the remote-write request to a randomly chosen IP address from the ones corresponding to the host name. The default behavior remains unchanged, i.s., the IP address used for the connection creation remains the one chosen by Go.

This is an experimental feature, it is disabled by default.

Signed-off-by: Yuri Nikolic <durica.nikolic@grafana.com>
2024-11-15 15:41:49 +00:00

357 lines
11 KiB
Go

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package remote
import (
"context"
"errors"
"fmt"
"log/slog"
"math"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/common/promslog"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/model/exemplar"
"github.com/prometheus/prometheus/model/histogram"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/model/metadata"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/tsdb/wlog"
)
var (
samplesIn = promauto.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "samples_in_total",
Help: "Samples in to remote storage, compare to samples out for queue managers.",
})
exemplarsIn = promauto.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "exemplars_in_total",
Help: "Exemplars in to remote storage, compare to exemplars out for queue managers.",
})
histogramsIn = promauto.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "histograms_in_total",
Help: "HistogramSamples in to remote storage, compare to histograms out for queue managers.",
})
)
// WriteStorage represents all the remote write storage.
type WriteStorage struct {
logger *slog.Logger
reg prometheus.Registerer
mtx sync.Mutex
watcherMetrics *wlog.WatcherMetrics
liveReaderMetrics *wlog.LiveReaderMetrics
externalLabels labels.Labels
dir string
queues map[string]*QueueManager
metadataInWAL bool
samplesIn *ewmaRate
flushDeadline time.Duration
interner *pool
scraper ReadyScrapeManager
quit chan struct{}
// For timestampTracker.
highestTimestamp *maxTimestamp
}
// NewWriteStorage creates and runs a WriteStorage.
func NewWriteStorage(logger *slog.Logger, reg prometheus.Registerer, dir string, flushDeadline time.Duration, sm ReadyScrapeManager, metadataInWal bool) *WriteStorage {
if logger == nil {
logger = promslog.NewNopLogger()
}
rws := &WriteStorage{
queues: make(map[string]*QueueManager),
watcherMetrics: wlog.NewWatcherMetrics(reg),
liveReaderMetrics: wlog.NewLiveReaderMetrics(reg),
logger: logger,
reg: reg,
flushDeadline: flushDeadline,
samplesIn: newEWMARate(ewmaWeight, shardUpdateDuration),
dir: dir,
interner: newPool(),
scraper: sm,
quit: make(chan struct{}),
metadataInWAL: metadataInWal,
highestTimestamp: &maxTimestamp{
Gauge: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "highest_timestamp_in_seconds",
Help: "Highest timestamp that has come into the remote storage via the Appender interface, in seconds since epoch. Initialized to 0 when no data has been received yet.",
}),
},
}
if reg != nil {
reg.MustRegister(rws.highestTimestamp)
}
go rws.run()
return rws
}
func (rws *WriteStorage) run() {
ticker := time.NewTicker(shardUpdateDuration)
defer ticker.Stop()
for {
select {
case <-ticker.C:
rws.samplesIn.tick()
case <-rws.quit:
return
}
}
}
func (rws *WriteStorage) Notify() {
rws.mtx.Lock()
defer rws.mtx.Unlock()
for _, q := range rws.queues {
// These should all be non blocking
q.watcher.Notify()
}
}
// ApplyConfig updates the state as the new config requires.
// Only stop & create queues which have changes.
func (rws *WriteStorage) ApplyConfig(conf *config.Config) error {
rws.mtx.Lock()
defer rws.mtx.Unlock()
// Remote write queues only need to change if the remote write config or
// external labels change.
externalLabelUnchanged := labels.Equal(conf.GlobalConfig.ExternalLabels, rws.externalLabels)
rws.externalLabels = conf.GlobalConfig.ExternalLabels
newQueues := make(map[string]*QueueManager)
newHashes := []string{}
for _, rwConf := range conf.RemoteWriteConfigs {
if rwConf.ProtobufMessage == config.RemoteWriteProtoMsgV2 && !rws.metadataInWAL {
return errors.New("invalid remote write configuration, if you are using remote write version 2.0 the `--enable-feature=metadata-wal-records` feature flag must be enabled")
}
hash, err := toHash(rwConf)
if err != nil {
return err
}
// Don't allow duplicate remote write configs.
if _, ok := newQueues[hash]; ok {
return fmt.Errorf("duplicate remote write configs are not allowed, found duplicate for URL: %s", rwConf.URL)
}
// Set the queue name to the config hash if the user has not set
// a name in their remote write config so we can still differentiate
// between queues that have the same remote write endpoint.
name := hash[:6]
if rwConf.Name != "" {
name = rwConf.Name
}
c, err := NewWriteClient(name, &ClientConfig{
URL: rwConf.URL,
WriteProtoMsg: rwConf.ProtobufMessage,
Timeout: rwConf.RemoteTimeout,
HTTPClientConfig: rwConf.HTTPClientConfig,
SigV4Config: rwConf.SigV4Config,
AzureADConfig: rwConf.AzureADConfig,
GoogleIAMConfig: rwConf.GoogleIAMConfig,
Headers: rwConf.Headers,
RetryOnRateLimit: rwConf.QueueConfig.RetryOnRateLimit,
RoundRobinDNS: rwConf.RoundRobinDNS,
})
if err != nil {
return err
}
queue, ok := rws.queues[hash]
if externalLabelUnchanged && ok {
// Update the client in case any secret configuration has changed.
queue.SetClient(c)
newQueues[hash] = queue
delete(rws.queues, hash)
continue
}
// Redacted to remove any passwords in the URL (that are
// technically accepted but not recommended) since this is
// only used for metric labels.
endpoint := rwConf.URL.Redacted()
newQueues[hash] = NewQueueManager(
newQueueManagerMetrics(rws.reg, name, endpoint),
rws.watcherMetrics,
rws.liveReaderMetrics,
rws.logger,
rws.dir,
rws.samplesIn,
rwConf.QueueConfig,
rwConf.MetadataConfig,
conf.GlobalConfig.ExternalLabels,
rwConf.WriteRelabelConfigs,
c,
rws.flushDeadline,
rws.interner,
rws.highestTimestamp,
rws.scraper,
rwConf.SendExemplars,
rwConf.SendNativeHistograms,
rwConf.ProtobufMessage,
)
// Keep track of which queues are new so we know which to start.
newHashes = append(newHashes, hash)
}
// Anything remaining in rws.queues is a queue who's config has
// changed or was removed from the overall remote write config.
for _, q := range rws.queues {
q.Stop()
}
for _, hash := range newHashes {
newQueues[hash].Start()
}
rws.queues = newQueues
return nil
}
// Appender implements storage.Storage.
func (rws *WriteStorage) Appender(_ context.Context) storage.Appender {
return &timestampTracker{
writeStorage: rws,
highestRecvTimestamp: rws.highestTimestamp,
}
}
// LowestSentTimestamp returns the lowest sent timestamp across all queues.
func (rws *WriteStorage) LowestSentTimestamp() int64 {
rws.mtx.Lock()
defer rws.mtx.Unlock()
var lowestTs int64 = math.MaxInt64
for _, q := range rws.queues {
ts := int64(q.metrics.highestSentTimestamp.Get() * 1000)
if ts < lowestTs {
lowestTs = ts
}
}
if len(rws.queues) == 0 {
lowestTs = 0
}
return lowestTs
}
// Close closes the WriteStorage.
func (rws *WriteStorage) Close() error {
rws.mtx.Lock()
defer rws.mtx.Unlock()
for _, q := range rws.queues {
q.Stop()
}
close(rws.quit)
return nil
}
type timestampTracker struct {
writeStorage *WriteStorage
appendOptions *storage.AppendOptions
samples int64
exemplars int64
histograms int64
highestTimestamp int64
highestRecvTimestamp *maxTimestamp
}
func (t *timestampTracker) SetOptions(opts *storage.AppendOptions) {
t.appendOptions = opts
}
// Append implements storage.Appender.
func (t *timestampTracker) Append(_ storage.SeriesRef, _ labels.Labels, ts int64, _ float64) (storage.SeriesRef, error) {
t.samples++
if ts > t.highestTimestamp {
t.highestTimestamp = ts
}
return 0, nil
}
func (t *timestampTracker) AppendExemplar(_ storage.SeriesRef, _ labels.Labels, _ exemplar.Exemplar) (storage.SeriesRef, error) {
t.exemplars++
return 0, nil
}
func (t *timestampTracker) AppendHistogram(_ storage.SeriesRef, _ labels.Labels, ts int64, _ *histogram.Histogram, _ *histogram.FloatHistogram) (storage.SeriesRef, error) {
t.histograms++
if ts > t.highestTimestamp {
t.highestTimestamp = ts
}
return 0, nil
}
func (t *timestampTracker) AppendCTZeroSample(_ storage.SeriesRef, _ labels.Labels, _, ct int64) (storage.SeriesRef, error) {
t.samples++
if ct > t.highestTimestamp {
// Theoretically, we should never see a CT zero sample with a timestamp higher than the highest timestamp we've seen so far.
// However, we're not going to enforce that here, as it is not the responsibility of the tracker to enforce this.
t.highestTimestamp = ct
}
return 0, nil
}
func (t *timestampTracker) AppendHistogramCTZeroSample(_ storage.SeriesRef, _ labels.Labels, _, ct int64, _ *histogram.Histogram, _ *histogram.FloatHistogram) (storage.SeriesRef, error) {
t.histograms++
if ct > t.highestTimestamp {
// Theoretically, we should never see a CT zero sample with a timestamp higher than the highest timestamp we've seen so far.
// However, we're not going to enforce that here, as it is not the responsibility of the tracker to enforce this.
t.highestTimestamp = ct
}
return 0, nil
}
func (t *timestampTracker) UpdateMetadata(_ storage.SeriesRef, _ labels.Labels, _ metadata.Metadata) (storage.SeriesRef, error) {
// TODO: Add and increment a `metadata` field when we get around to wiring metadata in remote_write.
// UpdateMetadata is no-op for remote write (where timestampTracker is being used) for now.
return 0, nil
}
// Commit implements storage.Appender.
func (t *timestampTracker) Commit() error {
t.writeStorage.samplesIn.incr(t.samples + t.exemplars + t.histograms)
samplesIn.Add(float64(t.samples))
exemplarsIn.Add(float64(t.exemplars))
histogramsIn.Add(float64(t.histograms))
t.highestRecvTimestamp.Set(float64(t.highestTimestamp / 1000))
return nil
}
// Rollback implements storage.Appender.
func (*timestampTracker) Rollback() error {
return nil
}