prometheus/storage/remote/write.go
Pedro Tanaka bab587b9dc
Agent: allow for ingestion of CT samples (#15124)
* Remove unused option from HeadOptions
* Improve docs for appendable() method in head appender
* Ingest CT (float) samples in Agent DB
* Allow for ingestion of CT native histograms
* Add some verification for CT timestamps
* Validate CT histograms before append and add newly created series to pending series
* Check the WAL for written samples
* Check for samples in test
* Add case for validations
* Fix comparison when dedupelabels is enabled
* Unite tests, use table testing
* Implement CT-related methods in timestampTracker for write storage
* Add error case to test
* Remove unused fields
* Update lastTs for series when adding CT to invalidate duplicates
* Make sure that updating the lastTs won't cause OOO later on in Commit()

Signed-off-by: Pedro Tanaka <pedro.tanaka@shopify.com>
2024-10-27 01:06:34 +01:00
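
The commit routes created-timestamp (CT) ingestion through the appender interface that this file implements for remote write. Below is a minimal, hypothetical sketch of how a caller holding a storage.Appendable (such as the WriteStorage defined in this file) might emit a CT zero sample ahead of the first real sample; the package name, function name, series labels, timestamps and values are made up for illustration.

package example

import (
	"context"

	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/storage"
)

// appendWithCT writes a created-timestamp zero sample followed by the actual
// sample for a hypothetical counter series, then commits the appender.
func appendWithCT(ctx context.Context, app storage.Appendable) error {
	a := app.Appender(ctx)
	lbls := labels.FromStrings("__name__", "http_requests_total", "job", "example")

	// Emit the created timestamp as a zero-value sample at the CT (placeholder
	// millisecond timestamps), attached to the same series as the real sample.
	if _, err := a.AppendCTZeroSample(0, lbls, 1700000060000, 1700000000000); err != nil {
		_ = a.Rollback()
		return err
	}
	// Then append the actual sample at its own timestamp.
	if _, err := a.Append(0, lbls, 1700000060000, 42); err != nil {
		_ = a.Rollback()
		return err
	}
	return a.Commit()
}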


// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package remote
import (
"context"
"errors"
"fmt"
"log/slog"
"math"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/common/promslog"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/model/exemplar"
"github.com/prometheus/prometheus/model/histogram"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/model/metadata"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/tsdb/wlog"
)
var (
samplesIn = promauto.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "samples_in_total",
Help: "Samples in to remote storage, compare to samples out for queue managers.",
})
exemplarsIn = promauto.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "exemplars_in_total",
Help: "Exemplars in to remote storage, compare to exemplars out for queue managers.",
})
histogramsIn = promauto.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "histograms_in_total",
Help: "HistogramSamples in to remote storage, compare to histograms out for queue managers.",
})
)
// WriteStorage represents all the remote write storage.
type WriteStorage struct {
logger *slog.Logger
reg prometheus.Registerer
mtx sync.Mutex
watcherMetrics *wlog.WatcherMetrics
liveReaderMetrics *wlog.LiveReaderMetrics
externalLabels labels.Labels
dir string
queues map[string]*QueueManager
metadataInWAL bool
samplesIn *ewmaRate
flushDeadline time.Duration
interner *pool
scraper ReadyScrapeManager
quit chan struct{}
// For timestampTracker.
highestTimestamp *maxTimestamp
}
// NewWriteStorage creates and runs a WriteStorage.
func NewWriteStorage(logger *slog.Logger, reg prometheus.Registerer, dir string, flushDeadline time.Duration, sm ReadyScrapeManager, metadataInWal bool) *WriteStorage {
if logger == nil {
logger = promslog.NewNopLogger()
}
rws := &WriteStorage{
queues: make(map[string]*QueueManager),
watcherMetrics: wlog.NewWatcherMetrics(reg),
liveReaderMetrics: wlog.NewLiveReaderMetrics(reg),
logger: logger,
reg: reg,
flushDeadline: flushDeadline,
samplesIn: newEWMARate(ewmaWeight, shardUpdateDuration),
dir: dir,
interner: newPool(),
scraper: sm,
quit: make(chan struct{}),
metadataInWAL: metadataInWal,
highestTimestamp: &maxTimestamp{
Gauge: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "highest_timestamp_in_seconds",
Help: "Highest timestamp that has come into the remote storage via the Appender interface, in seconds since epoch. Initialized to 0 when no data has been received yet.",
}),
},
}
if reg != nil {
reg.MustRegister(rws.highestTimestamp)
}
go rws.run()
return rws
}
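// run ticks the samplesIn rate every shardUpdateDuration until the WriteStorage is closed.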
func (rws *WriteStorage) run() {
ticker := time.NewTicker(shardUpdateDuration)
defer ticker.Stop()
for {
select {
case <-ticker.C:
rws.samplesIn.tick()
case <-rws.quit:
return
}
}
}
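// Notify signals the WAL watcher of every queue that new data has been written to the WAL.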
func (rws *WriteStorage) Notify() {
rws.mtx.Lock()
defer rws.mtx.Unlock()
for _, q := range rws.queues {
// These should all be non-blocking.
q.watcher.Notify()
}
}
// ApplyConfig updates the state as the new config requires.
// Only stop & create queues which have changes.
func (rws *WriteStorage) ApplyConfig(conf *config.Config) error {
rws.mtx.Lock()
defer rws.mtx.Unlock()
// Remote write queues only need to change if the remote write config or
// external labels change.
externalLabelUnchanged := labels.Equal(conf.GlobalConfig.ExternalLabels, rws.externalLabels)
rws.externalLabels = conf.GlobalConfig.ExternalLabels
newQueues := make(map[string]*QueueManager)
newHashes := []string{}
for _, rwConf := range conf.RemoteWriteConfigs {
if rwConf.ProtobufMessage == config.RemoteWriteProtoMsgV2 && !rws.metadataInWAL {
return errors.New("invalid remote write configuration, if you are using remote write version 2.0 the `--enable-feature=metadata-wal-records` feature flag must be enabled")
}
hash, err := toHash(rwConf)
if err != nil {
return err
}
// Don't allow duplicate remote write configs.
if _, ok := newQueues[hash]; ok {
return fmt.Errorf("duplicate remote write configs are not allowed, found duplicate for URL: %s", rwConf.URL)
}
// Set the queue name to the config hash if the user has not set
// a name in their remote write config so we can still differentiate
// between queues that have the same remote write endpoint.
name := hash[:6]
if rwConf.Name != "" {
name = rwConf.Name
}
c, err := NewWriteClient(name, &ClientConfig{
URL: rwConf.URL,
WriteProtoMsg: rwConf.ProtobufMessage,
Timeout: rwConf.RemoteTimeout,
HTTPClientConfig: rwConf.HTTPClientConfig,
SigV4Config: rwConf.SigV4Config,
AzureADConfig: rwConf.AzureADConfig,
GoogleIAMConfig: rwConf.GoogleIAMConfig,
Headers: rwConf.Headers,
RetryOnRateLimit: rwConf.QueueConfig.RetryOnRateLimit,
})
if err != nil {
return err
}
queue, ok := rws.queues[hash]
if externalLabelUnchanged && ok {
// Update the client in case any secret configuration has changed.
queue.SetClient(c)
newQueues[hash] = queue
delete(rws.queues, hash)
continue
}
// Redacted to remove any passwords in the URL (that are
// technically accepted but not recommended) since this is
// only used for metric labels.
endpoint := rwConf.URL.Redacted()
newQueues[hash] = NewQueueManager(
newQueueManagerMetrics(rws.reg, name, endpoint),
rws.watcherMetrics,
rws.liveReaderMetrics,
rws.logger,
rws.dir,
rws.samplesIn,
rwConf.QueueConfig,
rwConf.MetadataConfig,
conf.GlobalConfig.ExternalLabels,
rwConf.WriteRelabelConfigs,
c,
rws.flushDeadline,
rws.interner,
rws.highestTimestamp,
rws.scraper,
rwConf.SendExemplars,
rwConf.SendNativeHistograms,
rwConf.ProtobufMessage,
)
// Keep track of which queues are new so we know which to start.
newHashes = append(newHashes, hash)
}
// Anything remaining in rws.queues is a queue whose config has
// changed or was removed from the overall remote write config.
for _, q := range rws.queues {
q.Stop()
}
for _, hash := range newHashes {
newQueues[hash].Start()
}
rws.queues = newQueues
return nil
}
// Appender implements storage.Storage.
func (rws *WriteStorage) Appender(_ context.Context) storage.Appender {
return &timestampTracker{
writeStorage: rws,
highestRecvTimestamp: rws.highestTimestamp,
}
}
// LowestSentTimestamp returns the lowest sent timestamp across all queues.
func (rws *WriteStorage) LowestSentTimestamp() int64 {
rws.mtx.Lock()
defer rws.mtx.Unlock()
var lowestTs int64 = math.MaxInt64
for _, q := range rws.queues {
ts := int64(q.metrics.highestSentTimestamp.Get() * 1000)
if ts < lowestTs {
lowestTs = ts
}
}
if len(rws.queues) == 0 {
lowestTs = 0
}
return lowestTs
}
// Close closes the WriteStorage.
func (rws *WriteStorage) Close() error {
rws.mtx.Lock()
defer rws.mtx.Unlock()
for _, q := range rws.queues {
q.Stop()
}
close(rws.quit)
return nil
}
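// timestampTracker is the storage.Appender returned by WriteStorage. It only
// counts incoming samples, exemplars and histograms and records the highest
// received timestamp; the data itself reaches remote endpoints via the WAL watchers.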
type timestampTracker struct {
writeStorage *WriteStorage
appendOptions *storage.AppendOptions
samples int64
exemplars int64
histograms int64
highestTimestamp int64
highestRecvTimestamp *maxTimestamp
}
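// SetOptions stores the append options; the tracker itself does not use them.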
func (t *timestampTracker) SetOptions(opts *storage.AppendOptions) {
t.appendOptions = opts
}
// Append implements storage.Appender.
func (t *timestampTracker) Append(_ storage.SeriesRef, _ labels.Labels, ts int64, _ float64) (storage.SeriesRef, error) {
t.samples++
if ts > t.highestTimestamp {
t.highestTimestamp = ts
}
return 0, nil
}
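// AppendExemplar implements storage.Appender.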
func (t *timestampTracker) AppendExemplar(_ storage.SeriesRef, _ labels.Labels, _ exemplar.Exemplar) (storage.SeriesRef, error) {
t.exemplars++
return 0, nil
}
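// AppendHistogram implements storage.Appender.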
func (t *timestampTracker) AppendHistogram(_ storage.SeriesRef, _ labels.Labels, ts int64, _ *histogram.Histogram, _ *histogram.FloatHistogram) (storage.SeriesRef, error) {
t.histograms++
if ts > t.highestTimestamp {
t.highestTimestamp = ts
}
return 0, nil
}
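// AppendCTZeroSample implements storage.Appender.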
func (t *timestampTracker) AppendCTZeroSample(_ storage.SeriesRef, _ labels.Labels, _, ct int64) (storage.SeriesRef, error) {
t.samples++
if ct > t.highestTimestamp {
// Theoretically, we should never see a CT zero sample with a timestamp higher than the highest timestamp we've seen so far.
// However, we're not going to enforce that here, as it is not the responsibility of the tracker to enforce this.
t.highestTimestamp = ct
}
return 0, nil
}
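// AppendHistogramCTZeroSample implements storage.Appender.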
func (t *timestampTracker) AppendHistogramCTZeroSample(_ storage.SeriesRef, _ labels.Labels, _, ct int64, _ *histogram.Histogram, _ *histogram.FloatHistogram) (storage.SeriesRef, error) {
t.histograms++
if ct > t.highestTimestamp {
// Theoretically, we should never see a CT zero sample with a timestamp higher than the highest timestamp we've seen so far.
// However, we're not going to enforce that here, as it is not the responsibility of the tracker to enforce this.
t.highestTimestamp = ct
}
return 0, nil
}
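// UpdateMetadata implements storage.Appender.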
func (t *timestampTracker) UpdateMetadata(_ storage.SeriesRef, _ labels.Labels, _ metadata.Metadata) (storage.SeriesRef, error) {
// TODO: Add and increment a `metadata` field when we get around to wiring metadata in remote_write.
// UpdateMetadata is no-op for remote write (where timestampTracker is being used) for now.
return 0, nil
}
// Commit implements storage.Appender.
func (t *timestampTracker) Commit() error {
t.writeStorage.samplesIn.incr(t.samples + t.exemplars + t.histograms)
samplesIn.Add(float64(t.samples))
exemplarsIn.Add(float64(t.exemplars))
histogramsIn.Add(float64(t.histograms))
t.highestRecvTimestamp.Set(float64(t.highestTimestamp / 1000))
return nil
}
// Rollback implements storage.Appender.
func (*timestampTracker) Rollback() error {
return nil
}