Merge pull request #7073 from csmarchbanks/fix-md5-remote-write

Fix remote write not updating when relabel configs or secrets change
This commit is contained in:
Chris Marchbanks 2020-04-16 16:36:25 -06:00 committed by GitHub
commit cd12f0873c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 106 additions and 75 deletions

View file

@ -237,9 +237,11 @@ type QueueManager struct {
cfg config.QueueConfig
externalLabels labels.Labels
relabelConfigs []*relabel.Config
client StorageClient
watcher *wal.Watcher
clientMtx sync.RWMutex
storeClient StorageClient
seriesMtx sync.Mutex
seriesLabels map[uint64]labels.Labels
seriesSegmentIndexes map[uint64]int
@ -283,7 +285,7 @@ func NewQueueManager(metrics *queueManagerMetrics, watcherMetrics *wal.WatcherMe
cfg: cfg,
externalLabels: externalLabels,
relabelConfigs: relabelConfigs,
client: client,
storeClient: client,
seriesLabels: make(map[uint64]labels.Labels),
seriesSegmentIndexes: make(map[uint64]int),
@ -358,8 +360,8 @@ func (t *QueueManager) Start() {
// Setup the QueueManagers metrics. We do this here rather than in the
// constructor because of the ordering of creating Queue Managers's, stopping them,
// and then starting new ones in storage/remote/storage.go ApplyConfig.
name := t.client.Name()
ep := t.client.Endpoint()
name := t.client().Name()
ep := t.client().Endpoint()
t.highestSentTimestampMetric = &maxGauge{
Gauge: t.metrics.queueHighestSentTimestamp.WithLabelValues(name, ep),
}
@ -413,8 +415,8 @@ func (t *QueueManager) Stop() {
}
t.seriesMtx.Unlock()
// Delete metrics so we don't have alerts for queues that are gone.
name := t.client.Name()
ep := t.client.Endpoint()
name := t.client().Name()
ep := t.client().Endpoint()
t.metrics.queueHighestSentTimestamp.DeleteLabelValues(name, ep)
t.metrics.queuePendingSamples.DeleteLabelValues(name, ep)
t.metrics.enqueueRetriesTotal.DeleteLabelValues(name, ep)
@ -472,6 +474,20 @@ func (t *QueueManager) SeriesReset(index int) {
}
}
// SetClient updates the client used by a queue. Used when only client specific
// fields are updated to avoid restarting the queue.
func (t *QueueManager) SetClient(c StorageClient) {
t.clientMtx.Lock()
t.storeClient = c
t.clientMtx.Unlock()
}
func (t *QueueManager) client() StorageClient {
t.clientMtx.RLock()
defer t.clientMtx.RUnlock()
return t.storeClient
}
func internLabels(lbls labels.Labels) {
for i, l := range lbls {
lbls[i].Name = interner.intern(l.Name)
@ -866,7 +882,7 @@ func (s *shards) sendSamplesWithBackoff(ctx context.Context, samples []prompb.Ti
default:
}
begin := time.Now()
err := s.qm.client.Store(ctx, req)
err := s.qm.client().Store(ctx, req)
s.qm.sentBatchDuration.Observe(time.Since(begin).Seconds())

View file

@ -17,12 +17,12 @@ import (
"context"
"crypto/md5"
"encoding/hex"
"encoding/json"
"fmt"
"sync"
"time"
"github.com/go-kit/kit/log"
"gopkg.in/yaml.v2"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
@ -174,7 +174,7 @@ func labelsToEqualityMatchers(ls model.LabelSet) []*labels.Matcher {
// Used for hashing configs and diff'ing hashes in ApplyConfig.
func toHash(data interface{}) (string, error) {
bytes, err := json.Marshal(data)
bytes, err := yaml.Marshal(data)
if err != nil {
return "", err
}

View file

@ -51,7 +51,7 @@ func TestStorageLifecycle(t *testing.T) {
},
}
s.ApplyConfig(conf)
testutil.Ok(t, s.ApplyConfig(conf))
// make sure remote write has a queue.
testutil.Equals(t, 1, len(s.rws.queues))
@ -73,13 +73,13 @@ func TestUpdateRemoteReadConfigs(t *testing.T) {
conf := &config.Config{
GlobalConfig: config.GlobalConfig{},
}
s.ApplyConfig(conf)
testutil.Ok(t, s.ApplyConfig(conf))
testutil.Equals(t, 0, len(s.queryables))
conf.RemoteReadConfigs = []*config.RemoteReadConfig{
&config.DefaultRemoteReadConfig,
}
s.ApplyConfig(conf)
testutil.Ok(t, s.ApplyConfig(conf))
testutil.Equals(t, 1, len(s.queryables))
err = s.Close()

View file

@ -19,7 +19,6 @@ import (
"time"
"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/prometheus/config"
@ -53,8 +52,7 @@ type WriteStorage struct {
queueMetrics *queueManagerMetrics
watcherMetrics *wal.WatcherMetrics
liveReaderMetrics *wal.LiveReaderMetrics
configHash string
externalLabelHash string
externalLabels labels.Labels
walDir string
queues map[string]*QueueManager
samplesIn *ewmaRate
@ -94,25 +92,10 @@ func (rws *WriteStorage) ApplyConfig(conf *config.Config) error {
rws.mtx.Lock()
defer rws.mtx.Unlock()
configHash, err := toHash(conf.RemoteWriteConfigs)
if err != nil {
return err
}
externalLabelHash, err := toHash(conf.GlobalConfig.ExternalLabels)
if err != nil {
return err
}
// Remote write queues only need to change if the remote write config or
// external labels change.
externalLabelUnchanged := externalLabelHash == rws.externalLabelHash
if configHash == rws.configHash && externalLabelUnchanged {
level.Debug(rws.logger).Log("msg", "Remote write config has not changed, no need to restart QueueManagers")
return nil
}
rws.configHash = configHash
rws.externalLabelHash = externalLabelHash
externalLabelUnchanged := labels.Equal(conf.GlobalConfig.ExternalLabels, rws.externalLabels)
rws.externalLabels = conf.GlobalConfig.ExternalLabels
newQueues := make(map[string]*QueueManager)
newHashes := []string{}
@ -122,6 +105,11 @@ func (rws *WriteStorage) ApplyConfig(conf *config.Config) error {
return err
}
// Don't allow duplicate remote write configs.
if _, ok := newQueues[hash]; ok {
return fmt.Errorf("duplicate remote write configs are not allowed, found duplicate for URL: %s", rwConf.URL)
}
// Set the queue name to the config hash if the user has not set
// a name in their remote write config so we can still differentiate
// between queues that have the same remote write endpoint.
@ -130,22 +118,6 @@ func (rws *WriteStorage) ApplyConfig(conf *config.Config) error {
name = rwConf.Name
}
// Don't allow duplicate remote write configs.
if _, ok := newQueues[hash]; ok {
return fmt.Errorf("duplicate remote write configs are not allowed, found duplicate for URL: %s", rwConf.URL)
}
var nameUnchanged bool
queue, ok := rws.queues[hash]
if ok {
nameUnchanged = queue.client.Name() == name
}
if externalLabelUnchanged && nameUnchanged {
newQueues[hash] = queue
delete(rws.queues, hash)
continue
}
c, err := NewClient(name, &ClientConfig{
URL: rwConf.URL,
Timeout: rwConf.RemoteTimeout,
@ -154,6 +126,16 @@ func (rws *WriteStorage) ApplyConfig(conf *config.Config) error {
if err != nil {
return err
}
queue, ok := rws.queues[hash]
if externalLabelUnchanged && ok {
// Update the client in case any secret configuration has changed.
queue.SetClient(c)
newQueues[hash] = queue
delete(rws.queues, hash)
continue
}
newQueues[hash] = NewQueueManager(
rws.queueMetrics,
rws.watcherMetrics,

View file

@ -24,6 +24,7 @@ import (
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/pkg/labels"
"github.com/prometheus/prometheus/pkg/relabel"
"github.com/prometheus/prometheus/util/testutil"
)
@ -140,14 +141,14 @@ func TestRestartOnNameChange(t *testing.T) {
},
}
testutil.Ok(t, s.ApplyConfig(conf))
testutil.Equals(t, s.queues[hash].client.Name(), cfg.Name)
testutil.Equals(t, s.queues[hash].client().Name(), cfg.Name)
// Change the queues name, ensure the queue has been restarted.
conf.RemoteWriteConfigs[0].Name = "dev-2"
testutil.Ok(t, s.ApplyConfig(conf))
hash, err = toHash(cfg)
testutil.Ok(t, err)
testutil.Equals(t, s.queues[hash].client.Name(), conf.RemoteWriteConfigs[0].Name)
testutil.Equals(t, s.queues[hash].client().Name(), conf.RemoteWriteConfigs[0].Name)
err = s.Close()
testutil.Ok(t, err)
@ -247,10 +248,18 @@ func TestWriteStorageApplyConfigsPartialUpdate(t *testing.T) {
c0 := &config.RemoteWriteConfig{
RemoteTimeout: model.Duration(10 * time.Second),
QueueConfig: config.DefaultQueueConfig,
WriteRelabelConfigs: []*relabel.Config{
&relabel.Config{
Regex: relabel.MustNewRegexp(".+"),
},
},
}
c1 := &config.RemoteWriteConfig{
RemoteTimeout: model.Duration(20 * time.Second),
QueueConfig: config.DefaultQueueConfig,
HTTPClientConfig: common_config.HTTPClientConfig{
BearerToken: "foo",
},
}
c2 := &config.RemoteWriteConfig{
RemoteTimeout: model.Duration(30 * time.Second),
@ -262,43 +271,63 @@ func TestWriteStorageApplyConfigsPartialUpdate(t *testing.T) {
RemoteWriteConfigs: []*config.RemoteWriteConfig{c0, c1, c2},
}
// We need to set URL's so that metric creation doesn't panic.
conf.RemoteWriteConfigs[0].URL = &common_config.URL{
URL: &url.URL{
Host: "http://test-storage.com",
},
for i := range conf.RemoteWriteConfigs {
conf.RemoteWriteConfigs[i].URL = &common_config.URL{
URL: &url.URL{
Host: "http://test-storage.com",
},
}
}
conf.RemoteWriteConfigs[1].URL = &common_config.URL{
URL: &url.URL{
Host: "http://test-storage.com",
},
}
conf.RemoteWriteConfigs[2].URL = &common_config.URL{
URL: &url.URL{
Host: "http://test-storage.com",
},
}
s.ApplyConfig(conf)
testutil.Ok(t, s.ApplyConfig(conf))
testutil.Equals(t, 3, len(s.queues))
c0Hash, err := toHash(c0)
testutil.Ok(t, err)
c1Hash, err := toHash(c1)
testutil.Ok(t, err)
// q := s.queues[1]
hashes := make([]string, len(conf.RemoteWriteConfigs))
queues := make([]*QueueManager, len(conf.RemoteWriteConfigs))
storeHashes := func() {
for i := range conf.RemoteWriteConfigs {
hash, err := toHash(conf.RemoteWriteConfigs[i])
testutil.Ok(t, err)
hashes[i] = hash
queues[i] = s.queues[hash]
}
}
storeHashes()
// Update c0 and c2.
c0.RemoteTimeout = model.Duration(40 * time.Second)
c0.WriteRelabelConfigs[0] = &relabel.Config{Regex: relabel.MustNewRegexp("foo")}
c2.RemoteTimeout = model.Duration(50 * time.Second)
conf = &config.Config{
GlobalConfig: config.GlobalConfig{},
RemoteWriteConfigs: []*config.RemoteWriteConfig{c0, c1, c2},
}
s.ApplyConfig(conf)
testutil.Ok(t, s.ApplyConfig(conf))
testutil.Equals(t, 3, len(s.queues))
_, hashExists := s.queues[c1Hash]
_, hashExists := s.queues[hashes[0]]
testutil.Assert(t, !hashExists, "The queue for the first remote write configuration should have been restarted because the relabel configuration has changed.")
q, hashExists := s.queues[hashes[1]]
testutil.Assert(t, hashExists, "Hash of unchanged queue should have remained the same")
testutil.Assert(t, q == queues[1], "Pointer of unchanged queue should have remained the same")
_, hashExists = s.queues[hashes[2]]
testutil.Assert(t, !hashExists, "The queue for the third remote write configuration should have been restarted because the timeout has changed.")
storeHashes()
secondClient := s.queues[hashes[1]].client()
// Update c1.
c1.HTTPClientConfig.BearerToken = "bar"
err = s.ApplyConfig(conf)
testutil.Ok(t, err)
testutil.Equals(t, 3, len(s.queues))
_, hashExists = s.queues[hashes[0]]
testutil.Assert(t, hashExists, "Pointer of unchanged queue should have remained the same")
q, hashExists = s.queues[hashes[1]]
testutil.Assert(t, hashExists, "Hash of queue with secret change should have remained the same")
testutil.Assert(t, secondClient != q.client(), "Pointer of a client with a secret change should not be the same")
_, hashExists = s.queues[hashes[2]]
testutil.Assert(t, hashExists, "Pointer of unchanged queue should have remained the same")
storeHashes()
// Delete c0.
conf = &config.Config{
GlobalConfig: config.GlobalConfig{},
@ -307,8 +336,12 @@ func TestWriteStorageApplyConfigsPartialUpdate(t *testing.T) {
s.ApplyConfig(conf)
testutil.Equals(t, 2, len(s.queues))
_, hashExists = s.queues[c0Hash]
testutil.Assert(t, !hashExists, "If the index changed, the queue should be stopped and recreated.")
_, hashExists = s.queues[hashes[0]]
testutil.Assert(t, !hashExists, "If a config is removed, the queue should be stopped and recreated.")
_, hashExists = s.queues[hashes[1]]
testutil.Assert(t, hashExists, "Pointer of unchanged queue should have remained the same")
_, hashExists = s.queues[hashes[2]]
testutil.Assert(t, hashExists, "Pointer of unchanged queue should have remained the same")
err = s.Close()
testutil.Ok(t, err)