From 7ca31c66beb92230df4c68f42992007614af5b0e Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Tue, 2 Apr 2024 14:56:19 +0100 Subject: [PATCH 1/6] Scraping: add metric for symbol table size Signed-off-by: Bryan Boreham --- scrape/metrics.go | 10 ++++++++++ scrape/scrape.go | 2 ++ 2 files changed, 12 insertions(+) diff --git a/scrape/metrics.go b/scrape/metrics.go index b67d0686b..e7395c619 100644 --- a/scrape/metrics.go +++ b/scrape/metrics.go @@ -34,6 +34,7 @@ type scrapeMetrics struct { targetScrapePoolExceededTargetLimit prometheus.Counter targetScrapePoolTargetLimit *prometheus.GaugeVec targetScrapePoolTargetsAdded *prometheus.GaugeVec + targetScrapePoolSymbolTableItems *prometheus.GaugeVec targetSyncIntervalLength *prometheus.SummaryVec targetSyncFailed *prometheus.CounterVec @@ -129,6 +130,13 @@ func newScrapeMetrics(reg prometheus.Registerer) (*scrapeMetrics, error) { }, []string{"scrape_job"}, ) + sm.targetScrapePoolSymbolTableItems = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "prometheus_target_scrape_pool_symboltable_items", + Help: "Current number of symbols in table for this scrape pool.", + }, + []string{"scrape_job"}, + ) sm.targetScrapePoolSyncsCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ Name: "prometheus_target_scrape_pool_sync_total", @@ -234,6 +242,7 @@ func newScrapeMetrics(reg prometheus.Registerer) (*scrapeMetrics, error) { sm.targetScrapePoolExceededTargetLimit, sm.targetScrapePoolTargetLimit, sm.targetScrapePoolTargetsAdded, + sm.targetScrapePoolSymbolTableItems, sm.targetSyncFailed, // Used by targetScraper. 
sm.targetScrapeExceededBodySizeLimit, @@ -274,6 +283,7 @@ func (sm *scrapeMetrics) Unregister() { sm.reg.Unregister(sm.targetScrapePoolExceededTargetLimit) sm.reg.Unregister(sm.targetScrapePoolTargetLimit) sm.reg.Unregister(sm.targetScrapePoolTargetsAdded) + sm.reg.Unregister(sm.targetScrapePoolSymbolTableItems) sm.reg.Unregister(sm.targetSyncFailed) sm.reg.Unregister(sm.targetScrapeExceededBodySizeLimit) sm.reg.Unregister(sm.targetScrapeCacheFlushForced) diff --git a/scrape/scrape.go b/scrape/scrape.go index 17e9913e8..c16f14cec 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -246,6 +246,7 @@ func (sp *scrapePool) stop() { sp.metrics.targetScrapePoolSyncsCounter.DeleteLabelValues(sp.config.JobName) sp.metrics.targetScrapePoolTargetLimit.DeleteLabelValues(sp.config.JobName) sp.metrics.targetScrapePoolTargetsAdded.DeleteLabelValues(sp.config.JobName) + sp.metrics.targetScrapePoolSymbolTableItems.DeleteLabelValues(sp.config.JobName) sp.metrics.targetSyncIntervalLength.DeleteLabelValues(sp.config.JobName) sp.metrics.targetSyncFailed.DeleteLabelValues(sp.config.JobName) } @@ -408,6 +409,7 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) { } } } + sp.metrics.targetScrapePoolSymbolTableItems.WithLabelValues(sp.config.JobName).Set(float64(sp.symbolTable.Len())) sp.targetMtx.Unlock() sp.sync(all) From b42b5fbd74627d5c5ea9fb38857fc7a4b55ebfe1 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Tue, 2 Apr 2024 18:42:40 +0100 Subject: [PATCH 2/6] Scraping: check symbol-table on sync Previously they were only checked on a change of config. 
Signed-off-by: Bryan Boreham --- scrape/scrape.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scrape/scrape.go b/scrape/scrape.go index c16f14cec..2655ffd16 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -357,7 +357,11 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error { sp.metrics.targetReloadIntervalLength.WithLabelValues(interval.String()).Observe( time.Since(start).Seconds(), ) + return nil +} +// Must be called with sp.mtx held. +func (sp *scrapePool) checkSymbolTable() { // Here we take steps to clear out the symbol table if it has grown a lot. // After waiting some time for things to settle, we take the size of the symbol-table. // If, after some more time, the table has grown to twice that size, we start a new one. @@ -371,8 +375,6 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error { } sp.lastSymbolTableCheck = time.Now() } - - return nil } // Sync converts target groups into actual scrape targets and synchronizes @@ -412,6 +414,7 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) { sp.metrics.targetScrapePoolSymbolTableItems.WithLabelValues(sp.config.JobName).Set(float64(sp.symbolTable.Len())) sp.targetMtx.Unlock() sp.sync(all) + sp.checkSymbolTable() sp.metrics.targetSyncIntervalLength.WithLabelValues(sp.config.JobName).Observe( time.Since(start).Seconds(), From 74b1f3daa604c3b5801b67bbea8de780f93a6470 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Tue, 9 Apr 2024 18:43:49 +0100 Subject: [PATCH 3/6] Refactor: scraping: extract method restartLoops Signed-off-by: Bryan Boreham --- scrape/scrape.go | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scrape/scrape.go b/scrape/scrape.go index 2655ffd16..57bb164b7 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -274,6 +274,15 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error { sp.metrics.targetScrapePoolTargetLimit.WithLabelValues(sp.config.JobName).Set(float64(sp.config.TargetLimit)) + 
sp.restartLoops(reuseCache) + oldClient.CloseIdleConnections() + sp.metrics.targetReloadIntervalLength.WithLabelValues(time.Duration(sp.config.ScrapeInterval).String()).Observe( + time.Since(start).Seconds(), + ) + return nil +} + +func (sp *scrapePool) restartLoops(reuseCache bool) { var ( wg sync.WaitGroup interval = time.Duration(sp.config.ScrapeInterval) @@ -314,7 +323,7 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error { client: sp.client, timeout: timeout, bodySizeLimit: bodySizeLimit, - acceptHeader: acceptHeader(cfg.ScrapeProtocols), + acceptHeader: acceptHeader(sp.config.ScrapeProtocols), acceptEncodingHeader: acceptEncodingHeader(enableCompression), } newLoop = sp.newLoop(scrapeLoopOptions{ @@ -353,11 +362,6 @@ func (sp *scrapePool) reload(cfg *config.ScrapeConfig) error { sp.targetMtx.Unlock() wg.Wait() - oldClient.CloseIdleConnections() - sp.metrics.targetReloadIntervalLength.WithLabelValues(interval.String()).Observe( - time.Since(start).Seconds(), - ) - return nil } // Must be called with sp.mtx held. From e6356e64bd6fe68e7568699e2c7f345745a12b3b Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Tue, 2 Apr 2024 17:07:00 +0100 Subject: [PATCH 4/6] Scraping: drop series cache when resizing symbol table Clear caches by restarting scraping loops: each loop assumes it has exclusive ownership of its cache, so we can't come in from another goroutine and change it. Signed-off-by: Bryan Boreham --- scrape/scrape.go | 1 + 1 file changed, 1 insertion(+) diff --git a/scrape/scrape.go b/scrape/scrape.go index 57bb164b7..68411a62e 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -376,6 +376,7 @@ func (sp *scrapePool) checkSymbolTable() { } else if sp.symbolTable.Len() > 2*sp.initialSymbolTableLen { sp.symbolTable = labels.NewSymbolTable() sp.initialSymbolTableLen = 0 + sp.restartLoops(false) // To drop all caches. 
} sp.lastSymbolTableCheck = time.Now() } From 5281a6bc1b8e36b0c778f6b2a41119a18c185aa5 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Tue, 2 Apr 2024 15:08:58 +0100 Subject: [PATCH 5/6] TSDB: rebuild labels symbol-table on each compaction Log begin/end for timing, plus some stats. Signed-off-by: Bryan Boreham --- tsdb/db.go | 3 +++ tsdb/head_dedupelabels.go | 40 +++++++++++++++++++++++++++++++++++++++ tsdb/head_other.go | 7 +++++++ 3 files changed, 50 insertions(+) diff --git a/tsdb/db.go b/tsdb/db.go index b2175d475..090d6fcf0 100644 --- a/tsdb/db.go +++ b/tsdb/db.go @@ -1407,6 +1407,9 @@ func (db *DB) compactHead(head *RangeHead) error { if err = db.head.truncateMemory(head.BlockMaxTime()); err != nil { return fmt.Errorf("head memory truncate: %w", err) } + + db.head.RebuildSymbolTable(db.logger) + return nil } diff --git a/tsdb/head_dedupelabels.go b/tsdb/head_dedupelabels.go index 203f92e6a..aaab7c25b 100644 --- a/tsdb/head_dedupelabels.go +++ b/tsdb/head_dedupelabels.go @@ -16,6 +16,9 @@ package tsdb import ( + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/prometheus/prometheus/model/labels" ) @@ -25,3 +28,40 @@ func (s *memSeries) labels() labels.Labels { defer s.Unlock() return s.lset } + +// RebuildSymbolTable goes through all the series in h, builds a SymbolTable with all names and values, +// and replaces each series' Labels with one using that SymbolTable.
+func (h *Head) RebuildSymbolTable(logger log.Logger) *labels.SymbolTable { + level.Info(logger).Log("msg", "RebuildSymbolTable starting") + st := labels.NewSymbolTable() + builder := labels.NewScratchBuilderWithSymbolTable(st, 0) + rebuildLabels := func(lbls labels.Labels) labels.Labels { + builder.Reset() + lbls.Range(func(l labels.Label) { + builder.Add(l.Name, l.Value) + }) + return builder.Labels() + } + + for i := 0; i < h.series.size; i++ { + h.series.locks[i].Lock() + + for _, s := range h.series.hashes[i].unique { + s.Lock() + s.lset = rebuildLabels(s.lset) + s.Unlock() + } + + for _, all := range h.series.hashes[i].conflicts { + for _, s := range all { + s.Lock() + s.lset = rebuildLabels(s.lset) + s.Unlock() + } + } + + h.series.locks[i].Unlock() + } + level.Info(logger).Log("msg", "RebuildSymbolTable finished", "size", st.Len()) + return st +} diff --git a/tsdb/head_other.go b/tsdb/head_other.go index 9306913d8..eb1b93a3e 100644 --- a/tsdb/head_other.go +++ b/tsdb/head_other.go @@ -16,6 +16,8 @@ package tsdb import ( + "github.com/go-kit/log" + "github.com/prometheus/prometheus/model/labels" ) @@ -23,3 +25,8 @@ import ( func (s *memSeries) labels() labels.Labels { return s.lset } + +// No-op when not using dedupelabels. +func (h *Head) RebuildSymbolTable(logger log.Logger) *labels.SymbolTable { + return nil +} From 4d7532f60b1731f0c3a1aa62f50f4b964288e6c1 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Sat, 11 May 2024 11:00:42 +0100 Subject: [PATCH 6/6] tsdb: reset symbol table for exemplars periodically To avoid keeping the memory alive forever. 
Signed-off-by: Bryan Boreham --- tsdb/head_dedupelabels.go | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tsdb/head_dedupelabels.go b/tsdb/head_dedupelabels.go index aaab7c25b..a16d90726 100644 --- a/tsdb/head_dedupelabels.go +++ b/tsdb/head_dedupelabels.go @@ -62,6 +62,37 @@ func (h *Head) RebuildSymbolTable(logger log.Logger) *labels.SymbolTable { h.series.locks[i].Unlock() } + type withReset interface{ ResetSymbolTable(*labels.SymbolTable) } + if e, ok := h.exemplars.(withReset); ok { + e.ResetSymbolTable(st) + } level.Info(logger).Log("msg", "RebuildSymbolTable finished", "size", st.Len()) return st } + +// ResetSymbolTable rebuilds the labels of every indexed series and every stored exemplar in ce +// using a builder backed by st. Exemplar slots whose ref is nil are skipped. +func (ce *CircularExemplarStorage) ResetSymbolTable(st *labels.SymbolTable) { + builder := labels.NewScratchBuilderWithSymbolTable(st, 0) + rebuildLabels := func(lbls labels.Labels) labels.Labels { + builder.Reset() + lbls.Range(func(l labels.Label) { + builder.Add(l.Name, l.Value) + }) + return builder.Labels() + } + + // Entries are rewritten in place below, so the write lock is required rather than the read lock. + ce.lock.Lock() + defer ce.lock.Unlock() + + for _, v := range ce.index { + v.seriesLabels = rebuildLabels(v.seriesLabels) + } + for i := range ce.exemplars { + if ce.exemplars[i].ref == nil { + continue + } + ce.exemplars[i].exemplar.Labels = rebuildLabels(ce.exemplars[i].exemplar.Labels) + } +}