From 6544f95403301979fa5b2668e4750ac29c6aaf03 Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Fri, 25 Sep 2020 15:44:47 +0200 Subject: [PATCH 01/11] Introduce timestamp tolerance in scrapes Signed-off-by: Julien Pivotto --- scrape/scrape.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/scrape/scrape.go b/scrape/scrape.go index eb0294c85a..f8de16bede 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -47,6 +47,11 @@ import ( "github.com/prometheus/prometheus/storage" ) +// Temporary tolerance for scrape appends timestamps alignment, to enable better +// compression at the TSDB level. +// See https://github.com/prometheus/prometheus/issues/7846 +const scrapeTimestampTolerance = 2 * time.Millisecond + var errNameLabelMandatory = fmt.Errorf("missing metric name (%s label)", labels.MetricName) var ( @@ -1033,6 +1038,16 @@ func (sl *scrapeLoop) scrapeAndReport(interval, timeout time.Duration, last time ) } + // Temporary workaround for a jitter in go timers that causes disk space + // increase in TSDB. + // See https://github.com/prometheus/prometheus/issues/7846 + if last.IsZero() && interval > 100*scrapeTimestampTolerance { + exactStart := last.Add(interval) + if t := start.Sub(exactStart); t >= -scrapeTimestampTolerance && t <= scrapeTimestampTolerance { + start = exactStart + } + } + b := sl.buffers.Get(sl.lastScrapeSize).([]byte) defer sl.buffers.Put(b) buf := bytes.NewBuffer(b) From 98e14611a528e7eec323786f8ec67c81e111c51b Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Mon, 5 Oct 2020 18:17:50 +0200 Subject: [PATCH 02/11] Move the tolerance logic in the loop function. Signed-off-by: Julien Pivotto --- scrape/scrape.go | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/scrape/scrape.go b/scrape/scrape.go index f8de16bede..e3c4f02011 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -991,6 +991,7 @@ func (sl *scrapeLoop) run(interval, timeout time.Duration, errc chan<- error) { var last time.Time ticker := time.NewTicker(interval) + alignedScrapeTime := time.Now() defer ticker.Stop() mainLoop: @@ -1004,7 +1005,23 @@ mainLoop: default: } - last = sl.scrapeAndReport(interval, timeout, last, errc) + // Temporary workaround for a jitter in go timers that causes disk space + // increase in TSDB. + // See https://github.com/prometheus/prometheus/issues/7846 + scrapeTime := time.Now() + if interval > 100*scrapeTimestampTolerance { + // For some reason, a ticker might have been skipped, in which case we + // would call alignedScrapeTime.Add(interval) multiple times. + for scrapeTime.Sub(alignedScrapeTime) >= interval { + alignedScrapeTime = alignedScrapeTime.Add(interval) + } + // Align the scrape time if we are in the tolerance boundaries. + if t := scrapeTime.Sub(alignedScrapeTime); t >= -scrapeTimestampTolerance && t <= scrapeTimestampTolerance { + scrapeTime = alignedScrapeTime + } + } + + last = sl.scrapeAndReport(interval, timeout, last, scrapeTime, errc) select { case <-sl.parentCtx.Done(): @@ -1028,7 +1045,7 @@ mainLoop: // In the happy scenario, a single appender is used. // This function uses sl.parentCtx instead of sl.ctx on purpose. A scrape should // only be cancelled on shutdown, not on reloads. -func (sl *scrapeLoop) scrapeAndReport(interval, timeout time.Duration, last time.Time, errc chan<- error) time.Time { +func (sl *scrapeLoop) scrapeAndReport(interval, timeout time.Duration, last, appendTime time.Time, errc chan<- error) time.Time { start := time.Now() // Only record after the first scrape. @@ -1038,16 +1055,6 @@ func (sl *scrapeLoop) scrapeAndReport(interval, timeout time.Duration, last time ) } - // Temporary workaround for a jitter in go timers that causes disk space - // increase in TSDB. - // See https://github.com/prometheus/prometheus/issues/7846 - if last.IsZero() && interval > 100*scrapeTimestampTolerance { - exactStart := last.Add(interval) - if t := start.Sub(exactStart); t >= -scrapeTimestampTolerance && t <= scrapeTimestampTolerance { - start = exactStart - } - } - b := sl.buffers.Get(sl.lastScrapeSize).([]byte) defer sl.buffers.Put(b) buf := bytes.NewBuffer(b) @@ -1068,7 +1075,7 @@ func (sl *scrapeLoop) scrapeAndReport(interval, timeout time.Duration, last time }() defer func() { - if err = sl.report(app, start, time.Since(start), total, added, seriesAdded, scrapeErr); err != nil { + if err = sl.report(app, appendTime, time.Since(start), total, added, seriesAdded, scrapeErr); err != nil { level.Warn(sl.l).Log("msg", "Appending scrape report failed", "err", err) } }() @@ -1076,7 +1083,7 @@ func (sl *scrapeLoop) scrapeAndReport(interval, timeout time.Duration, last time if forcedErr := sl.getForcedError(); forcedErr != nil { scrapeErr = forcedErr // Add stale markers. - if _, _, _, err := sl.append(app, []byte{}, "", start); err != nil { + if _, _, _, err := sl.append(app, []byte{}, "", appendTime); err != nil { app.Rollback() app = sl.appender(sl.parentCtx) level.Warn(sl.l).Log("msg", "Append failed", "err", err) @@ -1110,14 +1117,14 @@ func (sl *scrapeLoop) scrapeAndReport(interval, timeout time.Duration, last time // A failed scrape is the same as an empty scrape, // we still call sl.append to trigger stale markers. - total, added, seriesAdded, appErr = sl.append(app, b, contentType, start) + total, added, seriesAdded, appErr = sl.append(app, b, contentType, appendTime) if appErr != nil { app.Rollback() app = sl.appender(sl.parentCtx) level.Debug(sl.l).Log("msg", "Append failed", "err", appErr) // The append failed, probably due to a parse error or sample limit. // Call sl.append again with an empty scrape to trigger stale markers. - if _, _, _, err := sl.append(app, []byte{}, "", start); err != nil { + if _, _, _, err := sl.append(app, []byte{}, "", appendTime); err != nil { app.Rollback() app = sl.appender(sl.parentCtx) level.Warn(sl.l).Log("msg", "Append failed", "err", err) From ccc1df3140239496ab8984293c6ae29f1f62a36d Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Tue, 6 Oct 2020 13:48:24 +0200 Subject: [PATCH 03/11] Fix comment Signed-off-by: Julien Pivotto --- scrape/scrape.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrape/scrape.go b/scrape/scrape.go index e3c4f02011..6e81f4300e 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -1010,7 +1010,7 @@ mainLoop: // See https://github.com/prometheus/prometheus/issues/7846 scrapeTime := time.Now() if interval > 100*scrapeTimestampTolerance { - // For some reason, a ticker might have been skipped, in which case we + // For some reason, a tick might have been skipped, in which case we // would call alignedScrapeTime.Add(interval) multiple times. for scrapeTime.Sub(alignedScrapeTime) >= interval { alignedScrapeTime = alignedScrapeTime.Add(interval) From b90c7a55da04231643895b192d15d499adbc480f Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Tue, 6 Oct 2020 21:17:16 +0200 Subject: [PATCH 04/11] Simplify logic Signed-off-by: Julien Pivotto --- scrape/scrape.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrape/scrape.go b/scrape/scrape.go index 6e81f4300e..de248d8761 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -990,8 +990,8 @@ func (sl *scrapeLoop) run(interval, timeout time.Duration, errc chan<- error) { var last time.Time - ticker := time.NewTicker(interval) alignedScrapeTime := time.Now() + ticker := time.NewTicker(interval) defer ticker.Stop() mainLoop: @@ -1016,7 +1016,7 @@ mainLoop: alignedScrapeTime = alignedScrapeTime.Add(interval) } // Align the scrape time if we are in the tolerance boundaries. - if t := scrapeTime.Sub(alignedScrapeTime); t >= -scrapeTimestampTolerance && t <= scrapeTimestampTolerance { + if scrapeTime.Sub(alignedScrapeTime) <= scrapeTimestampTolerance { scrapeTime = alignedScrapeTime } } From 536dfb62345ccdd3bd297e6a127a77b5a469b9f5 Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Wed, 7 Oct 2020 17:31:32 +0200 Subject: [PATCH 05/11] Add an experimental, hidden flag Signed-off-by: Julien Pivotto --- cmd/prometheus/main.go | 3 +++ scrape/scrape.go | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 5c004d660b..0be3dfbc47 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -239,6 +239,9 @@ func main() { a.Flag("rules.alert.resend-delay", "Minimum amount of time to wait before resending an alert to Alertmanager."). Default("1m").SetValue(&cfg.resendDelay) + a.Flag("scrape.timestamp-tolerance", "Tolerance applied to scrapes timestamp to improve timestamp compression. Experimental."). + Hidden().Default("2ms").DurationVar(&scrape.ScrapeTimestampTolerance) + a.Flag("alertmanager.notification-queue-capacity", "The capacity of the queue for pending Alertmanager notifications."). Default("10000").IntVar(&cfg.notifier.QueueCapacity) diff --git a/scrape/scrape.go b/scrape/scrape.go index de248d8761..53d573fccd 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -50,7 +50,7 @@ import ( // Temporary tolerance for scrape appends timestamps alignment, to enable better // compression at the TSDB level. // See https://github.com/prometheus/prometheus/issues/7846 -const scrapeTimestampTolerance = 2 * time.Millisecond +var ScrapeTimestampTolerance = 2 * time.Millisecond var errNameLabelMandatory = fmt.Errorf("missing metric name (%s label)", labels.MetricName) @@ -1009,14 +1009,14 @@ mainLoop: // increase in TSDB. // See https://github.com/prometheus/prometheus/issues/7846 scrapeTime := time.Now() - if interval > 100*scrapeTimestampTolerance { + if interval > 100*ScrapeTimestampTolerance { // For some reason, a tick might have been skipped, in which case we // would call alignedScrapeTime.Add(interval) multiple times. for scrapeTime.Sub(alignedScrapeTime) >= interval { alignedScrapeTime = alignedScrapeTime.Add(interval) } // Align the scrape time if we are in the tolerance boundaries. - if scrapeTime.Sub(alignedScrapeTime) <= scrapeTimestampTolerance { + if scrapeTime.Sub(alignedScrapeTime) <= ScrapeTimestampTolerance { scrapeTime = alignedScrapeTime } } From 6b618ecf02dcf9bdae4279b87fe93ea32f2f6376 Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Wed, 7 Oct 2020 17:43:42 +0200 Subject: [PATCH 06/11] Better description Signed-off-by: Julien Pivotto --- cmd/prometheus/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 0be3dfbc47..5587f0cb37 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -239,7 +239,7 @@ func main() { a.Flag("rules.alert.resend-delay", "Minimum amount of time to wait before resending an alert to Alertmanager."). Default("1m").SetValue(&cfg.resendDelay) - a.Flag("scrape.timestamp-tolerance", "Tolerance applied to scrapes timestamp to improve timestamp compression. Experimental."). + a.Flag("scrape.timestamp-tolerance", "Adjust scrape timestamps by up to this amount to align them to the intended schedule. Experimental."). Hidden().Default("2ms").DurationVar(&scrape.ScrapeTimestampTolerance) a.Flag("alertmanager.notification-queue-capacity", "The capacity of the queue for pending Alertmanager notifications."). From 627ff84599055da123882847b0bc5e3a74d50c5d Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Wed, 7 Oct 2020 18:25:52 +0200 Subject: [PATCH 07/11] Adjust flag Signed-off-by: Julien Pivotto --- cmd/prometheus/main.go | 4 ++-- scrape/scrape.go | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 5587f0cb37..63ea99eab7 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -239,8 +239,8 @@ func main() { a.Flag("rules.alert.resend-delay", "Minimum amount of time to wait before resending an alert to Alertmanager."). Default("1m").SetValue(&cfg.resendDelay) - a.Flag("scrape.timestamp-tolerance", "Adjust scrape timestamps by up to this amount to align them to the intended schedule. Experimental."). - Hidden().Default("2ms").DurationVar(&scrape.ScrapeTimestampTolerance) + a.Flag("scrape.adjust-timestamps", "Adjust scrape timestamps by up to 2ms to align them to the intended schedule. Experimental."). + Hidden().Default("true").BoolVar(&scrape.AlignScrapeTimestamps) a.Flag("alertmanager.notification-queue-capacity", "The capacity of the queue for pending Alertmanager notifications."). Default("10000").IntVar(&cfg.notifier.QueueCapacity) diff --git a/scrape/scrape.go b/scrape/scrape.go index 53d573fccd..900c99137e 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -50,7 +50,10 @@ import ( // Temporary tolerance for scrape appends timestamps alignment, to enable better // compression at the TSDB level. // See https://github.com/prometheus/prometheus/issues/7846 -var ScrapeTimestampTolerance = 2 * time.Millisecond +const scrapeTimestampTolerance = 2 * time.Millisecond + +// Enable or disable the tolerance for scrape described above. +var AlignScrapeTimestamps = true var errNameLabelMandatory = fmt.Errorf("missing metric name (%s label)", labels.MetricName) @@ -1009,14 +1012,14 @@ mainLoop: // increase in TSDB. // See https://github.com/prometheus/prometheus/issues/7846 scrapeTime := time.Now() - if interval > 100*ScrapeTimestampTolerance { + if AlignScrapeTimestamps && interval > 100*scrapeTimestampTolerance { // For some reason, a tick might have been skipped, in which case we // would call alignedScrapeTime.Add(interval) multiple times. for scrapeTime.Sub(alignedScrapeTime) >= interval { alignedScrapeTime = alignedScrapeTime.Add(interval) } // Align the scrape time if we are in the tolerance boundaries. - if scrapeTime.Sub(alignedScrapeTime) <= ScrapeTimestampTolerance { + if scrapeTime.Sub(alignedScrapeTime) <= scrapeTimestampTolerance { scrapeTime = alignedScrapeTime } } From 671f7c66e5e318f4ab195bf6aec5324a08a326eb Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Wed, 7 Oct 2020 18:28:02 +0200 Subject: [PATCH 08/11] Adjust comment Signed-off-by: Julien Pivotto --- scrape/scrape.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrape/scrape.go b/scrape/scrape.go index 900c99137e..4fc4ef5e6e 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -52,7 +52,7 @@ import ( // See https://github.com/prometheus/prometheus/issues/7846 const scrapeTimestampTolerance = 2 * time.Millisecond -// Enable or disable the tolerance for scrape described above. +// Enable or disable the tolerance for scrape appends timestamps described above. var AlignScrapeTimestamps = true var errNameLabelMandatory = fmt.Errorf("missing metric name (%s label)", labels.MetricName) From e2a2bf3c064a2bf45afd0839dc27f27eadc1f5ca Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Wed, 7 Oct 2020 18:30:32 +0200 Subject: [PATCH 09/11] Add context Signed-off-by: Julien Pivotto --- cmd/prometheus/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 63ea99eab7..1dda7b1268 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -239,7 +239,7 @@ func main() { a.Flag("rules.alert.resend-delay", "Minimum amount of time to wait before resending an alert to Alertmanager."). Default("1m").SetValue(&cfg.resendDelay) - a.Flag("scrape.adjust-timestamps", "Adjust scrape timestamps by up to 2ms to align them to the intended schedule. Experimental."). + a.Flag("scrape.adjust-timestamps", "Adjust scrape timestamps by up to 2ms to align them to the intended schedule. Introduced due to a regression in Go 2.14. See https://github.com/prometheus/prometheus/issues/7846 for more context. Experimental."). Hidden().Default("true").BoolVar(&scrape.AlignScrapeTimestamps) a.Flag("alertmanager.notification-queue-capacity", "The capacity of the queue for pending Alertmanager notifications."). From 4617d16b4b356d926cf82ff7a5134875c3f5f6bd Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Wed, 7 Oct 2020 18:31:46 +0200 Subject: [PATCH 10/11] Specify the removal Signed-off-by: Julien Pivotto --- cmd/prometheus/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 1dda7b1268..165c275f2f 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -239,7 +239,7 @@ func main() { a.Flag("rules.alert.resend-delay", "Minimum amount of time to wait before resending an alert to Alertmanager."). Default("1m").SetValue(&cfg.resendDelay) - a.Flag("scrape.adjust-timestamps", "Adjust scrape timestamps by up to 2ms to align them to the intended schedule. Introduced due to a regression in Go 2.14. See https://github.com/prometheus/prometheus/issues/7846 for more context. Experimental."). + a.Flag("scrape.adjust-timestamps", "Adjust scrape timestamps by up to 2ms to align them to the intended schedule. See https://github.com/prometheus/prometheus/issues/7846 for more context. Experimental. This flag will be removed in the future."). Hidden().Default("true").BoolVar(&scrape.AlignScrapeTimestamps) a.Flag("alertmanager.notification-queue-capacity", "The capacity of the queue for pending Alertmanager notifications."). From be5ba1a62d5a27e62631634d558621d703bfd228 Mon Sep 17 00:00:00 2001 From: Julien Pivotto Date: Wed, 7 Oct 2020 21:44:36 +0200 Subject: [PATCH 11/11] Fix wordings Signed-off-by: Julien Pivotto --- cmd/prometheus/main.go | 2 +- scrape/scrape.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 165c275f2f..f09c88d01c 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -239,7 +239,7 @@ func main() { a.Flag("rules.alert.resend-delay", "Minimum amount of time to wait before resending an alert to Alertmanager."). Default("1m").SetValue(&cfg.resendDelay) - a.Flag("scrape.adjust-timestamps", "Adjust scrape timestamps by up to 2ms to align them to the intended schedule. See https://github.com/prometheus/prometheus/issues/7846 for more context. Experimental. This flag will be removed in the future."). + a.Flag("scrape.adjust-timestamps", "Adjust scrape timestamps by up to 2ms to align them to the intended schedule. See https://github.com/prometheus/prometheus/issues/7846 for more context. Experimental. This flag will be removed in a future release."). Hidden().Default("true").BoolVar(&scrape.AlignScrapeTimestamps) a.Flag("alertmanager.notification-queue-capacity", "The capacity of the queue for pending Alertmanager notifications."). diff --git a/scrape/scrape.go b/scrape/scrape.go index 4fc4ef5e6e..d52b0ac7f3 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -52,7 +52,7 @@ import ( // See https://github.com/prometheus/prometheus/issues/7846 const scrapeTimestampTolerance = 2 * time.Millisecond -// Enable or disable the tolerance for scrape appends timestamps described above. +// AlignScrapeTimestamps enables the tolerance for scrape appends timestamps described above. var AlignScrapeTimestamps = true var errNameLabelMandatory = fmt.Errorf("missing metric name (%s label)", labels.MetricName)