From ce0f09b125b888ad0c4061c4506cd47ddce5c20f Mon Sep 17 00:00:00 2001 From: Julien Date: Mon, 26 Aug 2024 11:41:56 +0200 Subject: [PATCH] Scrape: Add scrape_failure_log_file to log Scrape Failures Signed-off-by: Julien --- cmd/prometheus/main.go | 1 + cmd/prometheus/scrape_failure_log_test.go | 193 ++++++++++++++++++++++ config/config.go | 10 ++ config/config_test.go | 41 ++++- config/testdata/conf.good.yml | 3 + docs/configuration/configuration.md | 8 + scrape/manager.go | 73 ++++++-- scrape/manager_test.go | 12 +- scrape/scrape.go | 43 +++++ scrape/scrape_test.go | 5 +- 10 files changed, 363 insertions(+), 26 deletions(-) create mode 100644 cmd/prometheus/scrape_failure_log_test.go diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 65ffd7de5..a021259f9 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -755,6 +755,7 @@ func main() { scrapeManager, err := scrape.NewManager( &cfg.scrape, log.With(logger, "component", "scrape manager"), + func(s string) (log.Logger, error) { return logging.NewJSONFileLogger(s) }, fanoutStorage, prometheus.DefaultRegisterer, ) diff --git a/cmd/prometheus/scrape_failure_log_test.go b/cmd/prometheus/scrape_failure_log_test.go new file mode 100644 index 000000000..8d86d719f --- /dev/null +++ b/cmd/prometheus/scrape_failure_log_test.go @@ -0,0 +1,193 @@ +// Copyright 2024 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "bytes" + "fmt" + "net/http" + "net/http/httptest" + "net/url" + "os" + "os/exec" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/require" + "go.uber.org/atomic" + + "github.com/prometheus/prometheus/util/testutil" +) + +func TestScrapeFailureLogFile(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + // Tracks the number of requests made to the mock server. + var requestCount atomic.Int32 + + // Starts a server that always returns HTTP 500 errors. + mockServerAddress := startGarbageServer(t, &requestCount) + + // Create a temporary directory for Prometheus configuration and logs. + tempDir := t.TempDir() + + // Define file paths for the scrape failure log and Prometheus configuration. + // Like other files, the scrape failure log file should be relative to the + // config file. Therefore, we split the name we put in the file and the full + // path used to check the content of the file. + scrapeFailureLogFileName := "scrape_failure.log" + scrapeFailureLogFile := filepath.Join(tempDir, scrapeFailureLogFileName) + promConfigFile := filepath.Join(tempDir, "prometheus.yml") + + // Step 1: Set up an initial Prometheus configuration that globally + // specifies a scrape failure log file. 
+ promConfig := fmt.Sprintf(` +global: + scrape_interval: 500ms + scrape_failure_log_file: %s + +scrape_configs: + - job_name: 'test_job' + static_configs: + - targets: ['%s'] +`, scrapeFailureLogFileName, mockServerAddress) + + err := os.WriteFile(promConfigFile, []byte(promConfig), 0o644) + require.NoError(t, err, "Failed to write Prometheus configuration file") + + // Start Prometheus with the generated configuration and a random port, enabling the lifecycle API. + port := testutil.RandomUnprivilegedPort(t) + params := []string{ + "-test.main", + "--config.file=" + promConfigFile, + "--storage.tsdb.path=" + filepath.Join(tempDir, "data"), + fmt.Sprintf("--web.listen-address=127.0.0.1:%d", port), + "--web.enable-lifecycle", + } + prometheusProcess := exec.Command(promPath, params...) + prometheusProcess.Stdout = os.Stdout + prometheusProcess.Stderr = os.Stderr + + err = prometheusProcess.Start() + require.NoError(t, err, "Failed to start Prometheus") + defer prometheusProcess.Process.Kill() + + // Wait until the mock server receives at least two requests from Prometheus. + require.Eventually(t, func() bool { + return requestCount.Load() >= 2 + }, 30*time.Second, 500*time.Millisecond, "Expected at least two requests to the mock server") + + // Verify that the scrape failures have been logged to the specified file. + content, err := os.ReadFile(scrapeFailureLogFile) + require.NoError(t, err, "Failed to read scrape failure log") + require.Contains(t, string(content), "server returned HTTP status 500 Internal Server Error", "Expected scrape failure log entry not found") + + // Step 2: Update the Prometheus configuration to remove the scrape failure + // log file setting. + promConfig = fmt.Sprintf(` +global: + scrape_interval: 1s + +scrape_configs: + - job_name: 'test_job' + static_configs: + - targets: ['%s'] +`, mockServerAddress) + + err = os.WriteFile(promConfigFile, []byte(promConfig), 0o644) + require.NoError(t, err, "Failed to update Prometheus configuration file") + + // Reload Prometheus with the updated configuration. + reloadPrometheus(t, port) + + // Count the number of lines in the scrape failure log file before any + // further requests. + preReloadLogLineCount := countLinesInFile(scrapeFailureLogFile) + + // Wait for at least two more requests to the mock server to ensure + // Prometheus continues scraping. + requestsBeforeReload := requestCount.Load() + require.Eventually(t, func() bool { + return requestCount.Load() >= requestsBeforeReload+2 + }, 30*time.Second, 500*time.Millisecond, "Expected two more requests to the mock server after configuration reload") + + // Ensure that no new lines were added to the scrape failure log file after + // the configuration change. + require.Equal(t, preReloadLogLineCount, countLinesInFile(scrapeFailureLogFile), "No new lines should be added to the scrape failure log file after removing the log setting") + + // Step 3: Re-add the scrape failure log file setting, but this time under + // scrape_configs, and reload Prometheus. + promConfig = fmt.Sprintf(` +global: + scrape_interval: 1s + +scrape_configs: + - job_name: 'test_job' + scrape_failure_log_file: %s + static_configs: + - targets: ['%s'] +`, scrapeFailureLogFileName, mockServerAddress) + + err = os.WriteFile(promConfigFile, []byte(promConfig), 0o644) + require.NoError(t, err, "Failed to update Prometheus configuration file") + + // Reload Prometheus with the updated configuration. 
+ reloadPrometheus(t, port) + + // Wait for at least two more requests to the mock server and verify that + // new log entries are created. + postReloadLogLineCount := countLinesInFile(scrapeFailureLogFile) + requestsBeforeReAddingLog := requestCount.Load() + require.Eventually(t, func() bool { + return requestCount.Load() >= requestsBeforeReAddingLog+2 + }, 30*time.Second, 500*time.Millisecond, "Expected two additional requests after re-adding the log setting") + + // Confirm that new lines were added to the scrape failure log file. + require.Greater(t, countLinesInFile(scrapeFailureLogFile), postReloadLogLineCount, "New lines should be added to the scrape failure log file after re-adding the log setting") +} + +// reloadPrometheus sends a reload request to the Prometheus server to apply +// updated configurations. +func reloadPrometheus(t *testing.T, port int) { + resp, err := http.Post(fmt.Sprintf("http://127.0.0.1:%d/-/reload", port), "", nil) + require.NoError(t, err, "Failed to reload Prometheus") + require.Equal(t, http.StatusOK, resp.StatusCode, "Unexpected status code when reloading Prometheus") +} + +// startGarbageServer sets up a mock server that returns a 500 Internal Server Error +// for all requests. It also increments the request count each time it's hit. +func startGarbageServer(t *testing.T, requestCount *atomic.Int32) string { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + requestCount.Inc() + w.WriteHeader(http.StatusInternalServerError) + })) + t.Cleanup(server.Close) + + parsedURL, err := url.Parse(server.URL) + require.NoError(t, err, "Failed to parse mock server URL") + + return parsedURL.Host +} + +// countLinesInFile counts and returns the number of lines in the specified file. +func countLinesInFile(filePath string) int { + data, err := os.ReadFile(filePath) + if err != nil { + return 0 // Return 0 if the file doesn't exist or can't be read. + } + return bytes.Count(data, []byte{'\n'}) +} diff --git a/config/config.go b/config/config.go index c9e8efbf3..4f80b551b 100644 --- a/config/config.go +++ b/config/config.go @@ -429,6 +429,8 @@ type GlobalConfig struct { RuleQueryOffset model.Duration `yaml:"rule_query_offset,omitempty"` // File to which PromQL queries are logged. QueryLogFile string `yaml:"query_log_file,omitempty"` + // File to which scrape failures are logged. + ScrapeFailureLogFile string `yaml:"scrape_failure_log_file,omitempty"` // The labels to add to any timeseries that this Prometheus instance scrapes. ExternalLabels labels.Labels `yaml:"external_labels,omitempty"` // An uncompressed response body larger than this many bytes will cause the @@ -529,6 +531,7 @@ func validateAcceptScrapeProtocols(sps []ScrapeProtocol) error { // SetDirectory joins any relative file paths with dir. func (c *GlobalConfig) SetDirectory(dir string) { c.QueryLogFile = config.JoinDir(dir, c.QueryLogFile) + c.ScrapeFailureLogFile = config.JoinDir(dir, c.ScrapeFailureLogFile) } // UnmarshalYAML implements the yaml.Unmarshaler interface. @@ -591,6 +594,7 @@ func (c *GlobalConfig) isZero() bool { c.EvaluationInterval == 0 && c.RuleQueryOffset == 0 && c.QueryLogFile == "" && + c.ScrapeFailureLogFile == "" && c.ScrapeProtocols == nil } @@ -632,6 +636,8 @@ type ScrapeConfig struct { ScrapeProtocols []ScrapeProtocol `yaml:"scrape_protocols,omitempty"` // Whether to scrape a classic histogram that is also exposed as a native histogram. 
ScrapeClassicHistograms bool `yaml:"scrape_classic_histograms,omitempty"` + // File to which scrape failures are logged. + ScrapeFailureLogFile string `yaml:"scrape_failure_log_file,omitempty"` // The HTTP resource path on which to fetch metrics from targets. MetricsPath string `yaml:"metrics_path,omitempty"` // The URL scheme with which to fetch metrics from targets. @@ -684,6 +690,7 @@ type ScrapeConfig struct { func (c *ScrapeConfig) SetDirectory(dir string) { c.ServiceDiscoveryConfigs.SetDirectory(dir) c.HTTPClientConfig.SetDirectory(dir) + c.ScrapeFailureLogFile = config.JoinDir(dir, c.ScrapeFailureLogFile) } // UnmarshalYAML implements the yaml.Unmarshaler interface. @@ -765,6 +772,9 @@ func (c *ScrapeConfig) Validate(globalConfig GlobalConfig) error { if c.KeepDroppedTargets == 0 { c.KeepDroppedTargets = globalConfig.KeepDroppedTargets } + if c.ScrapeFailureLogFile == "" { + c.ScrapeFailureLogFile = globalConfig.ScrapeFailureLogFile + } if c.ScrapeProtocols == nil { c.ScrapeProtocols = globalConfig.ScrapeProtocols diff --git a/config/config_test.go b/config/config_test.go index 221906182..726b233cc 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -78,14 +78,16 @@ const ( globLabelNameLengthLimit = 200 globLabelValueLengthLimit = 200 globalGoGC = 42 + globScrapeFailureLogFile = "testdata/fail.log" ) var expectedConf = &Config{ GlobalConfig: GlobalConfig{ - ScrapeInterval: model.Duration(15 * time.Second), - ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout, - EvaluationInterval: model.Duration(30 * time.Second), - QueryLogFile: "", + ScrapeInterval: model.Duration(15 * time.Second), + ScrapeTimeout: DefaultGlobalConfig.ScrapeTimeout, + EvaluationInterval: model.Duration(30 * time.Second), + QueryLogFile: "testdata/query.log", + ScrapeFailureLogFile: globScrapeFailureLogFile, ExternalLabels: labels.FromStrings("foo", "bar", "monitor", "codelab"), @@ -211,6 +213,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: "testdata/fail_prom.log", MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -314,6 +317,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: 210, LabelValueLengthLimit: 210, ScrapeProtocols: []ScrapeProtocol{PrometheusText0_0_4}, + ScrapeFailureLogFile: globScrapeFailureLogFile, HTTPClientConfig: config.HTTPClientConfig{ BasicAuth: &config.BasicAuth{ @@ -411,6 +415,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -466,6 +471,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: "/metrics", Scheme: "http", @@ -499,6 +505,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -538,6 +545,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: 
globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -577,6 +585,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -606,6 +615,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -643,6 +653,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -677,6 +688,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -718,6 +730,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -749,6 +762,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -783,6 +797,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -810,6 +825,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -840,6 +856,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: "/federate", Scheme: DefaultScrapeConfig.Scheme, @@ -870,6 +887,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: 
DefaultScrapeConfig.Scheme, @@ -900,6 +918,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -927,6 +946,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -962,6 +982,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -996,6 +1017,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -1027,6 +1049,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -1057,6 +1080,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -1091,6 +1115,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -1128,6 +1153,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -1184,6 +1210,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -1211,6 +1238,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, HTTPClientConfig: config.DefaultHTTPClientConfig, MetricsPath: DefaultScrapeConfig.MetricsPath, @@ -1249,6 +1277,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: 
DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, HTTPClientConfig: config.DefaultHTTPClientConfig, MetricsPath: DefaultScrapeConfig.MetricsPath, @@ -1293,6 +1322,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -1328,6 +1358,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, HTTPClientConfig: config.DefaultHTTPClientConfig, MetricsPath: DefaultScrapeConfig.MetricsPath, @@ -1357,6 +1388,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, @@ -1389,6 +1421,7 @@ var expectedConf = &Config{ LabelNameLengthLimit: globLabelNameLengthLimit, LabelValueLengthLimit: globLabelValueLengthLimit, ScrapeProtocols: DefaultGlobalConfig.ScrapeProtocols, + ScrapeFailureLogFile: globScrapeFailureLogFile, MetricsPath: DefaultScrapeConfig.MetricsPath, Scheme: DefaultScrapeConfig.Scheme, diff --git a/config/testdata/conf.good.yml b/config/testdata/conf.good.yml index 56741822c..8da6e5c56 100644 --- a/config/testdata/conf.good.yml +++ b/config/testdata/conf.good.yml @@ -8,6 +8,8 @@ global: label_limit: 30 label_name_length_limit: 200 label_value_length_limit: 200 + query_log_file: query.log + scrape_failure_log_file: fail.log # scrape_timeout is set to the global default (10s). external_labels: @@ -72,6 +74,7 @@ scrape_configs: # metrics_path defaults to '/metrics' # scheme defaults to 'http'. + scrape_failure_log_file: fail_prom.log file_sd_configs: - files: - foo/*.slow.json diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md index a42126cf2..a8c8d6e26 100644 --- a/docs/configuration/configuration.md +++ b/docs/configuration/configuration.md @@ -84,6 +84,10 @@ global: # Reloading the configuration will reopen the file. [ query_log_file: ] + # File to which scrape failures are logged. + # Reloading the configuration will reopen the file. + [ scrape_failure_log_file: ] + # An uncompressed response body larger than this many bytes will cause the # scrape to fail. 0 means no limit. Example: 100MB. # This is an experimental feature, this behaviour could @@ -319,6 +323,10 @@ http_headers: # Files to read header values from. [ files: [, ...] ] ] +# File to which scrape failures are logged. +# Reloading the configuration will reopen the file. +[ scrape_failure_log_file: ] + # List of Azure service discovery configurations. azure_sd_configs: [ - ... ] diff --git a/scrape/manager.go b/scrape/manager.go index e3dba5f0e..d7786a082 100644 --- a/scrape/manager.go +++ b/scrape/manager.go @@ -17,6 +17,7 @@ import ( "errors" "fmt" "hash/fnv" + "io" "reflect" "sync" "time" @@ -36,7 +37,7 @@ import ( ) // NewManager is the Manager constructor. 
-func NewManager(o *Options, logger log.Logger, app storage.Appendable, registerer prometheus.Registerer) (*Manager, error) { +func NewManager(o *Options, logger log.Logger, newScrapeFailureLogger func(string) (log.Logger, error), app storage.Appendable, registerer prometheus.Registerer) (*Manager, error) { if o == nil { o = &Options{} } @@ -50,15 +51,16 @@ func NewManager(o *Options, logger log.Logger, app storage.Appendable, registere } m := &Manager{ - append: app, - opts: o, - logger: logger, - scrapeConfigs: make(map[string]*config.ScrapeConfig), - scrapePools: make(map[string]*scrapePool), - graceShut: make(chan struct{}), - triggerReload: make(chan struct{}, 1), - metrics: sm, - buffers: pool.New(1e3, 100e6, 3, func(sz int) interface{} { return make([]byte, 0, sz) }), + append: app, + opts: o, + logger: logger, + newScrapeFailureLogger: newScrapeFailureLogger, + scrapeConfigs: make(map[string]*config.ScrapeConfig), + scrapePools: make(map[string]*scrapePool), + graceShut: make(chan struct{}), + triggerReload: make(chan struct{}, 1), + metrics: sm, + buffers: pool.New(1e3, 100e6, 3, func(sz int) interface{} { return make([]byte, 0, sz) }), } m.metrics.setTargetMetadataCacheGatherer(m) @@ -103,12 +105,14 @@ type Manager struct { append storage.Appendable graceShut chan struct{} - offsetSeed uint64 // Global offsetSeed seed is used to spread scrape workload across HA setup. - mtxScrape sync.Mutex // Guards the fields below. - scrapeConfigs map[string]*config.ScrapeConfig - scrapePools map[string]*scrapePool - targetSets map[string][]*targetgroup.Group - buffers *pool.Pool + offsetSeed uint64 // Global offsetSeed seed is used to spread scrape workload across HA setup. + mtxScrape sync.Mutex // Guards the fields below. + scrapeConfigs map[string]*config.ScrapeConfig + scrapePools map[string]*scrapePool + newScrapeFailureLogger func(string) (log.Logger, error) + scrapeFailureLoggers map[string]log.Logger + targetSets map[string][]*targetgroup.Group + buffers *pool.Pool triggerReload chan struct{} @@ -183,6 +187,11 @@ func (m *Manager) reload() { continue } m.scrapePools[setName] = sp + if l, ok := m.scrapeFailureLoggers[scrapeConfig.ScrapeFailureLogFile]; ok { + sp.SetScrapeFailureLogger(l) + } else { + level.Error(sp.logger).Log("msg", "No logger found. This is a bug in Prometheus that should be reported upstream.", "scrape_pool", setName) + } } wg.Add(1) @@ -238,11 +247,36 @@ func (m *Manager) ApplyConfig(cfg *config.Config) error { } c := make(map[string]*config.ScrapeConfig) + scrapeFailureLoggers := map[string]log.Logger{ + "": nil, // Emptying the file name sets the scrape logger to nil. + } for _, scfg := range scfgs { c[scfg.JobName] = scfg + if _, ok := scrapeFailureLoggers[scfg.ScrapeFailureLogFile]; !ok { + // We promise to reopen the file on each reload. 
+ var ( + l log.Logger + err error + ) + if m.newScrapeFailureLogger != nil { + if l, err = m.newScrapeFailureLogger(scfg.ScrapeFailureLogFile); err != nil { + return err + } + } + scrapeFailureLoggers[scfg.ScrapeFailureLogFile] = l + } } m.scrapeConfigs = c + oldScrapeFailureLoggers := m.scrapeFailureLoggers + for _, s := range oldScrapeFailureLoggers { + if closer, ok := s.(io.Closer); ok { + defer closer.Close() + } + } + + m.scrapeFailureLoggers = scrapeFailureLoggers + if err := m.setOffsetSeed(cfg.GlobalConfig.ExternalLabels); err != nil { return err } @@ -260,6 +294,13 @@ func (m *Manager) ApplyConfig(cfg *config.Config) error { level.Error(m.logger).Log("msg", "error reloading scrape pool", "err", err, "scrape_pool", name) failed = true } + fallthrough + case ok: + if l, ok := m.scrapeFailureLoggers[cfg.ScrapeFailureLogFile]; ok { + sp.SetScrapeFailureLogger(l) + } else { + level.Error(sp.logger).Log("msg", "No logger found. This is a bug in Prometheus that should be reported upstream.", "scrape_pool", name) + } } } diff --git a/scrape/manager_test.go b/scrape/manager_test.go index c71691c95..ba32f36cf 100644 --- a/scrape/manager_test.go +++ b/scrape/manager_test.go @@ -511,7 +511,7 @@ scrape_configs: ) opts := Options{} - scrapeManager, err := NewManager(&opts, nil, nil, testRegistry) + scrapeManager, err := NewManager(&opts, nil, nil, nil, testRegistry) require.NoError(t, err) newLoop := func(scrapeLoopOptions) loop { ch <- struct{}{} @@ -576,7 +576,7 @@ scrape_configs: func TestManagerTargetsUpdates(t *testing.T) { opts := Options{} testRegistry := prometheus.NewRegistry() - m, err := NewManager(&opts, nil, nil, testRegistry) + m, err := NewManager(&opts, nil, nil, nil, testRegistry) require.NoError(t, err) ts := make(chan map[string][]*targetgroup.Group) @@ -629,7 +629,7 @@ global: opts := Options{} testRegistry := prometheus.NewRegistry() - scrapeManager, err := NewManager(&opts, nil, nil, testRegistry) + scrapeManager, err := NewManager(&opts, nil, nil, nil, testRegistry) require.NoError(t, err) // Load the first config. @@ -706,7 +706,7 @@ scrape_configs: } opts := Options{} - scrapeManager, err := NewManager(&opts, nil, nil, testRegistry) + scrapeManager, err := NewManager(&opts, nil, nil, nil, testRegistry) require.NoError(t, err) reload(scrapeManager, cfg1) @@ -758,6 +758,7 @@ func TestManagerCTZeroIngestion(t *testing.T) { skipOffsetting: true, }, log.NewLogfmtLogger(os.Stderr), + nil, &collectResultAppendable{app}, prometheus.NewRegistry(), ) @@ -857,7 +858,7 @@ func TestUnregisterMetrics(t *testing.T) { // Check that all metrics can be unregistered, allowing a second manager to be created. for i := 0; i < 2; i++ { opts := Options{} - manager, err := NewManager(&opts, nil, nil, reg) + manager, err := NewManager(&opts, nil, nil, nil, reg) require.NotNil(t, manager) require.NoError(t, err) // Unregister all metrics. 
@@ -901,6 +902,7 @@ func runManagers(t *testing.T, ctx context.Context) (*discovery.Manager, *Manage scrapeManager, err := NewManager( &Options{DiscoveryReloadInterval: model.Duration(100 * time.Millisecond)}, nil, + nil, nopAppendable{}, prometheus.NewRegistry(), ) diff --git a/scrape/scrape.go b/scrape/scrape.go index 2abd4691d..ea98432be 100644 --- a/scrape/scrape.go +++ b/scrape/scrape.go @@ -90,6 +90,9 @@ type scrapePool struct { noDefaultPort bool metrics *scrapeMetrics + + scrapeFailureLogger log.Logger + scrapeFailureLoggerMtx sync.RWMutex } type labelLimits struct { @@ -218,6 +221,27 @@ func (sp *scrapePool) DroppedTargetsCount() int { return sp.droppedTargetsCount } +func (sp *scrapePool) SetScrapeFailureLogger(l log.Logger) { + sp.scrapeFailureLoggerMtx.Lock() + defer sp.scrapeFailureLoggerMtx.Unlock() + if l != nil { + l = log.With(l, "job_name", sp.config.JobName) + } + sp.scrapeFailureLogger = l + + sp.targetMtx.Lock() + defer sp.targetMtx.Unlock() + for _, s := range sp.loops { + s.setScrapeFailureLogger(sp.scrapeFailureLogger) + } +} + +func (sp *scrapePool) getScrapeFailureLogger() log.Logger { + sp.scrapeFailureLoggerMtx.RLock() + defer sp.scrapeFailureLoggerMtx.RUnlock() + return sp.scrapeFailureLogger +} + // stop terminates all scrape loops and returns after they all terminated. func (sp *scrapePool) stop() { sp.mtx.Lock() @@ -361,6 +385,7 @@ func (sp *scrapePool) restartLoops(reuseCache bool) { wg.Done() newLoop.setForcedError(forcedErr) + newLoop.setScrapeFailureLogger(sp.getScrapeFailureLogger()) newLoop.run(nil) }(oldLoop, newLoop) @@ -503,6 +528,7 @@ func (sp *scrapePool) sync(targets []*Target) { if err != nil { l.setForcedError(err) } + l.setScrapeFailureLogger(sp.scrapeFailureLogger) sp.activeTargets[hash] = t sp.loops[hash] = l @@ -825,6 +851,7 @@ func (s *targetScraper) readResponse(ctx context.Context, resp *http.Response, w type loop interface { run(errc chan<- error) setForcedError(err error) + setScrapeFailureLogger(log.Logger) stop() getCache() *scrapeCache disableEndOfRunStalenessMarkers() @@ -840,6 +867,8 @@ type cacheEntry struct { type scrapeLoop struct { scraper scraper l log.Logger + scrapeFailureLogger log.Logger + scrapeFailureLoggerMtx sync.RWMutex cache *scrapeCache lastScrapeSize int buffers *pool.Pool @@ -1223,6 +1252,15 @@ func newScrapeLoop(ctx context.Context, return sl } +func (sl *scrapeLoop) setScrapeFailureLogger(l log.Logger) { + sl.scrapeFailureLoggerMtx.Lock() + defer sl.scrapeFailureLoggerMtx.Unlock() + if ts, ok := sl.scraper.(fmt.Stringer); ok && l != nil { + l = log.With(l, "target", ts.String()) + } + sl.scrapeFailureLogger = l +} + func (sl *scrapeLoop) run(errc chan<- error) { if !sl.skipOffsetting { select { @@ -1366,6 +1404,11 @@ func (sl *scrapeLoop) scrapeAndReport(last, appendTime time.Time, errc chan<- er bytesRead = len(b) } else { level.Debug(sl.l).Log("msg", "Scrape failed", "err", scrapeErr) + sl.scrapeFailureLoggerMtx.RLock() + if sl.scrapeFailureLogger != nil { + sl.scrapeFailureLogger.Log("err", scrapeErr) + } + sl.scrapeFailureLoggerMtx.RUnlock() if errc != nil { errc <- scrapeErr } diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go index b703f21d4..a69a19d7f 100644 --- a/scrape/scrape_test.go +++ b/scrape/scrape_test.go @@ -158,6 +158,9 @@ type testLoop struct { timeout time.Duration } +func (l *testLoop) setScrapeFailureLogger(log.Logger) { +} + func (l *testLoop) run(errc chan<- error) { if l.runOnce { panic("loop must be started only once") @@ -3782,7 +3785,7 @@ scrape_configs: 
s.DB.EnableNativeHistograms() reg := prometheus.NewRegistry() - mng, err := NewManager(&Options{EnableNativeHistogramsIngestion: true}, nil, s, reg) + mng, err := NewManager(&Options{EnableNativeHistogramsIngestion: true}, nil, nil, s, reg) require.NoError(t, err) cfg, err := config.Load(configStr, false, log.NewNopLogger()) require.NoError(t, err)
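
A minimal configuration sketch of how the new setting is intended to be used (the job names, targets, and file names below are illustrative placeholders, not part of the patch): as with query_log_file, a relative path is resolved against the directory of the configuration file, a scrape_config-level value overrides the global one, and reloading the configuration reopens the file.

    # Illustrative prometheus.yml using scrape_failure_log_file.
    global:
      scrape_interval: 15s
      # Log scrape failures from all jobs to this file (path relative to this config file).
      scrape_failure_log_file: scrape_failures.log

    scrape_configs:
      - job_name: "node"
        static_configs:
          - targets: ["localhost:9100"]

      - job_name: "flaky-service"
        # Per-job override: failures for this job go to a dedicated file instead.
        scrape_failure_log_file: flaky_service_failures.log
        static_configs:
          - targets: ["localhost:8080"]

With this patch, entries are written through the JSON file logger wired up in cmd/prometheus/main.go (logging.NewJSONFileLogger); each record carries the scrape error ("err") plus the "job_name" and "target" labels attached by the scrape pool and scrape loop respectively.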