From 5df6ea30420d94eec1f8f247367767053aabcb3c Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Wed, 15 Jan 2025 08:45:05 +0100 Subject: [PATCH] promtool: Support linting of scrape interval (#15719) * PromTool: Support Scrape Interval Lint Checking --------- Signed-off-by: zhaowang Signed-off-by: Arve Knudsen Co-authored-by: zhaowang --- CHANGELOG.md | 2 + cmd/promtool/main.go | 150 ++++++++++++------ cmd/promtool/main_test.go | 61 +++++-- ...s-config.lint.too_long_scrape_interval.yml | 3 + docs/command-line/promtool.md | 9 +- 5 files changed, 167 insertions(+), 58 deletions(-) create mode 100644 cmd/promtool/testdata/prometheus-config.lint.too_long_scrape_interval.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index d0a7ef6611..40be59f724 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## unreleased +* [ENHANCEMENT] promtool: Support linting of scrape interval, through lint option `too-long-scrape-interval`. #15719 + ## 3.1.0 / 2025-01-02 * [SECURITY] upgrade golang.org/x/crypto to address reported CVE-2024-45337. #15691 diff --git a/cmd/promtool/main.go b/cmd/promtool/main.go index 62a1d4f906..54292bbfe5 100644 --- a/cmd/promtool/main.go +++ b/cmd/promtool/main.go @@ -73,14 +73,19 @@ const ( // Exit code 3 is used for "one or more lint issues detected". lintErrExitCode = 3 - lintOptionAll = "all" - lintOptionDuplicateRules = "duplicate-rules" - lintOptionNone = "none" - checkHealth = "/-/healthy" - checkReadiness = "/-/ready" + lintOptionAll = "all" + lintOptionDuplicateRules = "duplicate-rules" + lintOptionTooLongScrapeInterval = "too-long-scrape-interval" + lintOptionNone = "none" + checkHealth = "/-/healthy" + checkReadiness = "/-/ready" ) -var lintOptions = []string{lintOptionAll, lintOptionDuplicateRules, lintOptionNone} +var ( + lintRulesOptions = []string{lintOptionAll, lintOptionDuplicateRules, lintOptionNone} + // Same as lintRulesOptions, but including scrape config linting options as well. + lintConfigOptions = append(append([]string{}, lintRulesOptions...), lintOptionTooLongScrapeInterval) +) func main() { var ( @@ -97,6 +102,10 @@ func main() { app.HelpFlag.Short('h') checkCmd := app.Command("check", "Check the resources for validity.") + checkLookbackDelta := checkCmd.Flag( + "query.lookback-delta", + "The server's maximum query lookback duration.", + ).Default("5m").Duration() experimental := app.Flag("experimental", "Enable experimental commands.").Bool() @@ -113,7 +122,7 @@ func main() { checkConfigSyntaxOnly := checkConfigCmd.Flag("syntax-only", "Only check the config file syntax, ignoring file and content validation referenced in the config").Bool() checkConfigLint := checkConfigCmd.Flag( "lint", - "Linting checks to apply to the rules specified in the config. Available options are: "+strings.Join(lintOptions, ", ")+". Use --lint=none to disable linting", + "Linting checks to apply to the rules/scrape configs specified in the config. Available options are: "+strings.Join(lintConfigOptions, ", ")+". Use --lint=none to disable linting", ).Default(lintOptionDuplicateRules).String() checkConfigLintFatal := checkConfigCmd.Flag( "lint-fatal", @@ -140,7 +149,7 @@ func main() { ).ExistingFiles() checkRulesLint := checkRulesCmd.Flag( "lint", - "Linting checks to apply. Available options are: "+strings.Join(lintOptions, ", ")+". Use --lint=none to disable linting", + "Linting checks to apply. Available options are: "+strings.Join(lintRulesOptions, ", ")+". Use --lint=none to disable linting", ).Default(lintOptionDuplicateRules).String() checkRulesLintFatal := checkRulesCmd.Flag( "lint-fatal", @@ -339,7 +348,7 @@ func main() { os.Exit(CheckSD(*sdConfigFile, *sdJobName, *sdTimeout, prometheus.DefaultRegisterer)) case checkConfigCmd.FullCommand(): - os.Exit(CheckConfig(*agentMode, *checkConfigSyntaxOnly, newLintConfig(*checkConfigLint, *checkConfigLintFatal), *configFiles...)) + os.Exit(CheckConfig(*agentMode, *checkConfigSyntaxOnly, newConfigLintConfig(*checkConfigLint, *checkConfigLintFatal, model.Duration(*checkLookbackDelta)), *configFiles...)) case checkServerHealthCmd.FullCommand(): os.Exit(checkErr(CheckServerStatus(serverURL, checkHealth, httpRoundTripper))) @@ -351,7 +360,7 @@ func main() { os.Exit(CheckWebConfig(*webConfigFiles...)) case checkRulesCmd.FullCommand(): - os.Exit(CheckRules(newLintConfig(*checkRulesLint, *checkRulesLintFatal), *ruleFiles...)) + os.Exit(CheckRules(newRulesLintConfig(*checkRulesLint, *checkRulesLintFatal), *ruleFiles...)) case checkMetricsCmd.FullCommand(): os.Exit(CheckMetrics(*checkMetricsExtended)) @@ -445,15 +454,15 @@ func checkExperimental(f bool) { var errLint = errors.New("lint error") -type lintConfig struct { +type rulesLintConfig struct { all bool duplicateRules bool fatal bool } -func newLintConfig(stringVal string, fatal bool) lintConfig { +func newRulesLintConfig(stringVal string, fatal bool) rulesLintConfig { items := strings.Split(stringVal, ",") - ls := lintConfig{ + ls := rulesLintConfig{ fatal: fatal, } for _, setting := range items { @@ -464,16 +473,57 @@ func newLintConfig(stringVal string, fatal bool) lintConfig { ls.duplicateRules = true case lintOptionNone: default: - fmt.Printf("WARNING: unknown lint option %s\n", setting) + fmt.Printf("WARNING: unknown lint option: %q\n", setting) } } return ls } -func (ls lintConfig) lintDuplicateRules() bool { +func (ls rulesLintConfig) lintDuplicateRules() bool { return ls.all || ls.duplicateRules } +type configLintConfig struct { + rulesLintConfig + + lookbackDelta model.Duration +} + +func newConfigLintConfig(optionsStr string, fatal bool, lookbackDelta model.Duration) configLintConfig { + c := configLintConfig{ + rulesLintConfig: rulesLintConfig{ + fatal: fatal, + }, + } + + lintNone := false + var rulesOptions []string + for _, option := range strings.Split(optionsStr, ",") { + switch option { + case lintOptionAll, lintOptionTooLongScrapeInterval: + c.lookbackDelta = lookbackDelta + if option == lintOptionAll { + rulesOptions = append(rulesOptions, lintOptionAll) + } + case lintOptionNone: + lintNone = true + default: + rulesOptions = append(rulesOptions, option) + } + } + + if lintNone { + c.lookbackDelta = 0 + rulesOptions = nil + } + + if len(rulesOptions) > 0 { + c.rulesLintConfig = newRulesLintConfig(strings.Join(rulesOptions, ","), fatal) + } + + return c +} + // CheckServerStatus - healthy & ready. func CheckServerStatus(serverURL *url.URL, checkEndpoint string, roundTripper http.RoundTripper) error { if serverURL.Scheme == "" { @@ -512,12 +562,12 @@ func CheckServerStatus(serverURL *url.URL, checkEndpoint string, roundTripper ht } // CheckConfig validates configuration files. -func CheckConfig(agentMode, checkSyntaxOnly bool, lintSettings lintConfig, files ...string) int { +func CheckConfig(agentMode, checkSyntaxOnly bool, lintSettings configLintConfig, files ...string) int { failed := false hasErrors := false for _, f := range files { - ruleFiles, err := checkConfig(agentMode, f, checkSyntaxOnly) + ruleFiles, scrapeConfigs, err := checkConfig(agentMode, f, checkSyntaxOnly) if err != nil { fmt.Fprintln(os.Stderr, " FAILED:", err) hasErrors = true @@ -530,12 +580,12 @@ func CheckConfig(agentMode, checkSyntaxOnly bool, lintSettings lintConfig, files } fmt.Println() - rulesFailed, rulesHasErrors := checkRules(ruleFiles, lintSettings) - if rulesFailed { - failed = rulesFailed - } - if rulesHasErrors { - hasErrors = rulesHasErrors + if !checkSyntaxOnly { + scrapeConfigsFailed := lintScrapeConfigs(scrapeConfigs, lintSettings) + failed = failed || scrapeConfigsFailed + rulesFailed, rulesHaveErrors := checkRules(ruleFiles, lintSettings.rulesLintConfig) + failed = failed || rulesFailed + hasErrors = hasErrors || rulesHaveErrors } } if failed && hasErrors { @@ -574,12 +624,12 @@ func checkFileExists(fn string) error { return err } -func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]string, error) { +func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]string, []*config.ScrapeConfig, error) { fmt.Println("Checking", filename) cfg, err := config.LoadFile(filename, agentMode, promslog.NewNopLogger()) if err != nil { - return nil, err + return nil, nil, err } var ruleFiles []string @@ -587,15 +637,15 @@ func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]strin for _, rf := range cfg.RuleFiles { rfs, err := filepath.Glob(rf) if err != nil { - return nil, err + return nil, nil, err } // If an explicit file was given, error if it is not accessible. if !strings.Contains(rf, "*") { if len(rfs) == 0 { - return nil, fmt.Errorf("%q does not point to an existing file", rf) + return nil, nil, fmt.Errorf("%q does not point to an existing file", rf) } if err := checkFileExists(rfs[0]); err != nil { - return nil, fmt.Errorf("error checking rule file %q: %w", rfs[0], err) + return nil, nil, fmt.Errorf("error checking rule file %q: %w", rfs[0], err) } } ruleFiles = append(ruleFiles, rfs...) @@ -609,26 +659,26 @@ func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]strin var err error scfgs, err = cfg.GetScrapeConfigs() if err != nil { - return nil, fmt.Errorf("error loading scrape configs: %w", err) + return nil, nil, fmt.Errorf("error loading scrape configs: %w", err) } } for _, scfg := range scfgs { if !checkSyntaxOnly && scfg.HTTPClientConfig.Authorization != nil { if err := checkFileExists(scfg.HTTPClientConfig.Authorization.CredentialsFile); err != nil { - return nil, fmt.Errorf("error checking authorization credentials or bearer token file %q: %w", scfg.HTTPClientConfig.Authorization.CredentialsFile, err) + return nil, nil, fmt.Errorf("error checking authorization credentials or bearer token file %q: %w", scfg.HTTPClientConfig.Authorization.CredentialsFile, err) } } if err := checkTLSConfig(scfg.HTTPClientConfig.TLSConfig, checkSyntaxOnly); err != nil { - return nil, err + return nil, nil, err } for _, c := range scfg.ServiceDiscoveryConfigs { switch c := c.(type) { case *kubernetes.SDConfig: if err := checkTLSConfig(c.HTTPClientConfig.TLSConfig, checkSyntaxOnly); err != nil { - return nil, err + return nil, nil, err } case *file.SDConfig: if checkSyntaxOnly { @@ -637,17 +687,17 @@ func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]strin for _, file := range c.Files { files, err := filepath.Glob(file) if err != nil { - return nil, err + return nil, nil, err } if len(files) != 0 { for _, f := range files { var targetGroups []*targetgroup.Group targetGroups, err = checkSDFile(f) if err != nil { - return nil, fmt.Errorf("checking SD file %q: %w", file, err) + return nil, nil, fmt.Errorf("checking SD file %q: %w", file, err) } if err := checkTargetGroupsForScrapeConfig(targetGroups, scfg); err != nil { - return nil, err + return nil, nil, err } } continue @@ -656,7 +706,7 @@ func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]strin } case discovery.StaticConfig: if err := checkTargetGroupsForScrapeConfig(c, scfg); err != nil { - return nil, err + return nil, nil, err } } } @@ -673,18 +723,18 @@ func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]strin for _, file := range c.Files { files, err := filepath.Glob(file) if err != nil { - return nil, err + return nil, nil, err } if len(files) != 0 { for _, f := range files { var targetGroups []*targetgroup.Group targetGroups, err = checkSDFile(f) if err != nil { - return nil, fmt.Errorf("checking SD file %q: %w", file, err) + return nil, nil, fmt.Errorf("checking SD file %q: %w", file, err) } if err := checkTargetGroupsForAlertmanager(targetGroups, amcfg); err != nil { - return nil, err + return nil, nil, err } } continue @@ -693,12 +743,12 @@ func checkConfig(agentMode bool, filename string, checkSyntaxOnly bool) ([]strin } case discovery.StaticConfig: if err := checkTargetGroupsForAlertmanager(c, amcfg); err != nil { - return nil, err + return nil, nil, err } } } } - return ruleFiles, nil + return ruleFiles, scfgs, nil } func checkTLSConfig(tlsConfig promconfig.TLSConfig, checkSyntaxOnly bool) error { @@ -760,7 +810,7 @@ func checkSDFile(filename string) ([]*targetgroup.Group, error) { } // CheckRules validates rule files. -func CheckRules(ls lintConfig, files ...string) int { +func CheckRules(ls rulesLintConfig, files ...string) int { failed := false hasErrors := false if len(files) == 0 { @@ -780,7 +830,7 @@ func CheckRules(ls lintConfig, files ...string) int { } // checkRulesFromStdin validates rule from stdin. -func checkRulesFromStdin(ls lintConfig) (bool, bool) { +func checkRulesFromStdin(ls rulesLintConfig) (bool, bool) { failed := false hasErrors := false fmt.Println("Checking standard input") @@ -818,7 +868,7 @@ func checkRulesFromStdin(ls lintConfig) (bool, bool) { } // checkRules validates rule files. -func checkRules(files []string, ls lintConfig) (bool, bool) { +func checkRules(files []string, ls rulesLintConfig) (bool, bool) { failed := false hasErrors := false for _, f := range files { @@ -852,7 +902,7 @@ func checkRules(files []string, ls lintConfig) (bool, bool) { return failed, hasErrors } -func checkRuleGroups(rgs *rulefmt.RuleGroups, lintSettings lintConfig) (int, []error) { +func checkRuleGroups(rgs *rulefmt.RuleGroups, lintSettings rulesLintConfig) (int, []error) { numRules := 0 for _, rg := range rgs.Groups { numRules += len(rg.Rules) @@ -876,6 +926,16 @@ func checkRuleGroups(rgs *rulefmt.RuleGroups, lintSettings lintConfig) (int, []e return numRules, nil } +func lintScrapeConfigs(scrapeConfigs []*config.ScrapeConfig, lintSettings configLintConfig) bool { + for _, scfg := range scrapeConfigs { + if lintSettings.lookbackDelta > 0 && scfg.ScrapeInterval >= lintSettings.lookbackDelta { + fmt.Fprintf(os.Stderr, " FAILED: too long scrape interval found, data point will be marked as stale - job: %s, interval: %s\n", scfg.JobName, scfg.ScrapeInterval) + return true + } + } + return false +} + type compareRuleType struct { metric string label labels.Labels diff --git a/cmd/promtool/main_test.go b/cmd/promtool/main_test.go index 9a07269188..92e5ff9e67 100644 --- a/cmd/promtool/main_test.go +++ b/cmd/promtool/main_test.go @@ -234,7 +234,7 @@ func TestCheckTargetConfig(t *testing.T) { for _, test := range cases { t.Run(test.name, func(t *testing.T) { t.Parallel() - _, err := checkConfig(false, "testdata/"+test.file, false) + _, _, err := checkConfig(false, "testdata/"+test.file, false) if test.err != "" { require.EqualErrorf(t, err, test.err, "Expected error %q, got %q", test.err, err.Error()) return @@ -319,7 +319,7 @@ func TestCheckConfigSyntax(t *testing.T) { for _, test := range cases { t.Run(test.name, func(t *testing.T) { t.Parallel() - _, err := checkConfig(false, "testdata/"+test.file, test.syntaxOnly) + _, _, err := checkConfig(false, "testdata/"+test.file, test.syntaxOnly) expectedErrMsg := test.err if strings.Contains(runtime.GOOS, "windows") { expectedErrMsg = test.errWindows @@ -355,7 +355,7 @@ func TestAuthorizationConfig(t *testing.T) { for _, test := range cases { t.Run(test.name, func(t *testing.T) { t.Parallel() - _, err := checkConfig(false, "testdata/"+test.file, false) + _, _, err := checkConfig(false, "testdata/"+test.file, false) if test.err != "" { require.ErrorContains(t, err, test.err, "Expected error to contain %q, got %q", test.err, err.Error()) return @@ -508,7 +508,7 @@ func TestCheckRules(t *testing.T) { defer func(v *os.File) { os.Stdin = v }(os.Stdin) os.Stdin = r - exitCode := CheckRules(newLintConfig(lintOptionDuplicateRules, false)) + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false)) require.Equal(t, successExitCode, exitCode, "") }) @@ -530,7 +530,7 @@ func TestCheckRules(t *testing.T) { defer func(v *os.File) { os.Stdin = v }(os.Stdin) os.Stdin = r - exitCode := CheckRules(newLintConfig(lintOptionDuplicateRules, false)) + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false)) require.Equal(t, failureExitCode, exitCode, "") }) @@ -552,7 +552,7 @@ func TestCheckRules(t *testing.T) { defer func(v *os.File) { os.Stdin = v }(os.Stdin) os.Stdin = r - exitCode := CheckRules(newLintConfig(lintOptionDuplicateRules, true)) + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, true)) require.Equal(t, lintErrExitCode, exitCode, "") }) } @@ -560,23 +560,66 @@ func TestCheckRules(t *testing.T) { func TestCheckRulesWithRuleFiles(t *testing.T) { t.Run("rules-good", func(t *testing.T) { t.Parallel() - exitCode := CheckRules(newLintConfig(lintOptionDuplicateRules, false), "./testdata/rules.yml") + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false), "./testdata/rules.yml") require.Equal(t, successExitCode, exitCode, "") }) t.Run("rules-bad", func(t *testing.T) { t.Parallel() - exitCode := CheckRules(newLintConfig(lintOptionDuplicateRules, false), "./testdata/rules-bad.yml") + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, false), "./testdata/rules-bad.yml") require.Equal(t, failureExitCode, exitCode, "") }) t.Run("rules-lint-fatal", func(t *testing.T) { t.Parallel() - exitCode := CheckRules(newLintConfig(lintOptionDuplicateRules, true), "./testdata/prometheus-rules.lint.yml") + exitCode := CheckRules(newRulesLintConfig(lintOptionDuplicateRules, true), "./testdata/prometheus-rules.lint.yml") require.Equal(t, lintErrExitCode, exitCode, "") }) } +func TestCheckScrapeConfigs(t *testing.T) { + for _, tc := range []struct { + name string + lookbackDelta model.Duration + expectError bool + }{ + { + name: "scrape interval less than lookback delta", + lookbackDelta: model.Duration(11 * time.Minute), + expectError: false, + }, + { + name: "scrape interval greater than lookback delta", + lookbackDelta: model.Duration(5 * time.Minute), + expectError: true, + }, + { + name: "scrape interval same as lookback delta", + lookbackDelta: model.Duration(10 * time.Minute), + expectError: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + // Non-fatal linting. + code := CheckConfig(false, false, newConfigLintConfig(lintOptionTooLongScrapeInterval, false, tc.lookbackDelta), "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") + require.Equal(t, successExitCode, code, "Non-fatal linting should return success") + // Fatal linting. + code = CheckConfig(false, false, newConfigLintConfig(lintOptionTooLongScrapeInterval, true, tc.lookbackDelta), "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") + if tc.expectError { + require.Equal(t, lintErrExitCode, code, "Fatal linting should return error") + } else { + require.Equal(t, successExitCode, code, "Fatal linting should return success when there are no problems") + } + // Check syntax only, no linting. + code = CheckConfig(false, true, newConfigLintConfig(lintOptionTooLongScrapeInterval, true, tc.lookbackDelta), "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") + require.Equal(t, successExitCode, code, "Fatal linting should return success when checking syntax only") + // Lint option "none" should disable linting. + code = CheckConfig(false, false, newConfigLintConfig(lintOptionNone+","+lintOptionTooLongScrapeInterval, true, tc.lookbackDelta), "./testdata/prometheus-config.lint.too_long_scrape_interval.yml") + require.Equal(t, successExitCode, code, `Fatal linting should return success when lint option "none" is specified`) + }) + } +} + func TestTSDBDumpCommand(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode.") diff --git a/cmd/promtool/testdata/prometheus-config.lint.too_long_scrape_interval.yml b/cmd/promtool/testdata/prometheus-config.lint.too_long_scrape_interval.yml new file mode 100644 index 0000000000..0c85d13f31 --- /dev/null +++ b/cmd/promtool/testdata/prometheus-config.lint.too_long_scrape_interval.yml @@ -0,0 +1,3 @@ +scrape_configs: + - job_name: too_long_scrape_interval_test + scrape_interval: 10m diff --git a/docs/command-line/promtool.md b/docs/command-line/promtool.md index 5e2a8f6bb1..09800af748 100644 --- a/docs/command-line/promtool.md +++ b/docs/command-line/promtool.md @@ -59,9 +59,10 @@ Check the resources for validity. #### Flags -| Flag | Description | -| --- | --- | -| --extended | Print extended information related to the cardinality of the metrics. | +| Flag | Description | Default | +| --- | --- | --- | +| --query.lookback-delta | The server's maximum query lookback duration. | `5m` | +| --extended | Print extended information related to the cardinality of the metrics. | | @@ -102,7 +103,7 @@ Check if the config files are valid or not. | Flag | Description | Default | | --- | --- | --- | | --syntax-only | Only check the config file syntax, ignoring file and content validation referenced in the config | | -| --lint | Linting checks to apply to the rules specified in the config. Available options are: all, duplicate-rules, none. Use --lint=none to disable linting | `duplicate-rules` | +| --lint | Linting checks to apply to the rules/scrape configs specified in the config. Available options are: all, duplicate-rules, none, too-long-scrape-interval. Use --lint=none to disable linting | `duplicate-rules` | | --lint-fatal | Make lint errors exit with exit code 3. | `false` | | --agent | Check config file for Prometheus in Agent mode. | |