From e68ccc770866d867010c9a9a9ab3ed91c6d36b9e Mon Sep 17 00:00:00 2001 From: Bartlomiej Plotka Date: Thu, 4 Nov 2021 10:08:53 +0100 Subject: [PATCH] Fix misleading agent-only/server-only check messages. (#9650) * Fix misleading agent-only/server-only check messages. Issue: ``` [root@host01 ~]# docker run -it --net=host --rm -v /root/editor/prom-agent-batcopter.yaml:/etc/prometheus/prometheus.yaml -v /root/prom-batcopter-data:/prometheus -u root --name prom-agent-batcopter quay.io/prometheus/prometheus:main --enable-feature=agent --config.file=/etc/prometheus/prometheus.yaml --storage.tsdb.path=/prometheus --web.listen-address=:9091 ts=2021-11-02T16:00:59.789Z caller=main.go:205 level=info msg="Experimental agent mode enabled." The following flag(s) can not be used in agent mode: ["--enable-feature"] ``` Problem was that PreAction gives us all parsed flag. Context does not give us any info on what flag clause it was defined. Also added info for flag help about being server or agent only. Signed-off-by: Bartlomiej Plotka * gofumpt. Signed-off-by: Bartlomiej Plotka --- cmd/prometheus/main.go | 127 +++++++++++++----------------------- cmd/prometheus/main_test.go | 49 ++++++++++---- 2 files changed, 81 insertions(+), 95 deletions(-) diff --git a/cmd/prometheus/main.go b/cmd/prometheus/main.go index 62796f9b6..fc2bd4a5b 100644 --- a/cmd/prometheus/main.go +++ b/cmd/prometheus/main.go @@ -106,35 +106,24 @@ func init() { } } -// agentOnlySetting can be provided to a kingpin flag's PreAction to mark a -// flag as agent-only. -func agentOnlySetting() func(*kingpin.ParseContext) error { - return func(pc *kingpin.ParseContext) error { - agentOnlyFlags = append(agentOnlyFlags, extractFlagName(pc)) - return nil - } +// serverOnlyFlag creates server-only kingpin flag. +func serverOnlyFlag(app *kingpin.Application, name, help string) *kingpin.FlagClause { + return app.Flag(name, fmt.Sprintf("%s Use with server mode only.", help)). + PreAction(func(parseContext *kingpin.ParseContext) error { + // This will be invoked only if flag is actually provided by user. + serverOnlyFlags = append(serverOnlyFlags, "--"+name) + return nil + }) } -// serverOnlySetting can be provided to a kingpin flag's PreAction to mark a -// flag as server-only. -func serverOnlySetting() func(*kingpin.ParseContext) error { - return func(pc *kingpin.ParseContext) error { - serverOnlyFlags = append(serverOnlyFlags, extractFlagName(pc)) - return nil - } -} - -// extractFlagName gets the flag name from the ParseContext. Only call -// from agentOnlySetting or serverOnlySetting. -func extractFlagName(pc *kingpin.ParseContext) string { - for _, pe := range pc.Elements { - fc, ok := pe.Clause.(*kingpin.FlagClause) - if !ok { - continue - } - return "--" + fc.Model().Name - } - panic("extractFlagName not called from a kingpin PreAction. This is a bug, please report to Prometheus.") +// agentOnlyFlag creates agent-only kingpin flag. +func agentOnlyFlag(app *kingpin.Application, name, help string) *kingpin.FlagClause { + return app.Flag(name, fmt.Sprintf("%s Use with agent mode only.", help)). + PreAction(func(parseContext *kingpin.ParseContext) error { + // This will be invoked only if flag is actually provided by user. + agentOnlyFlags = append(agentOnlyFlags, "--"+name) + return nil + }) } type flagConfig struct { @@ -285,106 +274,83 @@ func main() { a.Flag("web.cors.origin", `Regex for CORS origin. It is fully anchored. Example: 'https?://(domain1|domain2)\.com'`). Default(".*").StringVar(&cfg.corsRegexString) - a.Flag("storage.tsdb.path", "Base path for metrics storage."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "storage.tsdb.path", "Base path for metrics storage."). Default("data/").StringVar(&cfg.localStoragePath) - a.Flag("storage.tsdb.min-block-duration", "Minimum duration of a data block before being persisted. For use in testing."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "storage.tsdb.min-block-duration", "Minimum duration of a data block before being persisted. For use in testing."). Hidden().Default("2h").SetValue(&cfg.tsdb.MinBlockDuration) - a.Flag("storage.tsdb.max-block-duration", + serverOnlyFlag(a, "storage.tsdb.max-block-duration", "Maximum duration compacted blocks may span. For use in testing. (Defaults to 10% of the retention period.)"). - PreAction(serverOnlySetting()). Hidden().PlaceHolder("").SetValue(&cfg.tsdb.MaxBlockDuration) - a.Flag("storage.tsdb.max-block-chunk-segment-size", + serverOnlyFlag(a, "storage.tsdb.max-block-chunk-segment-size", "The maximum size for a single chunk segment in a block. Example: 512MB"). - PreAction(serverOnlySetting()). Hidden().PlaceHolder("").BytesVar(&cfg.tsdb.MaxBlockChunkSegmentSize) - a.Flag("storage.tsdb.wal-segment-size", + serverOnlyFlag(a, "storage.tsdb.wal-segment-size", "Size at which to split the tsdb WAL segment files. Example: 100MB"). - PreAction(serverOnlySetting()). Hidden().PlaceHolder("").BytesVar(&cfg.tsdb.WALSegmentSize) - a.Flag("storage.tsdb.retention", "[DEPRECATED] How long to retain samples in storage. This flag has been deprecated, use \"storage.tsdb.retention.time\" instead."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "storage.tsdb.retention", "[DEPRECATED] How long to retain samples in storage. This flag has been deprecated, use \"storage.tsdb.retention.time\" instead."). SetValue(&oldFlagRetentionDuration) - a.Flag("storage.tsdb.retention.time", "How long to retain samples in storage. When this flag is set it overrides \"storage.tsdb.retention\". If neither this flag nor \"storage.tsdb.retention\" nor \"storage.tsdb.retention.size\" is set, the retention time defaults to "+defaultRetentionString+". Units Supported: y, w, d, h, m, s, ms."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "storage.tsdb.retention.time", "How long to retain samples in storage. When this flag is set it overrides \"storage.tsdb.retention\". If neither this flag nor \"storage.tsdb.retention\" nor \"storage.tsdb.retention.size\" is set, the retention time defaults to "+defaultRetentionString+". Units Supported: y, w, d, h, m, s, ms."). SetValue(&newFlagRetentionDuration) - a.Flag("storage.tsdb.retention.size", "Maximum number of bytes that can be stored for blocks. A unit is required, supported units: B, KB, MB, GB, TB, PB, EB. Ex: \"512MB\"."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "storage.tsdb.retention.size", "Maximum number of bytes that can be stored for blocks. A unit is required, supported units: B, KB, MB, GB, TB, PB, EB. Ex: \"512MB\"."). BytesVar(&cfg.tsdb.MaxBytes) - a.Flag("storage.tsdb.no-lockfile", "Do not create lockfile in data directory."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "storage.tsdb.no-lockfile", "Do not create lockfile in data directory."). Default("false").BoolVar(&cfg.tsdb.NoLockfile) - a.Flag("storage.tsdb.allow-overlapping-blocks", "Allow overlapping blocks, which in turn enables vertical compaction and vertical query merge."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "storage.tsdb.allow-overlapping-blocks", "Allow overlapping blocks, which in turn enables vertical compaction and vertical query merge."). Default("false").BoolVar(&cfg.tsdb.AllowOverlappingBlocks) - a.Flag("storage.tsdb.wal-compression", "Compress the tsdb WAL."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "storage.tsdb.wal-compression", "Compress the tsdb WAL."). Hidden().Default("true").BoolVar(&cfg.tsdb.WALCompression) - a.Flag("storage.agent.path", "Base path for metrics storage."). - PreAction(agentOnlySetting()). + agentOnlyFlag(a, "storage.agent.path", "Base path for metrics storage."). Default("data-agent/").StringVar(&cfg.localStoragePath) - a.Flag("storage.agent.wal-segment-size", + agentOnlyFlag(a, "storage.agent.wal-segment-size", "Size at which to split WAL segment files. Example: 100MB"). - PreAction(agentOnlySetting()). Hidden().PlaceHolder("").BytesVar(&cfg.agent.WALSegmentSize) - a.Flag("storage.agent.wal-compression", "Compress the agent WAL."). - PreAction(agentOnlySetting()). + agentOnlyFlag(a, "storage.agent.wal-compression", "Compress the agent WAL."). Default("true").BoolVar(&cfg.agent.WALCompression) - a.Flag("storage.agent.wal-truncate-frequency", + agentOnlyFlag(a, "storage.agent.wal-truncate-frequency", "The frequency at which to truncate the WAL and remove old data."). - PreAction(agentOnlySetting()). Hidden().PlaceHolder("").SetValue(&cfg.agent.TruncateFrequency) - a.Flag("storage.agent.retention.min-time", + agentOnlyFlag(a, "storage.agent.retention.min-time", "Minimum age samples may be before being considered for deletion when the WAL is truncated"). - PreAction(agentOnlySetting()). SetValue(&cfg.agent.MinWALTime) - a.Flag("storage.agent.retention.max-time", + agentOnlyFlag(a, "storage.agent.retention.max-time", "Maximum age samples may be before being forcibly deleted when the WAL is truncated"). - PreAction(agentOnlySetting()). SetValue(&cfg.agent.MaxWALTime) a.Flag("storage.remote.flush-deadline", "How long to wait flushing sample on shutdown or config reload."). Default("1m").PlaceHolder("").SetValue(&cfg.RemoteFlushDeadline) - a.Flag("storage.remote.read-sample-limit", "Maximum overall number of samples to return via the remote read interface, in a single query. 0 means no limit. This limit is ignored for streamed response types."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "storage.remote.read-sample-limit", "Maximum overall number of samples to return via the remote read interface, in a single query. 0 means no limit. This limit is ignored for streamed response types."). Default("5e7").IntVar(&cfg.web.RemoteReadSampleLimit) - a.Flag("storage.remote.read-concurrent-limit", "Maximum number of concurrent remote read calls. 0 means no limit."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "storage.remote.read-concurrent-limit", "Maximum number of concurrent remote read calls. 0 means no limit."). Default("10").IntVar(&cfg.web.RemoteReadConcurrencyLimit) - a.Flag("storage.remote.read-max-bytes-in-frame", "Maximum number of bytes in a single frame for streaming remote read response types before marshalling. Note that client might have limit on frame size as well. 1MB as recommended by protobuf by default."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "storage.remote.read-max-bytes-in-frame", "Maximum number of bytes in a single frame for streaming remote read response types before marshalling. Note that client might have limit on frame size as well. 1MB as recommended by protobuf by default."). Default("1048576").IntVar(&cfg.web.RemoteReadBytesInFrame) - a.Flag("rules.alert.for-outage-tolerance", "Max time to tolerate prometheus outage for restoring \"for\" state of alert."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "rules.alert.for-outage-tolerance", "Max time to tolerate prometheus outage for restoring \"for\" state of alert."). Default("1h").SetValue(&cfg.outageTolerance) - a.Flag("rules.alert.for-grace-period", "Minimum duration between alert and restored \"for\" state. This is maintained only for alerts with configured \"for\" time greater than grace period."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "rules.alert.for-grace-period", "Minimum duration between alert and restored \"for\" state. This is maintained only for alerts with configured \"for\" time greater than grace period."). Default("10m").SetValue(&cfg.forGracePeriod) - a.Flag("rules.alert.resend-delay", "Minimum amount of time to wait before resending an alert to Alertmanager."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "rules.alert.resend-delay", "Minimum amount of time to wait before resending an alert to Alertmanager."). Default("1m").SetValue(&cfg.resendDelay) a.Flag("scrape.adjust-timestamps", "Adjust scrape timestamps by up to `scrape.timestamp-tolerance` to align them to the intended schedule. See https://github.com/prometheus/prometheus/issues/7846 for more context. Experimental. This flag will be removed in a future release."). @@ -393,27 +359,22 @@ func main() { a.Flag("scrape.timestamp-tolerance", "Timestamp tolerance. See https://github.com/prometheus/prometheus/issues/7846 for more context. Experimental. This flag will be removed in a future release."). Hidden().Default("2ms").DurationVar(&scrape.ScrapeTimestampTolerance) - a.Flag("alertmanager.notification-queue-capacity", "The capacity of the queue for pending Alertmanager notifications."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "alertmanager.notification-queue-capacity", "The capacity of the queue for pending Alertmanager notifications."). Default("10000").IntVar(&cfg.notifier.QueueCapacity) // TODO: Remove in Prometheus 3.0. alertmanagerTimeout := a.Flag("alertmanager.timeout", "[DEPRECATED] This flag has no effect.").Hidden().String() - a.Flag("query.lookback-delta", "The maximum lookback duration for retrieving metrics during expression evaluations and federation."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "query.lookback-delta", "The maximum lookback duration for retrieving metrics during expression evaluations and federation."). Default("5m").SetValue(&cfg.lookbackDelta) - a.Flag("query.timeout", "Maximum time a query may take before being aborted."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "query.timeout", "Maximum time a query may take before being aborted."). Default("2m").SetValue(&cfg.queryTimeout) - a.Flag("query.max-concurrency", "Maximum number of queries executed concurrently."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "query.max-concurrency", "Maximum number of queries executed concurrently."). Default("20").IntVar(&cfg.queryConcurrency) - a.Flag("query.max-samples", "Maximum number of samples a single query can load into memory. Note that queries will fail if they try to load more samples than this into memory, so this also limits the number of samples a query can return."). - PreAction(serverOnlySetting()). + serverOnlyFlag(a, "query.max-samples", "Maximum number of samples a single query can load into memory. Note that queries will fail if they try to load more samples than this into memory, so this also limits the number of samples a query can return."). Default("50000000").IntVar(&cfg.queryMaxSamples) a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-at-modifier, promql-negative-offset, remote-write-receiver, extra-scrape-metrics, new-service-discovery-manager. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details."). diff --git a/cmd/prometheus/main_test.go b/cmd/prometheus/main_test.go index 94196a986..c8f15b559 100644 --- a/cmd/prometheus/main_test.go +++ b/cmd/prometheus/main_test.go @@ -14,6 +14,7 @@ package main import ( + "bytes" "context" "fmt" "io/ioutil" @@ -21,6 +22,7 @@ import ( "os" "os/exec" "path/filepath" + "strings" "syscall" "testing" "time" @@ -355,13 +357,11 @@ func getCurrentGaugeValuesFor(t *testing.T, reg prometheus.Gatherer, metricNames func TestAgentSuccessfulStartup(t *testing.T) { prom := exec.Command(promPath, "-test.main", "--enable-feature=agent", "--config.file="+agentConfig) - err := prom.Start() - require.NoError(t, err) + require.NoError(t, prom.Start()) - expectedExitStatus := 0 actualExitStatus := 0 - done := make(chan error, 1) + go func() { done <- prom.Wait() }() select { case err := <-done: @@ -370,18 +370,19 @@ func TestAgentSuccessfulStartup(t *testing.T) { case <-time.After(5 * time.Second): prom.Process.Kill() } - require.Equal(t, expectedExitStatus, actualExitStatus) + require.Equal(t, 0, actualExitStatus) } -func TestAgentStartupWithInvalidConfig(t *testing.T) { - prom := exec.Command(promPath, "-test.main", "--enable-feature=agent", "--config.file="+promConfig) - err := prom.Start() - require.NoError(t, err) +func TestAgentFailedStartupWithServerFlag(t *testing.T) { + prom := exec.Command(promPath, "-test.main", "--enable-feature=agent", "--storage.tsdb.path=.", "--config.file="+promConfig) + + output := bytes.Buffer{} + prom.Stderr = &output + require.NoError(t, prom.Start()) - expectedExitStatus := 2 actualExitStatus := 0 - done := make(chan error, 1) + go func() { done <- prom.Wait() }() select { case err := <-done: @@ -390,7 +391,31 @@ func TestAgentStartupWithInvalidConfig(t *testing.T) { case <-time.After(5 * time.Second): prom.Process.Kill() } - require.Equal(t, expectedExitStatus, actualExitStatus) + + require.Equal(t, 3, actualExitStatus) + + // Assert on last line. + lines := strings.Split(output.String(), "\n") + last := lines[len(lines)-1] + require.Equal(t, "The following flag(s) can not be used in agent mode: [\"--storage.tsdb.path\"]", last) +} + +func TestAgentFailedStartupWithInvalidConfig(t *testing.T) { + prom := exec.Command(promPath, "-test.main", "--enable-feature=agent", "--config.file="+promConfig) + require.NoError(t, prom.Start()) + + actualExitStatus := 0 + done := make(chan error, 1) + + go func() { done <- prom.Wait() }() + select { + case err := <-done: + t.Logf("prometheus agent should not be running: %v", err) + actualExitStatus = prom.ProcessState.ExitCode() + case <-time.After(5 * time.Second): + prom.Process.Kill() + } + require.Equal(t, 2, actualExitStatus) } func TestModeSpecificFlags(t *testing.T) {