Support reload config automatically

Signed-off-by: Julien <roidelapluie@o11y.eu>
This commit is contained in:
Julien 2024-08-30 13:25:02 +02:00
parent d63f5b35df
commit 1cd2d0498b
5 changed files with 379 additions and 2 deletions

View file

@ -154,6 +154,9 @@ type flagConfig struct {
RemoteFlushDeadline model.Duration
nameEscapingScheme string
enableAutoReload bool
autoReloadInterval model.Duration
featureList []string
memlimitRatio float64
// These options are extracted from featureList
@ -212,6 +215,12 @@ func (c *flagConfig) setFeatureListOptions(logger log.Logger) error {
case "auto-gomaxprocs":
c.enableAutoGOMAXPROCS = true
level.Info(logger).Log("msg", "Automatically set GOMAXPROCS to match Linux container CPU quota")
case "auto-reload-config":
c.enableAutoReload = true
if s := time.Duration(c.autoReloadInterval).Seconds(); s > 0 && s < 1 {
c.autoReloadInterval, _ = model.ParseDuration("1s")
}
level.Info(logger).Log("msg", fmt.Sprintf("Enabled automatic configuration file reloading. Checking for configuration changes every %s.", c.autoReloadInterval))
case "auto-gomemlimit":
c.enableAutoGOMEMLIMIT = true
level.Info(logger).Log("msg", "Automatically set GOMEMLIMIT to match Linux container or system memory limit")
@ -302,6 +311,9 @@ func main() {
a.Flag("config.file", "Prometheus configuration file path.").
Default("prometheus.yml").StringVar(&cfg.configFile)
a.Flag("config.auto-reload-interval", "Specifies the interval for checking and automatically reloading the Prometheus configuration file upon detecting changes.").
Default("30s").SetValue(&cfg.autoReloadInterval)
a.Flag("web.listen-address", "Address to listen on for UI, API, and telemetry. Can be repeated.").
Default("0.0.0.0:9090").StringsVar(&cfg.web.ListenAddresses)
@ -492,7 +504,7 @@ func main() {
a.Flag("scrape.name-escaping-scheme", `Method for escaping legacy invalid names when sending to Prometheus that does not support UTF-8. Can be one of "values", "underscores", or "dots".`).Default(scrape.DefaultNameEscapingScheme.String()).StringVar(&cfg.nameEscapingScheme)
a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: agent, auto-gomaxprocs, auto-gomemlimit, concurrent-rule-eval, created-timestamp-zero-ingestion, delayed-compaction, exemplar-storage, expand-external-labels, extra-scrape-metrics, memory-snapshot-on-shutdown, native-histograms, new-service-discovery-manager, no-default-scrape-port, otlp-write-receiver, promql-experimental-functions, promql-delayed-name-removal, promql-per-step-stats, remote-write-receiver (DEPRECATED), utf8-names. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details.").
a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: agent, auto-gomaxprocs, auto-gomemlimit, auto-reload-config, concurrent-rule-eval, created-timestamp-zero-ingestion, delayed-compaction, exemplar-storage, expand-external-labels, extra-scrape-metrics, memory-snapshot-on-shutdown, native-histograms, new-service-discovery-manager, no-default-scrape-port, otlp-write-receiver, promql-experimental-functions, promql-delayed-name-removal, promql-per-step-stats, remote-write-receiver (DEPRECATED), utf8-names. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details.").
Default("").StringsVar(&cfg.featureList)
promlogflag.AddFlags(a, &cfg.promlogConfig)
@ -1120,6 +1132,15 @@ func main() {
hup := make(chan os.Signal, 1)
signal.Notify(hup, syscall.SIGHUP)
cancel := make(chan struct{})
var checksum string
if cfg.enableAutoReload {
checksum, err = config.GenerateChecksum(cfg.configFile)
if err != nil {
level.Error(logger).Log("msg", "Failed to generate initial checksum for configuration file", "err", err)
}
}
g.Add(
func() error {
<-reloadReady.C
@ -1129,6 +1150,12 @@ func main() {
case <-hup:
if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil {
level.Error(logger).Log("msg", "Error reloading config", "err", err)
} else if cfg.enableAutoReload {
if currentChecksum, err := config.GenerateChecksum(cfg.configFile); err == nil {
checksum = currentChecksum
} else {
level.Error(logger).Log("msg", "Failed to generate checksum during configuration reload", "err", err)
}
}
case rc := <-webHandler.Reload():
if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil {
@ -1137,6 +1164,32 @@ func main() {
} else {
rc <- nil
}
if cfg.enableAutoReload {
if currentChecksum, err := config.GenerateChecksum(cfg.configFile); err == nil {
checksum = currentChecksum
} else {
level.Error(logger).Log("msg", "Failed to generate checksum during configuration reload", "err", err)
}
}
case <-time.Tick(time.Duration(cfg.autoReloadInterval)):
if !cfg.enableAutoReload {
continue
}
currentChecksum, err := config.GenerateChecksum(cfg.configFile)
if err != nil {
level.Error(logger).Log("msg", "Failed to generate checksum during configuration reload", "err", err)
continue
}
if currentChecksum == checksum {
continue
}
level.Info(logger).Log("msg", "Configuration file change detected, reloading the configuration.")
if err := reloadConfig(cfg.configFile, cfg.enableExpandExternalLabels, cfg.tsdb.EnableExemplarStorage, logger, noStepSubqueryInterval, reloaders...); err != nil {
level.Error(logger).Log("msg", "Error reloading config", "err", err)
} else {
checksum = currentChecksum
}
case <-cancel:
return nil
}

86
config/reload.go Normal file
View file

@ -0,0 +1,86 @@
// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package config
import (
"crypto/sha256"
"encoding/hex"
"fmt"
"os"
"path/filepath"
"gopkg.in/yaml.v2"
)
// ExternalFilesConfig captures the top-level keys of the main configuration
// file that list other files: rule_files and scrape_config_files. All other
// keys of the configuration are left out of this struct.
type ExternalFilesConfig struct {
// RuleFiles holds the entries listed under the rule_files key.
RuleFiles []string `yaml:"rule_files"`
// ScrapeConfigFiles holds the entries listed under the scrape_config_files key.
ScrapeConfigFiles []string `yaml:"scrape_config_files"`
}
// GenerateChecksum generates a checksum of the YAML file and the files it references.
//
// The returned value is the hex-encoded SHA-256 of, in order: the raw content
// of yamlFilePath, then for every file matched by a rule_files pattern and
// afterwards by a scrape_config_files pattern, a kind marker ("r" or "s"), the
// matched path and the file content, each terminated by a NUL byte.
//
// Relative patterns are resolved against the directory of yamlFilePath;
// absolute patterns are used as they are. Patterns that match no file add
// nothing to the hash, so a referenced file that appears or disappears changes
// the checksum.
//
// An error is returned if yamlFilePath cannot be read or parsed as YAML, if a
// pattern is malformed, or if a matched file cannot be read.
func GenerateChecksum(yamlFilePath string) (string, error) {
	hash := sha256.New()

	yamlContent, err := os.ReadFile(yamlFilePath)
	if err != nil {
		return "", fmt.Errorf("error reading YAML file: %w", err)
	}
	// hash.Hash documents that Write never returns an error.
	_, _ = hash.Write(yamlContent)

	var refs ExternalFilesConfig
	if err := yaml.Unmarshal(yamlContent, &refs); err != nil {
		return "", fmt.Errorf("error unmarshalling YAML: %w", err)
	}

	dir := filepath.Dir(yamlFilePath)

	// The order of the groups is part of the hashed stream: rule files first,
	// scrape config files second.
	groups := []struct {
		prefix   string
		patterns []string
	}{
		{prefix: "r", patterns: refs.RuleFiles},         // "r" for rule files
		{prefix: "s", patterns: refs.ScrapeConfigFiles}, // "s" for scrape config files
	}

	for _, group := range groups {
		for _, pattern := range group.patterns {
			// filepath.Join would re-root an absolute pattern below dir, so
			// only relative patterns are joined with the config directory.
			if !filepath.IsAbs(pattern) {
				pattern = filepath.Join(dir, pattern)
			}

			matchingFiles, err := filepath.Glob(pattern)
			if err != nil {
				return "", fmt.Errorf("error finding files with pattern %s: %w", pattern, err)
			}

			for _, file := range matchingFiles {
				// Write prefix to the hash ("r" or "s") followed by \0.
				_, _ = hash.Write([]byte(group.prefix + "\x00"))

				// Write the file path to the hash, followed by \0 to ensure
				// separation.
				_, _ = hash.Write([]byte(file + "\x00"))

				// Read and hash the content of the file, followed by \0. The
				// terminator is written separately so the content slice is
				// never grown or copied.
				content, err := os.ReadFile(file)
				if err != nil {
					return "", fmt.Errorf("error reading file %s: %w", file, err)
				}
				_, _ = hash.Write(content)
				_, _ = hash.Write([]byte{0})
			}
		}
	}

	return hex.EncodeToString(hash.Sum(nil)), nil
}

225
config/reload_test.go Normal file
View file

@ -0,0 +1,225 @@
// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package config
import (
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/require"
)
// TestGenerateChecksum checks that the checksum changes whenever the main
// configuration file or one of the files it references is modified, emptied,
// deleted or no longer referenced, and that it returns to its original value
// once the change is reverted.
func TestGenerateChecksum(t *testing.T) {
	// t.TempDir creates a fresh directory that is removed automatically when
	// the test and all its subtests complete.
	tmpDir := t.TempDir()

	// Define paths for the temporary files.
	yamlFilePath := filepath.Join(tmpDir, "test.yml")
	ruleFilePath := filepath.Join(tmpDir, "rule_file.yml")
	scrapeConfigFilePath := filepath.Join(tmpDir, "scrape_config.yml")

	// Define initial and modified content for the files.
	const (
		originalRuleContent         = "groups:\n- name: example\n rules:\n - alert: ExampleAlert"
		modifiedRuleContent         = "groups:\n- name: example\n rules:\n - alert: ModifiedAlert"
		originalScrapeConfigContent = "scrape_configs:\n- job_name: example"
		modifiedScrapeConfigContent = "scrape_configs:\n- job_name: modified_example"
	)

	// Define YAML content referencing the rule and scrape config files.
	const yamlContent = `
rule_files:
- rule_file.yml
scrape_config_files:
- scrape_config.yml
`

	// Main file with an extra key but the same references.
	const yamlWithGlobal = `
global:
scrape_interval: 3s
rule_files:
- rule_file.yml
scrape_config_files:
- scrape_config.yml
`

	// Main file that no longer references the rule file.
	const yamlWithoutRuleFile = `
scrape_config_files:
- scrape_config.yml
`

	// Main file that no longer references the scrape config file.
	const yamlWithoutScrapeConfig = `
rule_files:
- rule_file.yml
`

	// write returns a step that (over)writes path with content.
	write := func(path, content string) func(*testing.T) {
		return func(t *testing.T) {
			t.Helper()
			require.NoError(t, os.WriteFile(path, []byte(content), 0o644))
		}
	}
	// remove returns a step that deletes path.
	remove := func(path string) func(*testing.T) {
		return func(t *testing.T) {
			t.Helper()
			require.NoError(t, os.Remove(path))
		}
	}

	// Write initial content to files.
	write(ruleFilePath, originalRuleContent)(t)
	write(scrapeConfigFilePath, originalScrapeConfigContent)(t)
	write(yamlFilePath, yamlContent)(t)

	// Generate the original checksum.
	originalChecksum := calculateChecksum(t, yamlFilePath)

	// Every case applies a change that must alter the checksum and then
	// restores the initial state, which must bring the checksum back. The
	// cases run sequentially on the same files, so each one relies on the
	// previous one having restored the initial state.
	testCases := []struct {
		name    string
		mutate  func(*testing.T)
		restore func(*testing.T)
	}{
		{
			name:    "Rule File Change",
			mutate:  write(ruleFilePath, modifiedRuleContent),
			restore: write(ruleFilePath, originalRuleContent),
		},
		{
			name:    "Scrape Config Change",
			mutate:  write(scrapeConfigFilePath, modifiedScrapeConfigContent),
			restore: write(scrapeConfigFilePath, originalScrapeConfigContent),
		},
		{
			name:    "Rule File Deletion",
			mutate:  remove(ruleFilePath),
			restore: write(ruleFilePath, originalRuleContent),
		},
		{
			name:    "Scrape Config Deletion",
			mutate:  remove(scrapeConfigFilePath),
			restore: write(scrapeConfigFilePath, originalScrapeConfigContent),
		},
		{
			name:    "Main File Change",
			mutate:  write(yamlFilePath, yamlWithGlobal),
			restore: write(yamlFilePath, yamlContent),
		},
		{
			name:    "Rule File Removed from YAML Config",
			mutate:  write(yamlFilePath, yamlWithoutRuleFile),
			restore: write(yamlFilePath, yamlContent),
		},
		{
			name:    "Scrape Config Removed from YAML Config",
			mutate:  write(yamlFilePath, yamlWithoutScrapeConfig),
			restore: write(yamlFilePath, yamlContent),
		},
		{
			name:    "Empty Rule File",
			mutate:  write(ruleFilePath, ""),
			restore: write(ruleFilePath, originalRuleContent),
		},
		{
			name:    "Empty Scrape Config File",
			mutate:  write(scrapeConfigFilePath, ""),
			restore: write(scrapeConfigFilePath, originalScrapeConfigContent),
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Apply the change; the checksum should change.
			tc.mutate(t)
			changedChecksum := calculateChecksum(t, yamlFilePath)
			require.NotEqual(t, originalChecksum, changedChecksum)

			// Revert the change; the checksum should return to the original.
			tc.restore(t)
			revertedChecksum := calculateChecksum(t, yamlFilePath)
			require.Equal(t, originalChecksum, revertedChecksum)
		})
	}
}
// calculateChecksum generates a checksum for the given YAML file path. It
// fails the test immediately if GenerateChecksum returns an error or an empty
// checksum.
func calculateChecksum(t *testing.T, yamlFilePath string) string {
	// Mark this function as a helper so that assertion failures are reported
	// at the line of the calling test rather than inside this function.
	t.Helper()

	checksum, err := GenerateChecksum(yamlFilePath)
	require.NoError(t, err)
	require.NotEmpty(t, checksum)
	return checksum
}

View file

@ -15,6 +15,7 @@ The Prometheus monitoring server
| <code class="text-nowrap">-h</code>, <code class="text-nowrap">--help</code> | Show context-sensitive help (also try --help-long and --help-man). | |
| <code class="text-nowrap">--version</code> | Show application version. | |
| <code class="text-nowrap">--config.file</code> | Prometheus configuration file path. | `prometheus.yml` |
| <code class="text-nowrap">--config.auto-reload-interval</code> | Specifies the interval for checking and automatically reloading the Prometheus configuration file upon detecting changes. | `30s` |
| <code class="text-nowrap">--web.listen-address</code> <code class="text-nowrap">...<code class="text-nowrap"> | Address to listen on for UI, API, and telemetry. Can be repeated. | `0.0.0.0:9090` |
| <code class="text-nowrap">--auto-gomemlimit.ratio</code> | The ratio of reserved GOMEMLIMIT memory to the detected maximum container or system memory | `0.9` |
| <code class="text-nowrap">--web.config.file</code> | [EXPERIMENTAL] Path to configuration file that can enable TLS or authentication. | |
@ -57,7 +58,7 @@ The Prometheus monitoring server
| <code class="text-nowrap">--query.max-concurrency</code> | Maximum number of queries executed concurrently. Use with server mode only. | `20` |
| <code class="text-nowrap">--query.max-samples</code> | Maximum number of samples a single query can load into memory. Note that queries will fail if they try to load more samples than this into memory, so this also limits the number of samples a query can return. Use with server mode only. | `50000000` |
| <code class="text-nowrap">--scrape.name-escaping-scheme</code> | Method for escaping legacy invalid names when sending to Prometheus that does not support UTF-8. Can be one of "values", "underscores", or "dots". | `values` |
| <code class="text-nowrap">--enable-feature</code> <code class="text-nowrap">...<code class="text-nowrap"> | Comma separated feature names to enable. Valid options: agent, auto-gomaxprocs, auto-gomemlimit, concurrent-rule-eval, created-timestamp-zero-ingestion, delayed-compaction, exemplar-storage, expand-external-labels, extra-scrape-metrics, memory-snapshot-on-shutdown, native-histograms, new-service-discovery-manager, no-default-scrape-port, otlp-write-receiver, promql-experimental-functions, promql-delayed-name-removal, promql-per-step-stats, remote-write-receiver (DEPRECATED), utf8-names. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | |
| <code class="text-nowrap">--enable-feature</code> <code class="text-nowrap">...<code class="text-nowrap"> | Comma separated feature names to enable. Valid options: agent, auto-gomaxprocs, auto-gomemlimit, auto-reload-config, concurrent-rule-eval, created-timestamp-zero-ingestion, delayed-compaction, exemplar-storage, expand-external-labels, extra-scrape-metrics, memory-snapshot-on-shutdown, native-histograms, new-service-discovery-manager, no-default-scrape-port, otlp-write-receiver, promql-experimental-functions, promql-delayed-name-removal, promql-per-step-stats, remote-write-receiver (DEPRECATED), utf8-names. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. | |
| <code class="text-nowrap">--log.level</code> | Only log messages with the given severity or above. One of: [debug, info, warn, error] | `info` |
| <code class="text-nowrap">--log.format</code> | Output format of log messages. One of: [logfmt, json] | `logfmt` |

View file

@ -265,3 +265,15 @@ This allows optionally preserving the `__name__` label via the `label_replace` a
When enabled, changes the metric and label name validation scheme inside Prometheus to allow the full UTF-8 character set.
By itself, this flag does not enable the request of UTF-8 names via content negotiation.
Users will also have to set `metric_name_validation_scheme` in scrape configs to enable the feature either on the global config or on a per-scrape config basis.
## Auto Reload Config
`--enable-feature=auto-reload-config`
When enabled, Prometheus checks at a regular interval whether its configuration
has changed and, if so, automatically reloads it. The interval is defined by the
`--config.auto-reload-interval` flag, which defaults to `30s`.
Configuration reloads are triggered by detecting changes in the checksum of the
main configuration file or any referenced files, such as rule and scrape
configurations.