2015-06-15 03:36:32 -07:00
// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
2016-02-17 14:52:44 -08:00
// The main package for the Prometheus server executable.
2015-06-15 03:36:32 -07:00
package main
import (
2017-10-24 21:21:42 -07:00
"context"
2015-06-15 03:23:02 -07:00
"fmt"
2020-02-17 03:41:04 -08:00
"math"
2020-10-06 12:42:56 -07:00
"math/bits"
2017-06-20 09:48:17 -07:00
"net"
2017-10-06 03:22:19 -07:00
"net/http"
2015-06-15 03:36:32 -07:00
_ "net/http/pprof" // Comment this line to disable pprof endpoint.
2017-06-20 09:48:17 -07:00
"net/url"
2015-06-15 03:36:32 -07:00
"os"
"os/signal"
2017-06-20 08:38:01 -07:00
"path/filepath"
2017-09-04 04:10:32 -07:00
"runtime"
2017-06-20 09:48:17 -07:00
"strings"
2018-01-17 10:14:24 -08:00
"sync"
2015-06-15 03:36:32 -07:00
"syscall"
"time"
2020-02-18 03:25:36 -08:00
"github.com/alecthomas/units"
2021-06-11 09:17:59 -07:00
"github.com/go-kit/log"
"github.com/go-kit/log/level"
2022-02-12 15:58:27 -08:00
"github.com/grafana/regexp"
2019-03-25 16:01:12 -07:00
conntrack "github.com/mwitkow/go-conntrack"
2019-04-19 05:55:28 -07:00
"github.com/oklog/run"
2017-06-20 09:48:17 -07:00
"github.com/pkg/errors"
2015-06-23 09:04:04 -07:00
"github.com/prometheus/client_golang/prometheus"
2017-06-20 09:48:17 -07:00
"github.com/prometheus/common/model"
2019-03-25 16:01:12 -07:00
"github.com/prometheus/common/promlog"
2020-10-22 02:00:08 -07:00
promlogflag "github.com/prometheus/common/promlog/flag"
2016-05-05 04:46:51 -07:00
"github.com/prometheus/common/version"
2021-01-13 12:37:01 -08:00
toolkit_web "github.com/prometheus/exporter-toolkit/web"
toolkit_webflag "github.com/prometheus/exporter-toolkit/web/kingpinflag"
2020-07-30 00:45:42 -07:00
"go.uber.org/atomic"
2019-03-25 16:01:12 -07:00
kingpin "gopkg.in/alecthomas/kingpin.v2"
2020-10-06 05:54:12 -07:00
klog "k8s.io/klog"
klogv2 "k8s.io/klog/v2"
promql: Allow per-query contexts.
For Weaveworks' Frankenstein, we need to support multitenancy. In
Frankenstein, we initially solved this without modifying the promql
package at all: we constructed a new promql.Engine for every
query and injected a storage implementation into that engine which would
be primed to only collect data for a given user.
This is problematic to upstream, however. Prometheus assumes that there
is only one engine: the query concurrency gate is part of the engine,
and the engine contains one central cancellable context to shut down all
queries. Also, creating a new engine for every query seems like overkill.
Thus, we want to be able to pass per-query contexts into a single engine.
This change gets rid of the promql.Engine's built-in base context and
allows passing in a per-query context instead. Central cancellation of
all queries is still possible by deriving all passed-in contexts from
one central one, but this is now the responsibility of the caller. The
central query context is now created in main() and passed into the
relevant components (web handler / API, rule manager).
In a next step, the per-query context would have to be passed to the
storage implementation, so that the storage can implement multi-tenancy
or other features based on the contextual information.
2016-09-15 04:52:50 -07:00
2015-06-15 03:36:32 -07:00
"github.com/prometheus/prometheus/config"
2017-11-25 05:13:54 -08:00
"github.com/prometheus/prometheus/discovery"
2021-10-20 01:15:54 -07:00
"github.com/prometheus/prometheus/discovery/legacymanager"
"github.com/prometheus/prometheus/discovery/targetgroup"
2021-11-08 06:23:17 -08:00
"github.com/prometheus/prometheus/model/exemplar"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/model/relabel"
2016-03-01 03:37:22 -08:00
"github.com/prometheus/prometheus/notifier"
2022-03-29 05:44:39 -07:00
_ "github.com/prometheus/prometheus/plugins" // Register plugins.
2015-06-15 03:36:32 -07:00
"github.com/prometheus/prometheus/promql"
"github.com/prometheus/prometheus/rules"
2018-02-01 01:55:07 -08:00
"github.com/prometheus/prometheus/scrape"
2017-05-10 02:44:13 -07:00
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/storage/remote"
2022-01-25 02:08:04 -08:00
"github.com/prometheus/prometheus/tracing"
2020-02-06 07:58:38 -08:00
"github.com/prometheus/prometheus/tsdb"
2021-10-29 08:25:05 -07:00
"github.com/prometheus/prometheus/tsdb/agent"
2021-11-08 06:23:17 -08:00
"github.com/prometheus/prometheus/util/logging"
prom_runtime "github.com/prometheus/prometheus/util/runtime"
2017-11-23 23:59:05 -08:00
"github.com/prometheus/prometheus/util/strutil"
2015-06-15 03:36:32 -07:00
"github.com/prometheus/prometheus/web"
)
2017-06-20 09:48:17 -07:00
var (
2021-10-29 08:25:05 -07:00
appName = "prometheus"
2017-06-20 09:48:17 -07:00
configSuccess = prometheus . NewGauge ( prometheus . GaugeOpts {
2018-03-21 09:08:37 -07:00
Name : "prometheus_config_last_reload_successful" ,
Help : "Whether the last configuration reload attempt was successful." ,
2017-06-20 09:48:17 -07:00
} )
configSuccessTime = prometheus . NewGauge ( prometheus . GaugeOpts {
2018-03-21 09:08:37 -07:00
Name : "prometheus_config_last_reload_success_timestamp_seconds" ,
Help : "Timestamp of the last successful configuration reload." ,
2017-06-20 09:48:17 -07:00
} )
2019-01-18 05:48:36 -08:00
defaultRetentionString = "15d"
defaultRetentionDuration model . Duration
2021-10-29 08:25:05 -07:00
agentMode bool
agentOnlyFlags , serverOnlyFlags [ ] string
2017-06-20 09:48:17 -07:00
)
func init ( ) {
2021-10-29 08:25:05 -07:00
prometheus . MustRegister ( version . NewCollector ( strings . ReplaceAll ( appName , "-" , "_" ) ) )
2019-01-18 05:48:36 -08:00
var err error
defaultRetentionDuration , err = model . ParseDuration ( defaultRetentionString )
if err != nil {
panic ( err )
}
2017-06-20 09:48:17 -07:00
}
2021-11-04 02:08:53 -07:00
// serverOnlyFlag creates server-only kingpin flag.
func serverOnlyFlag ( app * kingpin . Application , name , help string ) * kingpin . FlagClause {
return app . Flag ( name , fmt . Sprintf ( "%s Use with server mode only." , help ) ) .
PreAction ( func ( parseContext * kingpin . ParseContext ) error {
// This will be invoked only if flag is actually provided by user.
serverOnlyFlags = append ( serverOnlyFlags , "--" + name )
return nil
} )
2021-10-29 08:25:05 -07:00
}
2021-11-04 02:08:53 -07:00
// agentOnlyFlag creates agent-only kingpin flag.
func agentOnlyFlag ( app * kingpin . Application , name , help string ) * kingpin . FlagClause {
return app . Flag ( name , fmt . Sprintf ( "%s Use with agent mode only." , help ) ) .
PreAction ( func ( parseContext * kingpin . ParseContext ) error {
// This will be invoked only if flag is actually provided by user.
agentOnlyFlags = append ( agentOnlyFlags , "--" + name )
return nil
} )
2021-10-29 08:25:05 -07:00
}
2021-01-20 02:57:39 -08:00
type flagConfig struct {
configFile string
2021-11-04 03:08:01 -07:00
agentStoragePath string
2021-11-05 08:50:10 -07:00
serverStoragePath string
2021-01-20 02:57:39 -08:00
notifier notifier . Options
forGracePeriod model . Duration
outageTolerance model . Duration
resendDelay model . Duration
web web . Options
2021-08-24 05:31:14 -07:00
scrape scrape . Options
2021-01-20 02:57:39 -08:00
tsdb tsdbOptions
2021-10-29 08:25:05 -07:00
agent agentOptions
2021-01-20 02:57:39 -08:00
lookbackDelta model . Duration
webTimeout model . Duration
queryTimeout model . Duration
queryConcurrency int
queryMaxSamples int
RemoteFlushDeadline model . Duration
featureList [ ] string
// These options are extracted from featureList
// for ease of use.
2021-03-25 14:28:58 -07:00
enableExpandExternalLabels bool
2021-10-20 01:15:54 -07:00
enableNewSDManager bool
2022-02-01 18:07:23 -08:00
enablePerStepStats bool
2021-01-20 02:57:39 -08:00
prometheusURL string
corsRegexString string
promlogConfig promlog . Config
}
// setFeatureListOptions sets the corresponding options from the featureList.
func ( c * flagConfig ) setFeatureListOptions ( logger log . Logger ) error {
for _ , f := range c . featureList {
opts := strings . Split ( f , "," )
for _ , o := range opts {
switch o {
2021-01-30 03:04:48 -08:00
case "remote-write-receiver" :
2022-01-05 06:26:24 -08:00
c . web . EnableRemoteWriteReceiver = true
level . Warn ( logger ) . Log ( "msg" , "Remote write receiver enabled via feature flag remote-write-receiver. This is DEPRECATED. Use --web.enable-remote-write-receiver." )
2021-03-25 14:28:58 -07:00
case "expand-external-labels" :
c . enableExpandExternalLabels = true
level . Info ( logger ) . Log ( "msg" , "Experimental expand-external-labels enabled" )
2021-03-16 02:47:45 -07:00
case "exemplar-storage" :
2021-07-19 21:52:57 -07:00
c . tsdb . EnableExemplarStorage = true
level . Info ( logger ) . Log ( "msg" , "Experimental in-memory exemplar storage enabled" )
2021-08-06 09:51:01 -07:00
case "memory-snapshot-on-shutdown" :
c . tsdb . EnableMemorySnapshotOnShutdown = true
level . Info ( logger ) . Log ( "msg" , "Experimental memory snapshot on shutdown enabled" )
2021-08-24 05:31:14 -07:00
case "extra-scrape-metrics" :
c . scrape . ExtraMetrics = true
level . Info ( logger ) . Log ( "msg" , "Experimental additional scrape metrics" )
2021-10-20 01:15:54 -07:00
case "new-service-discovery-manager" :
c . enableNewSDManager = true
level . Info ( logger ) . Log ( "msg" , "Experimental service discovery manager" )
2021-11-02 06:03:35 -07:00
case "agent" :
agentMode = true
level . Info ( logger ) . Log ( "msg" , "Experimental agent mode enabled." )
2022-02-01 18:07:23 -08:00
case "promql-per-step-stats" :
c . enablePerStepStats = true
level . Info ( logger ) . Log ( "msg" , "Experimental per-step statistics reporting" )
2021-01-20 02:57:39 -08:00
case "" :
continue
2022-01-11 08:01:02 -08:00
case "promql-at-modifier" , "promql-negative-offset" :
level . Warn ( logger ) . Log ( "msg" , "This option for --enable-feature is now permanently enabled and therefore a no-op." , "option" , o )
2021-01-20 02:57:39 -08:00
default :
level . Warn ( logger ) . Log ( "msg" , "Unknown option for --enable-feature" , "option" , o )
}
}
}
return nil
}
2015-06-15 03:36:32 -07:00
func main ( ) {
2017-09-04 04:10:32 -07:00
if os . Getenv ( "DEBUG" ) != "" {
runtime . SetBlockProfileRate ( 20 )
runtime . SetMutexProfileFraction ( 20 )
}
2019-01-18 05:48:36 -08:00
var (
oldFlagRetentionDuration model . Duration
newFlagRetentionDuration model . Duration
)
2021-01-20 02:57:39 -08:00
cfg := flagConfig {
2017-06-20 09:48:17 -07:00
notifier : notifier . Options {
Registerer : prometheus . DefaultRegisterer ,
} ,
2019-06-24 06:48:15 -07:00
web : web . Options {
Registerer : prometheus . DefaultRegisterer ,
Gatherer : prometheus . DefaultGatherer ,
} ,
2018-11-23 05:22:40 -08:00
promlogConfig : promlog . Config { } ,
2017-06-20 09:48:17 -07:00
}
2021-02-25 10:52:34 -08:00
a := kingpin . New ( filepath . Base ( os . Args [ 0 ] ) , "The Prometheus monitoring server" ) . UsageWriter ( os . Stdout )
2017-06-20 08:38:01 -07:00
2021-10-29 08:25:05 -07:00
a . Version ( version . Print ( appName ) )
2017-06-20 08:38:01 -07:00
a . HelpFlag . Short ( 'h' )
a . Flag ( "config.file" , "Prometheus configuration file path." ) .
Default ( "prometheus.yml" ) . StringVar ( & cfg . configFile )
2017-10-16 15:00:05 -07:00
a . Flag ( "web.listen-address" , "Address to listen on for UI, API, and telemetry." ) .
2017-06-20 08:38:01 -07:00
Default ( "0.0.0.0:9090" ) . StringVar ( & cfg . web . ListenAddress )
2021-01-13 12:37:01 -08:00
webConfig := toolkit_webflag . AddFlags ( a )
2020-12-25 03:45:31 -08:00
2017-06-20 08:38:01 -07:00
a . Flag ( "web.read-timeout" ,
"Maximum duration before timing out read of the request, and closing idle connections." ) .
Default ( "5m" ) . SetValue ( & cfg . webTimeout )
a . Flag ( "web.max-connections" , "Maximum number of simultaneous connections." ) .
Default ( "512" ) . IntVar ( & cfg . web . MaxConnections )
a . Flag ( "web.external-url" ,
"The URL under which Prometheus is externally reachable (for example, if Prometheus is served via a reverse proxy). Used for generating relative and absolute links back to Prometheus itself. If the URL has a path portion, it will be used to prefix all HTTP endpoints served by Prometheus. If omitted, relevant URL components will be derived automatically." ) .
PlaceHolder ( "<URL>" ) . StringVar ( & cfg . prometheusURL )
a . Flag ( "web.route-prefix" ,
"Prefix for the internal routes of web endpoints. Defaults to path of --web.external-url." ) .
PlaceHolder ( "<path>" ) . StringVar ( & cfg . web . RoutePrefix )
a . Flag ( "web.user-assets" , "Path to static asset directory, available at /user." ) .
PlaceHolder ( "<path>" ) . StringVar ( & cfg . web . UserAssetsPath )
2017-07-10 06:44:29 -07:00
a . Flag ( "web.enable-lifecycle" , "Enable shutdown and reload via HTTP request." ) .
Default ( "false" ) . BoolVar ( & cfg . web . EnableLifecycle )
2017-06-20 08:38:01 -07:00
2018-03-20 08:58:19 -07:00
a . Flag ( "web.enable-admin-api" , "Enable API endpoints for admin control actions." ) .
2017-07-10 00:29:41 -07:00
Default ( "false" ) . BoolVar ( & cfg . web . EnableAdminAPI )
2022-01-05 06:26:24 -08:00
a . Flag ( "web.enable-remote-write-receiver" , "Enable API endpoint accepting remote write requests." ) .
Default ( "false" ) . BoolVar ( & cfg . web . EnableRemoteWriteReceiver )
2017-06-20 08:38:01 -07:00
a . Flag ( "web.console.templates" , "Path to the console template directory, available at /consoles." ) .
Default ( "consoles" ) . StringVar ( & cfg . web . ConsoleTemplatesPath )
a . Flag ( "web.console.libraries" , "Path to the console library directory." ) .
Default ( "console_libraries" ) . StringVar ( & cfg . web . ConsoleLibrariesPath )
2018-11-20 20:45:06 -08:00
a . Flag ( "web.page-title" , "Document title of Prometheus instance." ) .
Default ( "Prometheus Time Series Collection and Processing Server" ) . StringVar ( & cfg . web . PageTitle )
2019-05-15 07:59:06 -07:00
a . Flag ( "web.cors.origin" , ` Regex for CORS origin. It is fully anchored. Example: 'https?://(domain1|domain2)\.com' ` ) .
2019-04-10 05:22:05 -07:00
Default ( ".*" ) . StringVar ( & cfg . corsRegexString )
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "storage.tsdb.path" , "Base path for metrics storage." ) .
2021-11-05 08:50:10 -07:00
Default ( "data/" ) . StringVar ( & cfg . serverStoragePath )
2017-06-20 08:38:01 -07:00
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "storage.tsdb.min-block-duration" , "Minimum duration of a data block before being persisted. For use in testing." ) .
2017-12-24 04:13:48 -08:00
Hidden ( ) . Default ( "2h" ) . SetValue ( & cfg . tsdb . MinBlockDuration )
2017-06-20 08:38:01 -07:00
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "storage.tsdb.max-block-duration" ,
2019-05-15 07:59:06 -07:00
"Maximum duration compacted blocks may span. For use in testing. (Defaults to 10% of the retention period.)" ) .
2017-12-24 04:13:48 -08:00
Hidden ( ) . PlaceHolder ( "<duration>" ) . SetValue ( & cfg . tsdb . MaxBlockDuration )
2017-06-20 08:38:01 -07:00
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "storage.tsdb.max-block-chunk-segment-size" ,
2021-04-15 01:55:01 -07:00
"The maximum size for a single chunk segment in a block. Example: 512MB" ) .
Hidden ( ) . PlaceHolder ( "<bytes>" ) . BytesVar ( & cfg . tsdb . MaxBlockChunkSegmentSize )
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "storage.tsdb.wal-segment-size" ,
2019-05-15 07:59:06 -07:00
"Size at which to split the tsdb WAL segment files. Example: 100MB" ) .
2019-01-03 06:13:21 -08:00
Hidden ( ) . PlaceHolder ( "<bytes>" ) . BytesVar ( & cfg . tsdb . WALSegmentSize )
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "storage.tsdb.retention" , "[DEPRECATED] How long to retain samples in storage. This flag has been deprecated, use \"storage.tsdb.retention.time\" instead." ) .
2019-02-19 03:53:43 -08:00
SetValue ( & oldFlagRetentionDuration )
2019-01-18 05:48:36 -08:00
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "storage.tsdb.retention.time" , "How long to retain samples in storage. When this flag is set it overrides \"storage.tsdb.retention\". If neither this flag nor \"storage.tsdb.retention\" nor \"storage.tsdb.retention.size\" is set, the retention time defaults to " + defaultRetentionString + ". Units Supported: y, w, d, h, m, s, ms." ) .
2019-02-19 03:53:43 -08:00
SetValue ( & newFlagRetentionDuration )
2019-01-18 05:48:36 -08:00
2022-01-12 15:55:57 -08:00
serverOnlyFlag ( a , "storage.tsdb.retention.size" , "Maximum number of bytes that can be stored for blocks. A unit is required, supported units: B, KB, MB, GB, TB, PB, EB. Ex: \"512MB\". Based on powers-of-2, so 1KB is 1024B." ) .
2019-02-19 03:53:43 -08:00
BytesVar ( & cfg . tsdb . MaxBytes )
2017-06-20 08:38:01 -07:00
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "storage.tsdb.no-lockfile" , "Do not create lockfile in data directory." ) .
2017-06-22 06:02:10 -07:00
Default ( "false" ) . BoolVar ( & cfg . tsdb . NoLockfile )
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "storage.tsdb.allow-overlapping-blocks" , "Allow overlapping blocks, which in turn enables vertical compaction and vertical query merge." ) .
2019-03-04 11:42:45 -08:00
Default ( "false" ) . BoolVar ( & cfg . tsdb . AllowOverlappingBlocks )
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "storage.tsdb.wal-compression" , "Compress the tsdb WAL." ) .
2021-07-27 02:08:20 -07:00
Hidden ( ) . Default ( "true" ) . BoolVar ( & cfg . tsdb . WALCompression )
2019-07-03 06:23:13 -07:00
2022-03-11 08:26:59 -08:00
serverOnlyFlag ( a , "storage.tsdb.head-chunks-write-queue-size" , "Size of the queue through which head chunks are written to the disk to be m-mapped, 0 disables the queue completely. Experimental." ) .
Default ( "0" ) . IntVar ( & cfg . tsdb . HeadChunksWriteQueueSize )
2021-11-04 02:08:53 -07:00
agentOnlyFlag ( a , "storage.agent.path" , "Base path for metrics storage." ) .
2021-11-04 03:08:01 -07:00
Default ( "data-agent/" ) . StringVar ( & cfg . agentStoragePath )
2021-10-29 08:25:05 -07:00
2021-11-04 02:08:53 -07:00
agentOnlyFlag ( a , "storage.agent.wal-segment-size" ,
2021-10-29 08:25:05 -07:00
"Size at which to split WAL segment files. Example: 100MB" ) .
Hidden ( ) . PlaceHolder ( "<bytes>" ) . BytesVar ( & cfg . agent . WALSegmentSize )
2021-11-04 02:08:53 -07:00
agentOnlyFlag ( a , "storage.agent.wal-compression" , "Compress the agent WAL." ) .
2021-10-29 08:25:05 -07:00
Default ( "true" ) . BoolVar ( & cfg . agent . WALCompression )
2021-11-04 02:08:53 -07:00
agentOnlyFlag ( a , "storage.agent.wal-truncate-frequency" ,
2021-10-29 08:25:05 -07:00
"The frequency at which to truncate the WAL and remove old data." ) .
Hidden ( ) . PlaceHolder ( "<duration>" ) . SetValue ( & cfg . agent . TruncateFrequency )
2021-11-04 02:08:53 -07:00
agentOnlyFlag ( a , "storage.agent.retention.min-time" ,
2021-10-29 08:25:05 -07:00
"Minimum age samples may be before being considered for deletion when the WAL is truncated" ) .
SetValue ( & cfg . agent . MinWALTime )
2021-11-04 02:08:53 -07:00
agentOnlyFlag ( a , "storage.agent.retention.max-time" ,
2021-10-29 08:25:05 -07:00
"Maximum age samples may be before being forcibly deleted when the WAL is truncated" ) .
SetValue ( & cfg . agent . MaxWALTime )
2021-11-11 08:45:25 -08:00
agentOnlyFlag ( a , "storage.agent.no-lockfile" , "Do not create lockfile in data directory." ) .
Default ( "false" ) . BoolVar ( & cfg . agent . NoLockfile )
2018-05-23 07:03:54 -07:00
a . Flag ( "storage.remote.flush-deadline" , "How long to wait flushing sample on shutdown or config reload." ) .
2018-05-24 07:40:24 -07:00
Default ( "1m" ) . PlaceHolder ( "<duration>" ) . SetValue ( & cfg . RemoteFlushDeadline )
2018-05-23 07:03:54 -07:00
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "storage.remote.read-sample-limit" , "Maximum overall number of samples to return via the remote read interface, in a single query. 0 means no limit. This limit is ignored for streamed response types." ) .
2018-09-25 12:07:34 -07:00
Default ( "5e7" ) . IntVar ( & cfg . web . RemoteReadSampleLimit )
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "storage.remote.read-concurrent-limit" , "Maximum number of concurrent remote read calls. 0 means no limit." ) .
2018-09-25 12:07:34 -07:00
Default ( "10" ) . IntVar ( & cfg . web . RemoteReadConcurrencyLimit )
2018-09-05 06:50:50 -07:00
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "storage.remote.read-max-bytes-in-frame" , "Maximum number of bytes in a single frame for streaming remote read response types before marshalling. Note that client might have limit on frame size as well. 1MB as recommended by protobuf by default." ) .
2019-08-19 13:16:10 -07:00
Default ( "1048576" ) . IntVar ( & cfg . web . RemoteReadBytesInFrame )
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "rules.alert.for-outage-tolerance" , "Max time to tolerate prometheus outage for restoring \"for\" state of alert." ) .
2018-08-02 03:18:24 -07:00
Default ( "1h" ) . SetValue ( & cfg . outageTolerance )
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "rules.alert.for-grace-period" , "Minimum duration between alert and restored \"for\" state. This is maintained only for alerts with configured \"for\" time greater than grace period." ) .
2018-08-02 03:18:24 -07:00
Default ( "10m" ) . SetValue ( & cfg . forGracePeriod )
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "rules.alert.resend-delay" , "Minimum amount of time to wait before resending an alert to Alertmanager." ) .
2018-08-27 09:41:42 -07:00
Default ( "1m" ) . SetValue ( & cfg . resendDelay )
2021-09-08 04:57:33 -07:00
a . Flag ( "scrape.adjust-timestamps" , "Adjust scrape timestamps by up to `scrape.timestamp-tolerance` to align them to the intended schedule. See https://github.com/prometheus/prometheus/issues/7846 for more context. Experimental. This flag will be removed in a future release." ) .
2020-10-07 09:25:52 -07:00
Hidden ( ) . Default ( "true" ) . BoolVar ( & scrape . AlignScrapeTimestamps )
2020-10-07 08:31:32 -07:00
2021-09-08 04:57:33 -07:00
a . Flag ( "scrape.timestamp-tolerance" , "Timestamp tolerance. See https://github.com/prometheus/prometheus/issues/7846 for more context. Experimental. This flag will be removed in a future release." ) .
Hidden ( ) . Default ( "2ms" ) . DurationVar ( & scrape . ScrapeTimestampTolerance )
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "alertmanager.notification-queue-capacity" , "The capacity of the queue for pending Alertmanager notifications." ) .
2017-06-20 08:38:01 -07:00
Default ( "10000" ) . IntVar ( & cfg . notifier . QueueCapacity )
2021-01-25 03:33:45 -08:00
// TODO: Remove in Prometheus 3.0.
alertmanagerTimeout := a . Flag ( "alertmanager.timeout" , "[DEPRECATED] This flag has no effect." ) . Hidden ( ) . String ( )
2017-06-20 08:38:01 -07:00
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "query.lookback-delta" , "The maximum lookback duration for retrieving metrics during expression evaluations and federation." ) .
2017-06-20 08:38:01 -07:00
Default ( "5m" ) . SetValue ( & cfg . lookbackDelta )
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "query.timeout" , "Maximum time a query may take before being aborted." ) .
2017-06-20 09:48:17 -07:00
Default ( "2m" ) . SetValue ( & cfg . queryTimeout )
2017-06-20 08:38:01 -07:00
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "query.max-concurrency" , "Maximum number of queries executed concurrently." ) .
2021-01-30 03:04:48 -08:00
Default ( "20" ) . IntVar ( & cfg . queryConcurrency )
2019-05-15 07:59:06 -07:00
2021-11-04 02:08:53 -07:00
serverOnlyFlag ( a , "query.max-samples" , "Maximum number of samples a single query can load into memory. Note that queries will fail if they try to load more samples than this into memory, so this also limits the number of samples a query can return." ) .
2018-10-02 04:59:19 -07:00
Default ( "50000000" ) . IntVar ( & cfg . queryMaxSamples )
2017-06-20 08:38:01 -07:00
2022-02-01 18:07:23 -08:00
a . Flag ( "enable-feature" , "Comma separated feature names to enable. Valid options: agent, exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-at-modifier, promql-negative-offset, promql-per-step-stats, remote-write-receiver (DEPRECATED), extra-scrape-metrics, new-service-discovery-manager. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details." ) .
2021-01-20 02:57:39 -08:00
Default ( "" ) . StringsVar ( & cfg . featureList )
2018-11-23 05:22:40 -08:00
promlogflag . AddFlags ( a , & cfg . promlogConfig )
2017-09-08 09:34:20 -07:00
2017-06-20 09:48:17 -07:00
_ , err := a . Parse ( os . Args [ 1 : ] )
if err != nil {
2017-10-09 07:25:50 -07:00
fmt . Fprintln ( os . Stderr , errors . Wrapf ( err , "Error parsing commandline arguments" ) )
2017-06-20 08:38:01 -07:00
a . Usage ( os . Args [ 1 : ] )
os . Exit ( 2 )
}
2017-06-20 09:48:17 -07:00
2019-02-19 03:53:43 -08:00
logger := promlog . New ( & cfg . promlogConfig )
2021-01-20 02:57:39 -08:00
if err := cfg . setFeatureListOptions ( logger ) ; err != nil {
fmt . Fprintln ( os . Stderr , errors . Wrapf ( err , "Error parsing feature list" ) )
os . Exit ( 1 )
}
2021-11-02 06:03:35 -07:00
if agentMode && len ( serverOnlyFlags ) > 0 {
fmt . Fprintf ( os . Stderr , "The following flag(s) can not be used in agent mode: %q" , serverOnlyFlags )
os . Exit ( 3 )
}
if ! agentMode && len ( agentOnlyFlags ) > 0 {
fmt . Fprintf ( os . Stderr , "The following flag(s) can only be used in agent mode: %q" , agentOnlyFlags )
os . Exit ( 3 )
}
2021-11-05 08:50:10 -07:00
localStoragePath := cfg . serverStoragePath
if agentMode {
localStoragePath = cfg . agentStoragePath
}
2017-06-20 09:48:17 -07:00
cfg . web . ExternalURL , err = computeExternalURL ( cfg . prometheusURL , cfg . web . ListenAddress )
if err != nil {
fmt . Fprintln ( os . Stderr , errors . Wrapf ( err , "parse external URL %q" , cfg . prometheusURL ) )
2017-06-20 08:38:01 -07:00
os . Exit ( 2 )
}
2019-01-17 07:01:06 -08:00
cfg . web . CORSOrigin , err = compileCORSRegexString ( cfg . corsRegexString )
if err != nil {
fmt . Fprintln ( os . Stderr , errors . Wrapf ( err , "could not compile CORS regex string %q" , cfg . corsRegexString ) )
os . Exit ( 2 )
}
2021-01-25 03:33:45 -08:00
if * alertmanagerTimeout != "" {
level . Warn ( logger ) . Log ( "msg" , "The flag --alertmanager.timeout has no effect and will be removed in the future." )
}
2020-06-21 08:56:59 -07:00
// Throw error for invalid config before starting other components.
2021-07-19 21:52:57 -07:00
var cfgFile * config . Config
2021-10-29 16:41:40 -07:00
if cfgFile , err = config . LoadFile ( cfg . configFile , agentMode , false , log . NewNopLogger ( ) ) ; err != nil {
2022-02-16 08:43:15 -08:00
absPath , pathErr := filepath . Abs ( cfg . configFile )
if pathErr != nil {
absPath = cfg . configFile
}
level . Error ( logger ) . Log ( "msg" , fmt . Sprintf ( "Error loading config (--config.file=%s)" , cfg . configFile ) , "file" , absPath , "err" , err )
2020-06-21 08:56:59 -07:00
os . Exit ( 2 )
}
2021-07-19 21:52:57 -07:00
if cfg . tsdb . EnableExemplarStorage {
if cfgFile . StorageConfig . ExemplarsConfig == nil {
cfgFile . StorageConfig . ExemplarsConfig = & config . DefaultExemplarsConfig
}
cfg . tsdb . MaxExemplars = int64 ( cfgFile . StorageConfig . ExemplarsConfig . MaxExemplars )
}
2020-10-12 12:30:59 -07:00
// Now that the validity of the config is established, set the config
// success metrics accordingly, although the config isn't really loaded
// yet. This will happen later (including setting these metrics again),
// but if we don't do it now, the metrics will stay at zero until the
// startup procedure is complete, which might take long enough to
// trigger alerts about an invalid config.
configSuccess . Set ( 1 )
configSuccessTime . SetToCurrentTime ( )
2020-06-21 08:56:59 -07:00
2017-06-20 09:48:17 -07:00
cfg . web . ReadTimeout = time . Duration ( cfg . webTimeout )
// Default -web.route-prefix to path of -web.external-url.
if cfg . web . RoutePrefix == "" {
cfg . web . RoutePrefix = cfg . web . ExternalURL . Path
}
// RoutePrefix must always be at least '/'.
cfg . web . RoutePrefix = "/" + strings . Trim ( cfg . web . RoutePrefix , "/" )
2015-06-15 03:36:32 -07:00
2021-10-29 08:25:05 -07:00
if ! agentMode {
// Time retention settings.
2019-02-19 03:53:43 -08:00
if oldFlagRetentionDuration != 0 {
level . Warn ( logger ) . Log ( "deprecation_notice" , "'storage.tsdb.retention' flag is deprecated use 'storage.tsdb.retention.time' instead." )
cfg . tsdb . RetentionDuration = oldFlagRetentionDuration
}
2019-01-18 05:48:36 -08:00
2019-02-19 03:53:43 -08:00
// When the new flag is set it takes precedence.
if newFlagRetentionDuration != 0 {
cfg . tsdb . RetentionDuration = newFlagRetentionDuration
}
2019-01-18 06:48:06 -08:00
2019-02-19 03:53:43 -08:00
if cfg . tsdb . RetentionDuration == 0 && cfg . tsdb . MaxBytes == 0 {
cfg . tsdb . RetentionDuration = defaultRetentionDuration
2020-04-11 01:22:18 -07:00
level . Info ( logger ) . Log ( "msg" , "No time or size retention was set so using the default time retention" , "duration" , defaultRetentionDuration )
2019-02-19 03:53:43 -08:00
}
2019-01-18 06:48:06 -08:00
2019-03-11 10:18:57 -07:00
// Check for overflows. This limits our max retention to 100y.
2019-02-19 03:53:43 -08:00
if cfg . tsdb . RetentionDuration < 0 {
2019-03-11 10:18:57 -07:00
y , err := model . ParseDuration ( "100y" )
if err != nil {
panic ( err )
}
cfg . tsdb . RetentionDuration = y
2020-04-11 01:22:18 -07:00
level . Warn ( logger ) . Log ( "msg" , "Time retention value is too high. Limiting to: " + y . String ( ) )
2019-01-18 06:48:06 -08:00
}
2019-02-19 03:53:43 -08:00
2021-10-29 08:25:05 -07:00
// Max block size settings.
2019-02-19 03:53:43 -08:00
if cfg . tsdb . MaxBlockDuration == 0 {
maxBlockDuration , err := model . ParseDuration ( "31d" )
if err != nil {
panic ( err )
}
// When the time retention is set and not too big use to define the max block duration.
if cfg . tsdb . RetentionDuration != 0 && cfg . tsdb . RetentionDuration / 10 < maxBlockDuration {
maxBlockDuration = cfg . tsdb . RetentionDuration / 10
}
2019-01-18 06:48:06 -08:00
2019-02-19 03:53:43 -08:00
cfg . tsdb . MaxBlockDuration = maxBlockDuration
2019-01-18 06:48:06 -08:00
}
2017-06-20 09:48:17 -07:00
}
2015-09-01 10:18:39 -07:00
2020-07-22 06:39:51 -07:00
noStepSubqueryInterval := & safePromQLNoStepSubqueryInterval { }
noStepSubqueryInterval . Set ( config . DefaultGlobalConfig . EvaluationInterval )
2017-06-20 09:48:17 -07:00
2018-12-04 06:01:12 -08:00
// Above level 6, the k8s client would log bearer tokens in clear-text.
2019-01-03 13:44:29 -08:00
klog . ClampLevel ( 6 )
2021-07-27 04:07:39 -07:00
klog . SetLogger ( log . With ( logger , "component" , "k8s_client_runtime" ) )
2020-10-06 05:54:12 -07:00
klogv2 . ClampLevel ( 6 )
2021-07-27 04:07:39 -07:00
klogv2 . SetLogger ( log . With ( logger , "component" , "k8s_client_runtime" ) )
2017-06-16 03:22:44 -07:00
2017-10-23 00:49:28 -07:00
level . Info ( logger ) . Log ( "msg" , "Starting Prometheus" , "version" , version . Info ( ) )
2020-10-06 12:42:56 -07:00
if bits . UintSize < 64 {
level . Warn ( logger ) . Log ( "msg" , "This Prometheus binary has not been compiled for a 64-bit architecture. Due to virtual memory constraints of 32-bit systems, it is highly recommended to switch to a 64-bit binary of Prometheus." , "GOARCH" , runtime . GOARCH )
}
2017-08-11 11:45:52 -07:00
level . Info ( logger ) . Log ( "build_context" , version . BuildContext ( ) )
2018-08-22 03:41:11 -07:00
level . Info ( logger ) . Log ( "host_details" , prom_runtime . Uname ( ) )
level . Info ( logger ) . Log ( "fd_limits" , prom_runtime . FdLimits ( ) )
2020-03-23 07:47:11 -07:00
level . Info ( logger ) . Log ( "vm_limits" , prom_runtime . VMLimits ( ) )
2016-05-05 04:46:51 -07:00
2017-09-18 03:32:17 -07:00
var (
2021-06-05 07:29:32 -07:00
localStorage = & readyStorage { stats : tsdb . NewDBStats ( ) }
2020-11-19 07:23:03 -08:00
scraper = & readyScrapeManager { }
2021-11-05 08:50:10 -07:00
remoteStorage = remote . NewStorage ( log . With ( logger , "component" , "remote" ) , prometheus . DefaultRegisterer , localStorage . StartTime , localStoragePath , time . Duration ( cfg . RemoteFlushDeadline ) , scraper )
2017-09-18 03:32:17 -07:00
fanoutStorage = storage . NewFanout ( logger , localStorage , remoteStorage )
2017-08-11 11:45:52 -07:00
)
2016-09-19 13:47:51 -07:00
2015-06-25 16:32:44 -07:00
var (
2017-12-01 04:59:24 -08:00
ctxWeb , cancelWeb = context . WithCancel ( context . Background ( ) )
ctxRule = context . Background ( )
2017-11-26 07:15:15 -08:00
2018-12-18 03:13:18 -08:00
notifierManager = notifier . NewManager ( & cfg . notifier , log . With ( logger , "component" , "notifier" ) )
2018-01-25 15:32:36 -08:00
ctxScrape , cancelScrape = context . WithCancel ( context . Background ( ) )
ctxNotify , cancelNotify = context . WithCancel ( context . Background ( ) )
2021-10-20 01:15:54 -07:00
discoveryManagerScrape discoveryManager
discoveryManagerNotify discoveryManager
)
if cfg . enableNewSDManager {
discovery . RegisterMetrics ( )
discoveryManagerScrape = discovery . NewManager ( ctxScrape , log . With ( logger , "component" , "discovery manager scrape" ) , discovery . Name ( "scrape" ) )
discoveryManagerNotify = discovery . NewManager ( ctxNotify , log . With ( logger , "component" , "discovery manager notify" ) , discovery . Name ( "notify" ) )
} else {
legacymanager . RegisterMetrics ( )
discoveryManagerScrape = legacymanager . NewManager ( ctxScrape , log . With ( logger , "component" , "discovery manager scrape" ) , legacymanager . Name ( "scrape" ) )
discoveryManagerNotify = legacymanager . NewManager ( ctxNotify , log . With ( logger , "component" , "discovery manager notify" ) , legacymanager . Name ( "notify" ) )
}
2018-01-25 15:32:36 -08:00
2021-10-20 01:15:54 -07:00
var (
2022-01-25 02:08:04 -08:00
scrapeManager = scrape . NewManager ( & cfg . scrape , log . With ( logger , "component" , "scrape manager" ) , fanoutStorage )
tracingManager = tracing . NewManager ( logger )
2018-01-09 08:44:23 -08:00
2021-10-29 08:25:05 -07:00
queryEngine * promql . Engine
ruleManager * rules . Manager
)
if ! agentMode {
opts := promql . EngineOpts {
2020-07-22 06:39:51 -07:00
Logger : log . With ( logger , "component" , "query engine" ) ,
Reg : prometheus . DefaultRegisterer ,
MaxSamples : cfg . queryMaxSamples ,
Timeout : time . Duration ( cfg . queryTimeout ) ,
2021-11-05 08:50:10 -07:00
ActiveQueryTracker : promql . NewActiveQueryTracker ( localStoragePath , cfg . queryConcurrency , log . With ( logger , "component" , "activeQueryTracker" ) ) ,
2020-07-22 06:39:51 -07:00
LookbackDelta : time . Duration ( cfg . lookbackDelta ) ,
NoStepSubqueryIntervalFn : noStepSubqueryInterval . Get ,
2022-01-11 08:01:02 -08:00
// EnableAtModifier and EnableNegativeOffset have to be
// always on for regular PromQL as of Prometheus v2.33.
EnableAtModifier : true ,
EnableNegativeOffset : true ,
2022-02-01 18:07:23 -08:00
EnablePerStepStats : cfg . enablePerStepStats ,
2018-10-02 04:59:19 -07:00
}
2019-07-31 08:12:43 -07:00
2018-10-02 04:59:19 -07:00
queryEngine = promql . NewEngine ( opts )
2018-01-09 08:44:23 -08:00
ruleManager = rules . NewManager ( & rules . ManagerOptions {
2018-08-02 03:18:24 -07:00
Appendable : fanoutStorage ,
2020-06-26 11:06:36 -07:00
Queryable : localStorage ,
2018-08-02 03:18:24 -07:00
QueryFunc : rules . EngineQueryFunc ( queryEngine , fanoutStorage ) ,
2018-12-18 03:13:18 -08:00
NotifyFunc : sendAlerts ( notifierManager , cfg . web . ExternalURL . String ( ) ) ,
2018-08-02 03:18:24 -07:00
Context : ctxRule ,
ExternalURL : cfg . web . ExternalURL ,
Registerer : prometheus . DefaultRegisterer ,
Logger : log . With ( logger , "component" , "rule manager" ) ,
OutageTolerance : time . Duration ( cfg . outageTolerance ) ,
ForGracePeriod : time . Duration ( cfg . forGracePeriod ) ,
2018-08-27 09:41:42 -07:00
ResendDelay : time . Duration ( cfg . resendDelay ) ,
2017-11-26 07:15:15 -08:00
} )
2021-10-29 08:25:05 -07:00
}
2015-06-25 16:32:44 -07:00
2020-11-19 07:23:03 -08:00
scraper . Set ( scrapeManager )
2017-11-26 07:15:15 -08:00
cfg . web . Context = ctxWeb
2020-02-18 03:25:36 -08:00
cfg . web . TSDBRetentionDuration = cfg . tsdb . RetentionDuration
cfg . web . TSDBMaxBytes = cfg . tsdb . MaxBytes
2021-11-05 08:50:10 -07:00
cfg . web . TSDBDir = localStoragePath
2020-04-29 09:16:14 -07:00
cfg . web . LocalStorage = localStorage
2017-09-18 03:32:17 -07:00
cfg . web . Storage = fanoutStorage
2021-03-16 02:47:45 -07:00
cfg . web . ExemplarStorage = localStorage
2016-09-15 15:58:06 -07:00
cfg . web . QueryEngine = queryEngine
2017-11-25 05:13:54 -08:00
cfg . web . ScrapeManager = scrapeManager
2016-09-15 15:58:06 -07:00
cfg . web . RuleManager = ruleManager
2018-12-18 03:13:18 -08:00
cfg . web . Notifier = notifierManager
2020-02-09 15:58:23 -08:00
cfg . web . LookbackDelta = time . Duration ( cfg . lookbackDelta )
2021-10-29 08:25:05 -07:00
cfg . web . IsAgent = agentMode
2015-06-15 03:36:32 -07:00
2016-09-15 15:58:06 -07:00
cfg . web . Version = & web . PrometheusVersion {
2016-05-05 04:46:51 -07:00
Version : version . Version ,
Revision : version . Revision ,
Branch : version . Branch ,
BuildUser : version . BuildUser ,
BuildDate : version . BuildDate ,
GoVersion : version . GoVersion ,
}
2016-09-15 15:58:06 -07:00
cfg . web . Flags = map [ string ] string { }
api: Added v1/status/flags endpoint. (#3864)
Endpoint URL: /api/v1/status/flags
Example Output:
```json
{
"status": "success",
"data": {
"alertmanager.notification-queue-capacity": "10000",
"alertmanager.timeout": "10s",
"completion-bash": "false",
"completion-script-bash": "false",
"completion-script-zsh": "false",
"config.file": "my_cool_prometheus.yaml",
"help": "false",
"help-long": "false",
"help-man": "false",
"log.level": "info",
"query.lookback-delta": "5m",
"query.max-concurrency": "20",
"query.timeout": "2m",
"storage.tsdb.max-block-duration": "36h",
"storage.tsdb.min-block-duration": "2h",
"storage.tsdb.no-lockfile": "false",
"storage.tsdb.path": "data/",
"storage.tsdb.retention": "15d",
"version": "false",
"web.console.libraries": "console_libraries",
"web.console.templates": "consoles",
"web.enable-admin-api": "false",
"web.enable-lifecycle": "false",
"web.external-url": "",
"web.listen-address": "0.0.0.0:9090",
"web.max-connections": "512",
"web.read-timeout": "5m",
"web.route-prefix": "/",
"web.user-assets": ""
}
}
```
Signed-off-by: Bartek Plotka <bwplotka@gmail.com>
2018-02-21 00:49:02 -08:00
// Exclude kingpin default flags to expose only Prometheus ones.
boilerplateFlags := kingpin . New ( "" , "" ) . Version ( "" )
2017-06-20 08:38:01 -07:00
for _ , f := range a . Model ( ) . Flags {
api: Added v1/status/flags endpoint. (#3864)
Endpoint URL: /api/v1/status/flags
Example Output:
```json
{
"status": "success",
"data": {
"alertmanager.notification-queue-capacity": "10000",
"alertmanager.timeout": "10s",
"completion-bash": "false",
"completion-script-bash": "false",
"completion-script-zsh": "false",
"config.file": "my_cool_prometheus.yaml",
"help": "false",
"help-long": "false",
"help-man": "false",
"log.level": "info",
"query.lookback-delta": "5m",
"query.max-concurrency": "20",
"query.timeout": "2m",
"storage.tsdb.max-block-duration": "36h",
"storage.tsdb.min-block-duration": "2h",
"storage.tsdb.no-lockfile": "false",
"storage.tsdb.path": "data/",
"storage.tsdb.retention": "15d",
"version": "false",
"web.console.libraries": "console_libraries",
"web.console.templates": "consoles",
"web.enable-admin-api": "false",
"web.enable-lifecycle": "false",
"web.external-url": "",
"web.listen-address": "0.0.0.0:9090",
"web.max-connections": "512",
"web.read-timeout": "5m",
"web.route-prefix": "/",
"web.user-assets": ""
}
}
```
Signed-off-by: Bartek Plotka <bwplotka@gmail.com>
2018-02-21 00:49:02 -08:00
if boilerplateFlags . GetFlag ( f . Name ) != nil {
continue
}
2016-09-15 15:58:06 -07:00
cfg . web . Flags [ f . Name ] = f . Value . String ( )
2017-06-20 08:38:01 -07:00
}
2016-09-15 15:58:06 -07:00
2019-07-31 08:12:43 -07:00
// Depends on cfg.web.ScrapeManager so needs to be after cfg.web.ScrapeManager = scrapeManager.
2017-11-26 07:15:15 -08:00
webHandler := web . New ( log . With ( logger , "component" , "web" ) , & cfg . web )
2017-10-06 03:22:19 -07:00
// Monitor outgoing connections on default transport with conntrack.
http . DefaultTransport . ( * http . Transport ) . DialContext = conntrack . NewDialContextFunc (
conntrack . DialWithTracing ( ) ,
)
2021-05-30 20:35:26 -07:00
// This is passed to ruleManager.Update().
2021-10-22 01:06:44 -07:00
externalURL := cfg . web . ExternalURL . String ( )
2021-05-30 20:35:26 -07:00
2020-08-06 12:48:52 -07:00
reloaders := [ ] reloader {
{
2021-07-19 21:52:57 -07:00
name : "db_storage" ,
reloader : localStorage . ApplyConfig ,
} , {
2020-08-06 12:48:52 -07:00
name : "remote_storage" ,
reloader : remoteStorage . ApplyConfig ,
} , {
name : "web_handler" ,
reloader : webHandler . ApplyConfig ,
} , {
name : "query_engine" ,
reloader : func ( cfg * config . Config ) error {
2021-10-29 08:25:05 -07:00
if agentMode {
// No-op in Agent mode.
return nil
}
2020-08-06 12:48:52 -07:00
if cfg . GlobalConfig . QueryLogFile == "" {
queryEngine . SetQueryLogger ( nil )
return nil
}
2020-01-08 05:28:43 -08:00
2020-08-06 12:48:52 -07:00
l , err := logging . NewJSONFileLogger ( cfg . GlobalConfig . QueryLogFile )
2017-11-23 23:22:57 -08:00
if err != nil {
2020-08-06 12:48:52 -07:00
return err
2017-11-23 23:22:57 -08:00
}
2020-08-06 12:48:52 -07:00
queryEngine . SetQueryLogger ( l )
return nil
} ,
} , {
// The Scrape and notifier managers need to reload before the Discovery manager as
// they need to read the most updated config when receiving the new targets list.
name : "scrape" ,
reloader : scrapeManager . ApplyConfig ,
} , {
name : "scrape_sd" ,
reloader : func ( cfg * config . Config ) error {
2020-08-20 05:48:26 -07:00
c := make ( map [ string ] discovery . Configs )
2020-08-06 12:48:52 -07:00
for _ , v := range cfg . ScrapeConfigs {
2020-08-20 05:48:26 -07:00
c [ v . JobName ] = v . ServiceDiscoveryConfigs
2020-08-06 12:48:52 -07:00
}
return discoveryManagerScrape . ApplyConfig ( c )
} ,
} , {
name : "notify" ,
reloader : notifierManager . ApplyConfig ,
} , {
name : "notify_sd" ,
reloader : func ( cfg * config . Config ) error {
2020-08-20 05:48:26 -07:00
c := make ( map [ string ] discovery . Configs )
2020-08-06 12:48:52 -07:00
for k , v := range cfg . AlertingConfig . AlertmanagerConfigs . ToMap ( ) {
2020-08-20 05:48:26 -07:00
c [ k ] = v . ServiceDiscoveryConfigs
2020-08-06 12:48:52 -07:00
}
return discoveryManagerNotify . ApplyConfig ( c )
} ,
} , {
name : "rules" ,
reloader : func ( cfg * config . Config ) error {
2021-10-29 08:25:05 -07:00
if agentMode {
// No-op in Agent mode
return nil
}
2020-08-06 12:48:52 -07:00
// Get all rule files matching the configuration paths.
var files [ ] string
for _ , pat := range cfg . RuleFiles {
fs , err := filepath . Glob ( pat )
if err != nil {
// The only error can be a bad pattern.
return errors . Wrapf ( err , "error retrieving rule files for %s" , pat )
}
files = append ( files , fs ... )
}
return ruleManager . Update (
time . Duration ( cfg . GlobalConfig . EvaluationInterval ) ,
files ,
cfg . GlobalConfig . ExternalLabels ,
2021-05-30 20:35:26 -07:00
externalURL ,
2022-03-28 17:16:46 -07:00
nil ,
2020-08-06 12:48:52 -07:00
)
} ,
2022-01-25 02:08:04 -08:00
} , {
name : "tracing" ,
reloader : tracingManager . ApplyConfig ,
2017-11-23 06:48:14 -08:00
} ,
2015-06-15 03:36:32 -07:00
}
2017-11-11 04:06:13 -08:00
prometheus . MustRegister ( configSuccess )
prometheus . MustRegister ( configSuccessTime )
2015-06-15 03:36:32 -07:00
2017-09-18 03:32:17 -07:00
// Start all components while we wait for TSDB to open but only load
// initial config and mark ourselves as ready after it completed.
dbOpen := make ( chan struct { } )
2018-01-17 10:14:24 -08:00
// sync.Once is used to make sure the channel is closed only once, since it can be closed at different execution stages (SIGTERM or when the config is loaded).
type closeOnce struct {
C chan struct { }
once sync . Once
Close func ( )
}
// Wait until the server is ready to handle reloading.
reloadReady := & closeOnce {
C : make ( chan struct { } ) ,
}
reloadReady . Close = func ( ) {
reloadReady . once . Do ( func ( ) {
close ( reloadReady . C )
} )
}
2017-11-11 04:06:13 -08:00
2020-12-03 06:33:16 -08:00
listener , err := webHandler . Listener ( )
if err != nil {
level . Error ( logger ) . Log ( "msg" , "Unable to start web listener" , "err" , err )
os . Exit ( 1 )
}
2021-01-13 12:37:01 -08:00
err = toolkit_web . Validate ( * webConfig )
2020-12-25 03:45:31 -08:00
if err != nil {
level . Error ( logger ) . Log ( "msg" , "Unable to validate web configuration file" , "err" , err )
os . Exit ( 1 )
}
2019-04-19 05:55:28 -07:00
var g run . Group
2017-11-11 04:06:13 -08:00
{
2018-04-01 11:19:30 -07:00
// Termination handler.
2018-11-14 00:56:42 -08:00
term := make ( chan os . Signal , 1 )
2017-11-11 04:06:13 -08:00
signal . Notify ( term , os . Interrupt , syscall . SIGTERM )
cancel := make ( chan struct { } )
g . Add (
func ( ) error {
2018-01-17 10:14:24 -08:00
// Don't forget to release the reloadReady channel so that waiting blocks can exit normally.
2017-11-11 04:06:13 -08:00
select {
case <- term :
level . Warn ( logger ) . Log ( "msg" , "Received SIGTERM, exiting gracefully..." )
2018-01-17 10:14:24 -08:00
reloadReady . Close ( )
2017-11-11 04:06:13 -08:00
case <- webHandler . Quit ( ) :
level . Warn ( logger ) . Log ( "msg" , "Received termination request via web service, exiting gracefully..." )
case <- cancel :
2018-01-17 10:14:24 -08:00
reloadReady . Close ( )
2017-11-11 04:06:13 -08:00
}
return nil
} ,
func ( err error ) {
close ( cancel )
} ,
2017-09-18 03:32:17 -07:00
)
2017-11-11 04:06:13 -08:00
}
2017-11-26 07:15:15 -08:00
{
2018-04-01 11:19:30 -07:00
// Scrape discovery manager.
2017-12-30 09:27:50 -08:00
g . Add (
func ( ) error {
2018-01-25 15:32:36 -08:00
err := discoveryManagerScrape . Run ( )
2017-12-30 09:27:50 -08:00
level . Info ( logger ) . Log ( "msg" , "Scrape discovery manager stopped" )
return err
} ,
func ( err error ) {
level . Info ( logger ) . Log ( "msg" , "Stopping scrape discovery manager..." )
2018-01-25 15:32:36 -08:00
cancelScrape ( )
2017-12-30 09:27:50 -08:00
} ,
)
}
{
2018-04-01 11:19:30 -07:00
// Notify discovery manager.
2017-11-26 07:15:15 -08:00
g . Add (
func ( ) error {
2018-01-25 15:32:36 -08:00
err := discoveryManagerNotify . Run ( )
2017-12-30 09:27:50 -08:00
level . Info ( logger ) . Log ( "msg" , "Notify discovery manager stopped" )
2017-11-26 07:15:15 -08:00
return err
} ,
func ( err error ) {
2017-12-30 09:27:50 -08:00
level . Info ( logger ) . Log ( "msg" , "Stopping notify discovery manager..." )
2018-01-25 15:32:36 -08:00
cancelNotify ( )
2017-11-26 07:15:15 -08:00
} ,
)
}
{
2018-04-01 11:19:30 -07:00
// Scrape manager.
2017-11-26 07:15:15 -08:00
g . Add (
func ( ) error {
2018-01-17 04:02:13 -08:00
// When the scrape manager receives a new targets list
2018-01-17 10:14:24 -08:00
// it needs to read a valid config for each job.
// It depends on the config being in sync with the discovery manager so
// we wait until the config is fully loaded.
2018-02-25 23:58:10 -08:00
<- reloadReady . C
2018-01-17 04:02:13 -08:00
2017-12-30 09:27:50 -08:00
err := scrapeManager . Run ( discoveryManagerScrape . SyncCh ( ) )
2017-11-26 07:15:15 -08:00
level . Info ( logger ) . Log ( "msg" , "Scrape manager stopped" )
return err
} ,
func ( err error ) {
// Scrape manager needs to be stopped before closing the local TSDB
// so that it doesn't try to write samples to a closed storage.
level . Info ( logger ) . Log ( "msg" , "Stopping scrape manager..." )
scrapeManager . Stop ( )
} ,
)
}
2022-01-25 02:08:04 -08:00
{
// Tracing manager.
g . Add (
func ( ) error {
<- reloadReady . C
tracingManager . Run ( )
return nil
} ,
func ( err error ) {
tracingManager . Stop ( )
} ,
)
}
2017-11-11 04:06:13 -08:00
{
2018-04-01 11:19:30 -07:00
// Reload handler.
2017-11-11 04:06:13 -08:00
// Make sure that sighup handler is registered with a redirect to the channel before the potentially
// long and synchronous tsdb init.
2018-11-14 00:56:42 -08:00
hup := make ( chan os . Signal , 1 )
2017-11-11 04:06:13 -08:00
signal . Notify ( hup , syscall . SIGHUP )
cancel := make ( chan struct { } )
g . Add (
func ( ) error {
2018-02-25 23:58:10 -08:00
<- reloadReady . C
2015-06-15 03:36:32 -07:00
2017-11-11 04:06:13 -08:00
for {
select {
case <- hup :
2021-07-19 21:52:57 -07:00
if err := reloadConfig ( cfg . configFile , cfg . enableExpandExternalLabels , cfg . tsdb . EnableExemplarStorage , logger , noStepSubqueryInterval , reloaders ... ) ; err != nil {
2017-11-11 04:06:13 -08:00
level . Error ( logger ) . Log ( "msg" , "Error reloading config" , "err" , err )
}
case rc := <- webHandler . Reload ( ) :
2021-07-19 21:52:57 -07:00
if err := reloadConfig ( cfg . configFile , cfg . enableExpandExternalLabels , cfg . tsdb . EnableExemplarStorage , logger , noStepSubqueryInterval , reloaders ... ) ; err != nil {
2017-11-11 04:06:13 -08:00
level . Error ( logger ) . Log ( "msg" , "Error reloading config" , "err" , err )
rc <- err
} else {
rc <- nil
}
case <- cancel :
return nil
}
}
} ,
func ( err error ) {
2018-07-04 05:41:16 -07:00
// Wait for any in-progress reloads to complete to avoid
// reloading things after they have been shutdown.
cancel <- struct { } { }
2017-11-11 04:06:13 -08:00
} ,
)
2017-09-18 03:32:17 -07:00
}
2017-11-11 04:06:13 -08:00
{
2018-04-01 11:19:30 -07:00
// Initial configuration loading.
2017-11-11 04:06:13 -08:00
cancel := make ( chan struct { } )
g . Add (
func ( ) error {
select {
case <- dbOpen :
// In case a shutdown is initiated before the dbOpen is released
case <- cancel :
2018-01-17 10:14:24 -08:00
reloadReady . Close ( )
2017-11-11 04:06:13 -08:00
return nil
}
2017-09-18 03:32:17 -07:00
2021-07-19 21:52:57 -07:00
if err := reloadConfig ( cfg . configFile , cfg . enableExpandExternalLabels , cfg . tsdb . EnableExemplarStorage , logger , noStepSubqueryInterval , reloaders ... ) ; err != nil {
2019-03-25 16:01:12 -07:00
return errors . Wrapf ( err , "error loading config from %q" , cfg . configFile )
2017-11-11 04:06:13 -08:00
}
2017-09-18 03:32:17 -07:00
2018-01-17 10:14:24 -08:00
reloadReady . Close ( )
2018-01-17 05:06:56 -08:00
2017-11-11 04:06:13 -08:00
webHandler . Ready ( )
2018-01-17 10:14:24 -08:00
level . Info ( logger ) . Log ( "msg" , "Server is ready to receive web requests." )
2017-11-11 04:06:13 -08:00
<- cancel
return nil
} ,
func ( err error ) {
close ( cancel )
} ,
)
}
2021-10-29 08:25:05 -07:00
if ! agentMode {
2018-07-04 05:41:16 -07:00
// Rule manager.
g . Add (
func ( ) error {
<- reloadReady . C
ruleManager . Run ( )
return nil
} ,
func ( err error ) {
ruleManager . Stop ( )
} ,
)
2021-10-29 08:25:05 -07:00
2018-04-01 11:19:30 -07:00
// TSDB.
2020-02-11 08:34:09 -08:00
opts := cfg . tsdb . ToTSDBOptions ( )
2017-11-11 04:06:13 -08:00
cancel := make ( chan struct { } )
g . Add (
func ( ) error {
level . Info ( logger ) . Log ( "msg" , "Starting TSDB ..." )
2019-01-03 06:13:21 -08:00
if cfg . tsdb . WALSegmentSize != 0 {
if cfg . tsdb . WALSegmentSize < 10 * 1024 * 1024 || cfg . tsdb . WALSegmentSize > 256 * 1024 * 1024 {
return errors . New ( "flag 'storage.tsdb.wal-segment-size' must be set between 10MB and 256MB" )
}
}
2021-04-15 01:55:01 -07:00
if cfg . tsdb . MaxBlockChunkSegmentSize != 0 {
if cfg . tsdb . MaxBlockChunkSegmentSize < 1024 * 1024 {
return errors . New ( "flag 'storage.tsdb.max-block-chunk-segment-size' must be set over 1MB" )
}
}
2021-06-05 07:29:32 -07:00
2021-11-05 08:50:10 -07:00
db , err := openDBWithMetrics ( localStoragePath , logger , prometheus . DefaultRegisterer , & opts , localStorage . getStats ( ) )
2017-11-11 04:06:13 -08:00
if err != nil {
2019-03-25 16:01:12 -07:00
return errors . Wrapf ( err , "opening storage failed" )
2017-11-11 04:06:13 -08:00
}
2020-02-14 10:48:55 -08:00
2021-11-05 08:50:10 -07:00
switch fsType := prom_runtime . Statfs ( localStoragePath ) ; fsType {
2020-07-30 02:22:44 -07:00
case "NFS_SUPER_MAGIC" :
level . Warn ( logger ) . Log ( "fs_type" , fsType , "msg" , "This filesystem is not supported and may lead to data corruption and data loss. Please carefully read https://prometheus.io/docs/prometheus/latest/storage/ to learn more about supported filesystems." )
default :
level . Info ( logger ) . Log ( "fs_type" , fsType )
}
2017-11-11 04:06:13 -08:00
level . Info ( logger ) . Log ( "msg" , "TSDB started" )
2019-02-19 03:53:43 -08:00
level . Debug ( logger ) . Log ( "msg" , "TSDB options" ,
"MinBlockDuration" , cfg . tsdb . MinBlockDuration ,
"MaxBlockDuration" , cfg . tsdb . MaxBlockDuration ,
"MaxBytes" , cfg . tsdb . MaxBytes ,
"NoLockfile" , cfg . tsdb . NoLockfile ,
"RetentionDuration" , cfg . tsdb . RetentionDuration ,
"WALSegmentSize" , cfg . tsdb . WALSegmentSize ,
2019-03-04 11:42:45 -08:00
"AllowOverlappingBlocks" , cfg . tsdb . AllowOverlappingBlocks ,
2019-07-03 06:23:13 -07:00
"WALCompression" , cfg . tsdb . WALCompression ,
2019-02-19 03:53:43 -08:00
)
2017-11-11 04:06:13 -08:00
startTimeMargin := int64 ( 2 * time . Duration ( cfg . tsdb . MinBlockDuration ) . Seconds ( ) * 1000 )
localStorage . Set ( db , startTimeMargin )
close ( dbOpen )
<- cancel
return nil
} ,
func ( err error ) {
if err := fanoutStorage . Close ( ) ; err != nil {
level . Error ( logger ) . Log ( "msg" , "Error stopping storage" , "err" , err )
}
close ( cancel )
} ,
)
}
2021-10-29 08:25:05 -07:00
if agentMode {
// WAL storage.
opts := cfg . agent . ToAgentOptions ( )
cancel := make ( chan struct { } )
g . Add (
func ( ) error {
level . Info ( logger ) . Log ( "msg" , "Starting WAL storage ..." )
if cfg . agent . WALSegmentSize != 0 {
if cfg . agent . WALSegmentSize < 10 * 1024 * 1024 || cfg . agent . WALSegmentSize > 256 * 1024 * 1024 {
2021-11-02 06:03:35 -07:00
return errors . New ( "flag 'storage.agent.wal-segment-size' must be set between 10MB and 256MB" )
2021-10-29 08:25:05 -07:00
}
}
db , err := agent . Open (
logger ,
prometheus . DefaultRegisterer ,
remoteStorage ,
2021-11-05 08:50:10 -07:00
localStoragePath ,
2021-10-29 08:25:05 -07:00
& opts ,
)
if err != nil {
return errors . Wrap ( err , "opening storage failed" )
}
2021-11-05 08:50:10 -07:00
switch fsType := prom_runtime . Statfs ( localStoragePath ) ; fsType {
2021-10-29 08:25:05 -07:00
case "NFS_SUPER_MAGIC" :
level . Warn ( logger ) . Log ( "fs_type" , fsType , "msg" , "This filesystem is not supported and may lead to data corruption and data loss. Please carefully read https://prometheus.io/docs/prometheus/latest/storage/ to learn more about supported filesystems." )
default :
level . Info ( logger ) . Log ( "fs_type" , fsType )
}
level . Info ( logger ) . Log ( "msg" , "Agent WAL storage started" )
level . Debug ( logger ) . Log ( "msg" , "Agent WAL storage options" ,
"WALSegmentSize" , cfg . agent . WALSegmentSize ,
"WALCompression" , cfg . agent . WALCompression ,
"StripeSize" , cfg . agent . StripeSize ,
"TruncateFrequency" , cfg . agent . TruncateFrequency ,
"MinWALTime" , cfg . agent . MinWALTime ,
"MaxWALTime" , cfg . agent . MaxWALTime ,
)
localStorage . Set ( db , 0 )
close ( dbOpen )
<- cancel
return nil
} ,
func ( e error ) {
if err := fanoutStorage . Close ( ) ; err != nil {
level . Error ( logger ) . Log ( "msg" , "Error stopping storage" , "err" , err )
}
close ( cancel )
} ,
)
}
2017-11-11 04:06:13 -08:00
{
2018-04-01 11:19:30 -07:00
// Web handler.
2017-11-11 04:06:13 -08:00
g . Add (
func ( ) error {
2021-01-13 12:37:01 -08:00
if err := webHandler . Run ( ctxWeb , listener , * webConfig ) ; err != nil {
2019-03-25 16:01:12 -07:00
return errors . Wrapf ( err , "error starting web server" )
2017-11-11 04:06:13 -08:00
}
return nil
} ,
func ( err error ) {
2017-11-25 05:13:54 -08:00
cancelWeb ( )
2017-11-11 04:06:13 -08:00
} ,
)
}
{
2018-04-01 11:19:30 -07:00
// Notifier.
2017-11-11 04:06:13 -08:00
// Calling notifier.Stop() before ruleManager.Stop() will cause a panic if the ruleManager isn't running,
// so keep this interrupt after the ruleManager.Stop().
g . Add (
func ( ) error {
2018-01-17 05:06:56 -08:00
// When the notifier manager receives a new targets list
2018-01-17 10:14:24 -08:00
// it needs to read a valid config for each job.
// It depends on the config being in sync with the discovery manager
2018-01-17 05:06:56 -08:00
// so we wait until the config is fully loaded.
2018-02-25 23:58:10 -08:00
<- reloadReady . C
2018-12-18 03:13:18 -08:00
notifierManager . Run ( discoveryManagerNotify . SyncCh ( ) )
2018-01-17 10:14:24 -08:00
level . Info ( logger ) . Log ( "msg" , "Notifier manager stopped" )
2017-11-11 04:06:13 -08:00
return nil
} ,
func ( err error ) {
2018-12-18 03:13:18 -08:00
notifierManager . Stop ( )
2017-11-11 04:06:13 -08:00
} ,
)
}
if err := g . Run ( ) ; err != nil {
level . Error ( logger ) . Log ( "err" , err )
2018-06-21 00:32:26 -07:00
os . Exit ( 1 )
2015-06-15 03:36:32 -07:00
}
2017-08-11 11:45:52 -07:00
level . Info ( logger ) . Log ( "msg" , "See you next time!" )
2015-06-15 03:36:32 -07:00
}
2021-06-05 07:29:32 -07:00
func openDBWithMetrics ( dir string , logger log . Logger , reg prometheus . Registerer , opts * tsdb . Options , stats * tsdb . DBStats ) ( * tsdb . DB , error ) {
2020-02-14 10:48:55 -08:00
db , err := tsdb . Open (
dir ,
log . With ( logger , "component" , "tsdb" ) ,
reg ,
opts ,
2021-06-05 07:29:32 -07:00
stats ,
2020-02-14 10:48:55 -08:00
)
if err != nil {
return nil , err
}
reg . MustRegister (
prometheus . NewGaugeFunc ( prometheus . GaugeOpts {
Name : "prometheus_tsdb_lowest_timestamp_seconds" ,
Help : "Lowest timestamp value stored in the database." ,
} , func ( ) float64 {
bb := db . Blocks ( )
if len ( bb ) == 0 {
return float64 ( db . Head ( ) . MinTime ( ) / 1000 )
}
return float64 ( db . Blocks ( ) [ 0 ] . Meta ( ) . MinTime / 1000 )
} ) , prometheus . NewGaugeFunc ( prometheus . GaugeOpts {
Name : "prometheus_tsdb_head_min_time_seconds" ,
Help : "Minimum time bound of the head block." ,
} , func ( ) float64 { return float64 ( db . Head ( ) . MinTime ( ) / 1000 ) } ) ,
prometheus . NewGaugeFunc ( prometheus . GaugeOpts {
Name : "prometheus_tsdb_head_max_time_seconds" ,
Help : "Maximum timestamp of the head block." ,
} , func ( ) float64 { return float64 ( db . Head ( ) . MaxTime ( ) / 1000 ) } ) ,
)
return db , nil
}
2020-07-22 06:39:51 -07:00
// safePromQLNoStepSubqueryInterval wraps an atomic.Int64 that is written by
// Set and read by Get.
type safePromQLNoStepSubqueryInterval struct {
2020-07-30 00:45:42 -07:00
// value is the stored interval; Set writes it as a millisecond count.
value atomic . Int64
2020-07-22 06:39:51 -07:00
}
// durationToInt64Millis converts d to a whole number of milliseconds,
// discarding any sub-millisecond remainder (truncation toward zero).
func durationToInt64Millis(d time.Duration) int64 {
	return d.Milliseconds()
}
2021-10-22 01:06:44 -07:00
2020-07-22 06:39:51 -07:00
func ( i * safePromQLNoStepSubqueryInterval ) Set ( ev model . Duration ) {
2020-07-30 00:45:42 -07:00
i . value . Store ( durationToInt64Millis ( time . Duration ( ev ) ) )
2020-07-22 06:39:51 -07:00
}
func ( i * safePromQLNoStepSubqueryInterval ) Get ( int64 ) int64 {
2020-07-30 00:45:42 -07:00
return i . value . Load ( )
2020-07-22 06:39:51 -07:00
}
2020-08-06 12:48:52 -07:00
// reloader pairs a name with a function that applies a loaded configuration.
type reloader struct {
// name is the key under which reloadConfig logs how long this reloader took.
name string
// reloader applies the given configuration and reports failure via its error.
reloader func ( * config . Config ) error
}
2021-10-22 01:06:44 -07:00
func reloadConfig ( filename string , expandExternalLabels , enableExemplarStorage bool , logger log . Logger , noStepSuqueryInterval * safePromQLNoStepSubqueryInterval , rls ... reloader ) ( err error ) {
2020-08-06 12:48:52 -07:00
start := time . Now ( )
timings := [ ] interface { } { }
2017-08-11 11:45:52 -07:00
level . Info ( logger ) . Log ( "msg" , "Loading configuration file" , "filename" , filename )
2015-09-01 10:18:39 -07:00
defer func ( ) {
2016-07-11 07:24:54 -07:00
if err == nil {
2015-09-01 10:18:39 -07:00
configSuccess . Set ( 1 )
2018-01-26 23:48:13 -08:00
configSuccessTime . SetToCurrentTime ( )
2015-09-01 10:18:39 -07:00
} else {
configSuccess . Set ( 0 )
}
} ( )
2015-06-15 03:36:32 -07:00
2021-10-29 16:41:40 -07:00
conf , err := config . LoadFile ( filename , agentMode , expandExternalLabels , logger )
2015-06-15 03:36:32 -07:00
if err != nil {
2019-03-25 16:01:12 -07:00
return errors . Wrapf ( err , "couldn't load configuration (--config.file=%q)" , filename )
2015-06-15 03:36:32 -07:00
}
2021-07-19 21:52:57 -07:00
if enableExemplarStorage {
if conf . StorageConfig . ExemplarsConfig == nil {
conf . StorageConfig . ExemplarsConfig = & config . DefaultExemplarsConfig
}
}
2016-08-11 18:23:18 -07:00
failed := false
2015-06-15 03:36:32 -07:00
for _ , rl := range rls {
2020-08-06 12:48:52 -07:00
rstart := time . Now ( )
if err := rl . reloader ( conf ) ; err != nil {
2017-08-11 11:45:52 -07:00
level . Error ( logger ) . Log ( "msg" , "Failed to apply configuration" , "err" , err )
2016-08-11 18:23:18 -07:00
failed = true
2016-07-11 07:24:54 -07:00
}
2020-08-06 12:48:52 -07:00
timings = append ( timings , rl . name , time . Since ( rstart ) )
2015-06-15 03:36:32 -07:00
}
2016-08-11 18:23:18 -07:00
if failed {
2019-03-25 16:01:12 -07:00
return errors . Errorf ( "one or more errors occurred while applying the new configuration (--config.file=%q)" , filename )
2016-08-11 18:23:18 -07:00
}
2018-12-17 11:16:28 -08:00
2020-07-22 06:39:51 -07:00
noStepSuqueryInterval . Set ( conf . GlobalConfig . EvaluationInterval )
2020-08-06 12:48:52 -07:00
l := [ ] interface { } { "msg" , "Completed loading of configuration file" , "filename" , filename , "totalDuration" , time . Since ( start ) }
level . Info ( logger ) . Log ( append ( l , timings ... ) ... )
2016-08-11 18:23:18 -07:00
return nil
2015-06-15 03:36:32 -07:00
}
2017-06-20 09:48:17 -07:00
2017-10-05 03:16:15 -07:00
// startsOrEndsWithQuote reports whether s begins or ends with a double quote
// or a single quote character.
func startsOrEndsWithQuote(s string) bool {
	for _, quote := range []string{"\"", "'"} {
		if strings.HasPrefix(s, quote) || strings.HasSuffix(s, quote) {
			return true
		}
	}
	return false
}
2019-01-17 07:01:06 -08:00
// compileCORSRegexString compiles given string and adds anchors.
// Compilation is delegated to relabel.NewRegexp; on success the Regexp field
// of its result is returned, otherwise its error is returned unchanged.
func compileCORSRegexString(s string) (*regexp.Regexp, error) {
	compiled, compileErr := relabel.NewRegexp(s)
	if compileErr != nil {
		return nil, compileErr
	}
	return compiled.Regexp, nil
}
2017-06-20 09:48:17 -07:00
// computeExternalURL computes a sanitized external URL from a raw input. It infers unset
// URL parts from the OS and the given listen address.
func computeExternalURL ( u , listenAddr string ) ( * url . URL , error ) {
if u == "" {
hostname , err := os . Hostname ( )
if err != nil {
return nil , err
}
_ , port , err := net . SplitHostPort ( listenAddr )
if err != nil {
return nil , err
}
u = fmt . Sprintf ( "http://%s:%s/" , hostname , port )
}
2017-10-05 03:16:15 -07:00
if startsOrEndsWithQuote ( u ) {
2019-03-25 16:01:12 -07:00
return nil , errors . New ( "URL must not begin or end with quotes" )
2017-06-20 09:48:17 -07:00
}
eu , err := url . Parse ( u )
if err != nil {
return nil , err
}
ppref := strings . TrimRight ( eu . Path , "/" )
if ppref != "" && ! strings . HasPrefix ( ppref , "/" ) {
ppref = "/" + ppref
}
eu . Path = ppref
return eu , nil
}
2017-11-23 23:59:05 -08:00
2018-12-18 03:15:46 -08:00
// sender is what sendAlerts needs from its target: something that accepts a
// batch of notifier alerts.
type sender interface {
// Send hands the given alerts to the implementation.
Send ( alerts ... * notifier . Alert )
}
2018-01-22 08:17:33 -08:00
// sendAlerts implements the rules.NotifyFunc for a Notifier.
2018-12-18 03:15:46 -08:00
func sendAlerts ( s sender , externalURL string ) rules . NotifyFunc {
2018-08-04 12:31:12 -07:00
return func ( ctx context . Context , expr string , alerts ... * rules . Alert ) {
2017-11-23 23:59:05 -08:00
var res [ ] * notifier . Alert
for _ , alert := range alerts {
a := & notifier . Alert {
StartsAt : alert . FiredAt ,
Labels : alert . Labels ,
Annotations : alert . Annotations ,
GeneratorURL : externalURL + strutil . TableLinkForExpression ( expr ) ,
}
if ! alert . ResolvedAt . IsZero ( ) {
a . EndsAt = alert . ResolvedAt
2018-08-28 08:05:00 -07:00
} else {
a . EndsAt = alert . ValidUntil
2017-11-23 23:59:05 -08:00
}
res = append ( res , a )
}
if len ( alerts ) > 0 {
2018-12-18 03:15:46 -08:00
s . Send ( res ... )
2017-11-23 23:59:05 -08:00
}
}
}
2020-02-17 03:41:04 -08:00
// readyStorage implements the Storage interface while allowing to set the actual
// storage at a later point in time.
type readyStorage struct {
mtx sync . RWMutex
2021-10-29 08:25:05 -07:00
db storage . Storage
2020-02-17 03:41:04 -08:00
startTimeMargin int64
2021-06-05 07:29:32 -07:00
stats * tsdb . DBStats
2020-02-17 03:41:04 -08:00
}
2021-07-19 21:52:57 -07:00
func ( s * readyStorage ) ApplyConfig ( conf * config . Config ) error {
db := s . get ( )
2021-10-29 08:25:05 -07:00
if db , ok := db . ( * tsdb . DB ) ; ok {
return db . ApplyConfig ( conf )
}
return nil
2021-07-19 21:52:57 -07:00
}
2020-02-17 03:41:04 -08:00
// Set the storage.
2021-10-29 08:25:05 -07:00
func ( s * readyStorage ) Set ( db storage . Storage , startTimeMargin int64 ) {
2020-02-17 03:41:04 -08:00
s . mtx . Lock ( )
defer s . mtx . Unlock ( )
s . db = db
s . startTimeMargin = startTimeMargin
}
2021-10-29 08:25:05 -07:00
func ( s * readyStorage ) get ( ) storage . Storage {
2020-02-17 03:41:04 -08:00
s . mtx . RLock ( )
x := s . db
s . mtx . RUnlock ( )
return x
}
2021-06-05 07:29:32 -07:00
// getStats returns the stats pointer held by the storage wrapper, which may be
// nil. The field is read under the read lock.
func (s *readyStorage) getStats() *tsdb.DBStats {
	s.mtx.RLock()
	defer s.mtx.RUnlock()
	return s.stats
}
2020-02-17 03:41:04 -08:00
// StartTime implements the Storage interface.
func ( s * readyStorage ) StartTime ( ) ( int64 , error ) {
if x := s . get ( ) ; x != nil {
2021-10-29 08:25:05 -07:00
switch db := x . ( type ) {
case * tsdb . DB :
var startTime int64
if len ( db . Blocks ( ) ) > 0 {
startTime = db . Blocks ( ) [ 0 ] . Meta ( ) . MinTime
} else {
startTime = time . Now ( ) . Unix ( ) * 1000
}
// Add a safety margin as it may take a few minutes for everything to spin up.
return startTime + s . startTimeMargin , nil
case * agent . DB :
return db . StartTime ( )
default :
2021-11-01 03:38:23 -07:00
panic ( fmt . Sprintf ( "unknown storage type %T" , db ) )
2020-02-17 03:41:04 -08:00
}
}
return math . MaxInt64 , tsdb . ErrNotReady
}
// Querier implements the Storage interface. It delegates to the underlying
// storage and returns tsdb.ErrNotReady while none has been set.
func (s *readyStorage) Querier(ctx context.Context, mint, maxt int64) (storage.Querier, error) {
	x := s.get()
	if x == nil {
		return nil, tsdb.ErrNotReady
	}
	return x.Querier(ctx, mint, maxt)
}
2020-06-24 06:41:52 -07:00
// ChunkQuerier implements the Storage interface. It delegates to the
// underlying storage and returns tsdb.ErrNotReady while none has been set.
func (s *readyStorage) ChunkQuerier(ctx context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
	x := s.get()
	if x == nil {
		return nil, tsdb.ErrNotReady
	}
	return x.ChunkQuerier(ctx, mint, maxt)
}
2021-03-16 02:47:45 -07:00
func ( s * readyStorage ) ExemplarQuerier ( ctx context . Context ) ( storage . ExemplarQuerier , error ) {
if x := s . get ( ) ; x != nil {
2021-10-29 08:25:05 -07:00
switch db := x . ( type ) {
case * tsdb . DB :
return db . ExemplarQuerier ( ctx )
case * agent . DB :
return nil , agent . ErrUnsupported
default :
panic ( fmt . Sprintf ( "unknown storage type %T" , db ) )
}
2021-03-16 02:47:45 -07:00
}
return nil , tsdb . ErrNotReady
}
2020-02-17 03:41:04 -08:00
// Appender implements the Storage interface.
2020-07-24 07:10:51 -07:00
func ( s * readyStorage ) Appender ( ctx context . Context ) storage . Appender {
2020-02-17 03:41:04 -08:00
if x := s . get ( ) ; x != nil {
2020-07-24 07:10:51 -07:00
return x . Appender ( ctx )
2020-02-17 03:41:04 -08:00
}
return notReadyAppender { }
}
type notReadyAppender struct { }
2021-11-06 03:10:04 -07:00
func ( n notReadyAppender ) Append ( ref storage . SeriesRef , l labels . Labels , t int64 , v float64 ) ( storage . SeriesRef , error ) {
2020-02-17 03:41:04 -08:00
return 0 , tsdb . ErrNotReady
}
2021-11-06 03:10:04 -07:00
func ( n notReadyAppender ) AppendExemplar ( ref storage . SeriesRef , l labels . Labels , e exemplar . Exemplar ) ( storage . SeriesRef , error ) {
2021-03-16 02:47:45 -07:00
return 0 , tsdb . ErrNotReady
}
2020-02-17 03:41:04 -08:00
func ( n notReadyAppender ) Commit ( ) error { return tsdb . ErrNotReady }
func ( n notReadyAppender ) Rollback ( ) error { return tsdb . ErrNotReady }
// Close implements the Storage interface.
func ( s * readyStorage ) Close ( ) error {
2020-04-29 09:16:14 -07:00
if x := s . get ( ) ; x != nil {
2020-02-17 03:41:04 -08:00
return x . Close ( )
}
return nil
}
2020-02-18 03:25:36 -08:00
2020-04-29 09:16:14 -07:00
// CleanTombstones implements the api_v1.TSDBAdminStats and api_v2.TSDBAdmin interfaces.
func ( s * readyStorage ) CleanTombstones ( ) error {
if x := s . get ( ) ; x != nil {
2021-10-29 08:25:05 -07:00
switch db := x . ( type ) {
case * tsdb . DB :
return db . CleanTombstones ( )
case * agent . DB :
return agent . ErrUnsupported
default :
panic ( fmt . Sprintf ( "unknown storage type %T" , db ) )
}
2020-04-29 09:16:14 -07:00
}
return tsdb . ErrNotReady
}
// Delete implements the api_v1.TSDBAdminStats and api_v2.TSDBAdmin interfaces.
func ( s * readyStorage ) Delete ( mint , maxt int64 , ms ... * labels . Matcher ) error {
if x := s . get ( ) ; x != nil {
2021-10-29 08:25:05 -07:00
switch db := x . ( type ) {
case * tsdb . DB :
return db . Delete ( mint , maxt , ms ... )
case * agent . DB :
return agent . ErrUnsupported
default :
panic ( fmt . Sprintf ( "unknown storage type %T" , db ) )
}
2020-04-29 09:16:14 -07:00
}
return tsdb . ErrNotReady
}
// Snapshot implements the api_v1.TSDBAdminStats and api_v2.TSDBAdmin interfaces.
func ( s * readyStorage ) Snapshot ( dir string , withHead bool ) error {
if x := s . get ( ) ; x != nil {
2021-10-29 08:25:05 -07:00
switch db := x . ( type ) {
case * tsdb . DB :
return db . Snapshot ( dir , withHead )
case * agent . DB :
return agent . ErrUnsupported
default :
panic ( fmt . Sprintf ( "unknown storage type %T" , db ) )
}
2020-04-29 09:16:14 -07:00
}
return tsdb . ErrNotReady
}
// Stats implements the api_v1.TSDBAdminStats interface.
func ( s * readyStorage ) Stats ( statsByLabelName string ) ( * tsdb . Stats , error ) {
if x := s . get ( ) ; x != nil {
2021-10-29 08:25:05 -07:00
switch db := x . ( type ) {
case * tsdb . DB :
return db . Head ( ) . Stats ( statsByLabelName ) , nil
case * agent . DB :
return nil , agent . ErrUnsupported
default :
panic ( fmt . Sprintf ( "unknown storage type %T" , db ) )
}
2020-04-29 09:16:14 -07:00
}
return nil , tsdb . ErrNotReady
}
2021-06-05 07:29:32 -07:00
// WALReplayStatus implements the api_v1.TSDBStats interface. It reports the
// replay status recorded in the head stats, or tsdb.ErrNotReady together with
// a zero status when no stats are available.
func (s *readyStorage) WALReplayStatus() (tsdb.WALReplayStatus, error) {
	stats := s.getStats()
	if stats == nil {
		return tsdb.WALReplayStatus{}, tsdb.ErrNotReady
	}
	return stats.Head.WALReplayStatus.GetWALReplayStatus(), nil
}
2020-11-19 07:23:03 -08:00
// ErrNotReady is the error returned by readyScrapeManager.Get while no scrape
// manager has been set.
var ErrNotReady = errors.New("Scrape manager not ready")
// readyScrapeManager allows a scrape manager to be retrieved even if it is
// only set at a later point in time.
type readyScrapeManager struct {
	// mtx is taken by Set and Get when they access m.
	mtx sync.RWMutex
	// m is the scrape manager; it stays nil until Set is called.
	m *scrape.Manager
}
// Set stores m as the scrape manager returned by subsequent Get calls. The
// field is written under the write lock.
func (rm *readyScrapeManager) Set(m *scrape.Manager) {
	rm.mtx.Lock()
	rm.m = m
	rm.mtx.Unlock()
}
// Get returns the scrape manager. If it has not been set yet, it returns
// ErrNotReady instead.
func (rm *readyScrapeManager) Get() (*scrape.Manager, error) {
	rm.mtx.RLock()
	m := rm.m
	rm.mtx.RUnlock()

	if m == nil {
		return nil, ErrNotReady
	}
	return m, nil
}
2020-02-18 03:25:36 -08:00
// tsdbOptions is tsdb.Option version with defined units.
// This is required as tsdb.Option fields are unit agnostic (time).
type tsdbOptions struct {
2021-08-06 09:51:01 -07:00
WALSegmentSize units . Base2Bytes
MaxBlockChunkSegmentSize units . Base2Bytes
RetentionDuration model . Duration
MaxBytes units . Base2Bytes
NoLockfile bool
AllowOverlappingBlocks bool
WALCompression bool
2022-03-11 08:26:59 -08:00
HeadChunksWriteQueueSize int
2021-08-06 09:51:01 -07:00
StripeSize int
MinBlockDuration model . Duration
MaxBlockDuration model . Duration
EnableExemplarStorage bool
MaxExemplars int64
EnableMemorySnapshotOnShutdown bool
2020-02-18 03:25:36 -08:00
}
func ( opts tsdbOptions ) ToTSDBOptions ( ) tsdb . Options {
return tsdb . Options {
2021-08-06 09:51:01 -07:00
WALSegmentSize : int ( opts . WALSegmentSize ) ,
MaxBlockChunkSegmentSize : int64 ( opts . MaxBlockChunkSegmentSize ) ,
RetentionDuration : int64 ( time . Duration ( opts . RetentionDuration ) / time . Millisecond ) ,
MaxBytes : int64 ( opts . MaxBytes ) ,
NoLockfile : opts . NoLockfile ,
AllowOverlappingBlocks : opts . AllowOverlappingBlocks ,
WALCompression : opts . WALCompression ,
2022-03-11 08:26:59 -08:00
HeadChunksWriteQueueSize : opts . HeadChunksWriteQueueSize ,
2021-08-06 09:51:01 -07:00
StripeSize : opts . StripeSize ,
MinBlockDuration : int64 ( time . Duration ( opts . MinBlockDuration ) / time . Millisecond ) ,
MaxBlockDuration : int64 ( time . Duration ( opts . MaxBlockDuration ) / time . Millisecond ) ,
EnableExemplarStorage : opts . EnableExemplarStorage ,
MaxExemplars : opts . MaxExemplars ,
EnableMemorySnapshotOnShutdown : opts . EnableMemorySnapshotOnShutdown ,
2020-02-18 03:25:36 -08:00
}
}
2020-04-22 17:05:55 -07:00
2021-10-29 08:25:05 -07:00
// agentOptions is a version of agent.Options with defined units. This is required
// as agent.Option fields are unit agnostic (time).
type agentOptions struct {
WALSegmentSize units . Base2Bytes
WALCompression bool
StripeSize int
TruncateFrequency model . Duration
MinWALTime , MaxWALTime model . Duration
2021-11-11 08:45:25 -08:00
NoLockfile bool
2021-10-29 08:25:05 -07:00
}
func ( opts agentOptions ) ToAgentOptions ( ) agent . Options {
return agent . Options {
WALSegmentSize : int ( opts . WALSegmentSize ) ,
WALCompression : opts . WALCompression ,
StripeSize : opts . StripeSize ,
TruncateFrequency : time . Duration ( opts . TruncateFrequency ) ,
MinWALTime : durationToInt64Millis ( time . Duration ( opts . MinWALTime ) ) ,
MaxWALTime : durationToInt64Millis ( time . Duration ( opts . MaxWALTime ) ) ,
2021-11-11 08:45:25 -08:00
NoLockfile : opts . NoLockfile ,
2021-10-29 08:25:05 -07:00
}
}
2021-10-20 01:15:54 -07:00
// discoveryManager abstracts over the discovery manager implementations. It
// exists so that the manager that restarts SDs on reload can stay in use for a
// few releases, until we feel the new manager can be enabled for all users.
type discoveryManager interface {
	// ApplyConfig applies the given discovery configurations, keyed by name.
	ApplyConfig(cfg map[string]discovery.Configs) error
	// Run runs the manager and returns its terminal error.
	Run() error
	// SyncCh returns the receive-only channel of target groups, keyed by name.
	SyncCh() <-chan map[string][]*targetgroup.Group
}