mirror of
https://github.com/prometheus/prometheus.git
synced 2025-01-13 06:47:28 -08:00
e96d786fb8
I have received a recent report for a user which confirms that sometime the GRPC server does not stop propery. It appears that there are 2 issues: 1. The cmux server can refuse to stop if there are stale connections. For that we set the ReadTimeout. 2. The GRPC server graceful stop can never finish. What this PR avoids is: ``` goroutine 227 [semacquire, 2 minutes]: sync.runtime_Semacquire(0xc00059a75c) /usr/local/go/src/runtime/sema.go:56 +0x42 sync.(*WaitGroup).Wait(0xc00059a75c) /usr/local/go/src/sync/waitgroup.go:130 +0x64 google.golang.org/grpc.(*Server).GracefulStop(0xc00059a600) ``` This PR stops the GRPC server after 15s. Related to the go routing dumps in #6747. Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
1169 lines
33 KiB
Go
1169 lines
33 KiB
Go
// Copyright 2013 The Prometheus Authors
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package web
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
stdlog "log"
|
|
"math"
|
|
"net"
|
|
"net/http"
|
|
"net/http/pprof"
|
|
"net/url"
|
|
"os"
|
|
"path"
|
|
"path/filepath"
|
|
"regexp"
|
|
"runtime"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
template_text "text/template"
|
|
"time"
|
|
|
|
"github.com/alecthomas/units"
|
|
"github.com/go-kit/kit/log"
|
|
"github.com/go-kit/kit/log/level"
|
|
conntrack "github.com/mwitkow/go-conntrack"
|
|
"github.com/opentracing-contrib/go-stdlib/nethttp"
|
|
opentracing "github.com/opentracing/opentracing-go"
|
|
"github.com/pkg/errors"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
|
io_prometheus_client "github.com/prometheus/client_model/go"
|
|
"github.com/prometheus/common/model"
|
|
"github.com/prometheus/common/route"
|
|
"github.com/prometheus/common/server"
|
|
"github.com/soheilhy/cmux"
|
|
"go.uber.org/atomic"
|
|
"golang.org/x/net/netutil"
|
|
"google.golang.org/grpc"
|
|
|
|
"github.com/prometheus/prometheus/config"
|
|
"github.com/prometheus/prometheus/notifier"
|
|
"github.com/prometheus/prometheus/promql"
|
|
"github.com/prometheus/prometheus/rules"
|
|
"github.com/prometheus/prometheus/scrape"
|
|
"github.com/prometheus/prometheus/storage"
|
|
"github.com/prometheus/prometheus/template"
|
|
"github.com/prometheus/prometheus/tsdb"
|
|
"github.com/prometheus/prometheus/tsdb/index"
|
|
"github.com/prometheus/prometheus/util/httputil"
|
|
api_v1 "github.com/prometheus/prometheus/web/api/v1"
|
|
api_v2 "github.com/prometheus/prometheus/web/api/v2"
|
|
"github.com/prometheus/prometheus/web/ui"
|
|
)
|
|
|
|
// Paths that are handled by the React / Reach router that should all be served the main React app's index.html.
|
|
var reactRouterPaths = []string{
|
|
"/",
|
|
"/alerts",
|
|
"/config",
|
|
"/flags",
|
|
"/graph",
|
|
"/rules",
|
|
"/service-discovery",
|
|
"/status",
|
|
"/targets",
|
|
"/tsdb-status",
|
|
"/version",
|
|
}
|
|
|
|
// withStackTrace logs the stack trace in case the request panics. The function
|
|
// will re-raise the error which will then be handled by the net/http package.
|
|
// It is needed because the go-kit log package doesn't manage properly the
|
|
// panics from net/http (see https://github.com/go-kit/kit/issues/233).
|
|
func withStackTracer(h http.Handler, l log.Logger) http.Handler {
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
defer func() {
|
|
if err := recover(); err != nil {
|
|
const size = 64 << 10
|
|
buf := make([]byte, size)
|
|
buf = buf[:runtime.Stack(buf, false)]
|
|
level.Error(l).Log("msg", "panic while serving request", "client", r.RemoteAddr, "url", r.URL, "err", err, "stack", buf)
|
|
panic(err)
|
|
}
|
|
}()
|
|
h.ServeHTTP(w, r)
|
|
})
|
|
}
|
|
|
|
type metrics struct {
|
|
requestCounter *prometheus.CounterVec
|
|
requestDuration *prometheus.HistogramVec
|
|
responseSize *prometheus.HistogramVec
|
|
}
|
|
|
|
func newMetrics(r prometheus.Registerer) *metrics {
|
|
m := &metrics{
|
|
requestCounter: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Name: "prometheus_http_requests_total",
|
|
Help: "Counter of HTTP requests.",
|
|
},
|
|
[]string{"handler", "code"},
|
|
),
|
|
requestDuration: prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "prometheus_http_request_duration_seconds",
|
|
Help: "Histogram of latencies for HTTP requests.",
|
|
Buckets: []float64{.1, .2, .4, 1, 3, 8, 20, 60, 120},
|
|
},
|
|
[]string{"handler"},
|
|
),
|
|
responseSize: prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Name: "prometheus_http_response_size_bytes",
|
|
Help: "Histogram of response size for HTTP requests.",
|
|
Buckets: prometheus.ExponentialBuckets(100, 10, 8),
|
|
},
|
|
[]string{"handler"},
|
|
),
|
|
}
|
|
|
|
if r != nil {
|
|
r.MustRegister(m.requestCounter, m.requestDuration, m.responseSize)
|
|
registerFederationMetrics(r)
|
|
}
|
|
return m
|
|
}
|
|
|
|
func (m *metrics) instrumentHandlerWithPrefix(prefix string) func(handlerName string, handler http.HandlerFunc) http.HandlerFunc {
|
|
return func(handlerName string, handler http.HandlerFunc) http.HandlerFunc {
|
|
return m.instrumentHandler(prefix+handlerName, handler)
|
|
}
|
|
}
|
|
|
|
func (m *metrics) instrumentHandler(handlerName string, handler http.HandlerFunc) http.HandlerFunc {
|
|
return promhttp.InstrumentHandlerCounter(
|
|
m.requestCounter.MustCurryWith(prometheus.Labels{"handler": handlerName}),
|
|
promhttp.InstrumentHandlerDuration(
|
|
m.requestDuration.MustCurryWith(prometheus.Labels{"handler": handlerName}),
|
|
promhttp.InstrumentHandlerResponseSize(
|
|
m.responseSize.MustCurryWith(prometheus.Labels{"handler": handlerName}),
|
|
handler,
|
|
),
|
|
),
|
|
)
|
|
}
|
|
|
|
// PrometheusVersion contains build information about Prometheus.
|
|
type PrometheusVersion = api_v1.PrometheusVersion
|
|
|
|
type LocalStorage interface {
|
|
storage.Storage
|
|
api_v1.TSDBAdminStats
|
|
}
|
|
|
|
// Handler serves various HTTP endpoints of the Prometheus server
|
|
type Handler struct {
|
|
logger log.Logger
|
|
|
|
gatherer prometheus.Gatherer
|
|
metrics *metrics
|
|
|
|
scrapeManager *scrape.Manager
|
|
ruleManager *rules.Manager
|
|
queryEngine *promql.Engine
|
|
lookbackDelta time.Duration
|
|
context context.Context
|
|
storage storage.Storage
|
|
localStorage LocalStorage
|
|
notifier *notifier.Manager
|
|
|
|
apiV1 *api_v1.API
|
|
|
|
router *route.Router
|
|
quitCh chan struct{}
|
|
reloadCh chan chan error
|
|
options *Options
|
|
config *config.Config
|
|
versionInfo *PrometheusVersion
|
|
birth time.Time
|
|
cwd string
|
|
flagsMap map[string]string
|
|
|
|
mtx sync.RWMutex
|
|
now func() model.Time
|
|
|
|
ready atomic.Uint32 // ready is uint32 rather than boolean to be able to use atomic functions.
|
|
}
|
|
|
|
// ApplyConfig updates the config field of the Handler struct
|
|
func (h *Handler) ApplyConfig(conf *config.Config) error {
|
|
h.mtx.Lock()
|
|
defer h.mtx.Unlock()
|
|
|
|
h.config = conf
|
|
|
|
return nil
|
|
}
|
|
|
|
// Options for the web Handler.
|
|
type Options struct {
|
|
Context context.Context
|
|
TSDBRetentionDuration model.Duration
|
|
TSDBDir string
|
|
TSDBMaxBytes units.Base2Bytes
|
|
LocalStorage LocalStorage
|
|
Storage storage.Storage
|
|
QueryEngine *promql.Engine
|
|
LookbackDelta time.Duration
|
|
ScrapeManager *scrape.Manager
|
|
RuleManager *rules.Manager
|
|
Notifier *notifier.Manager
|
|
Version *PrometheusVersion
|
|
Flags map[string]string
|
|
|
|
ListenAddress string
|
|
CORSOrigin *regexp.Regexp
|
|
ReadTimeout time.Duration
|
|
MaxConnections int
|
|
ExternalURL *url.URL
|
|
RoutePrefix string
|
|
UseLocalAssets bool
|
|
UserAssetsPath string
|
|
ConsoleTemplatesPath string
|
|
ConsoleLibrariesPath string
|
|
EnableLifecycle bool
|
|
EnableAdminAPI bool
|
|
PageTitle string
|
|
RemoteReadSampleLimit int
|
|
RemoteReadConcurrencyLimit int
|
|
RemoteReadBytesInFrame int
|
|
|
|
Gatherer prometheus.Gatherer
|
|
Registerer prometheus.Registerer
|
|
}
|
|
|
|
// New initializes a new web Handler.
|
|
func New(logger log.Logger, o *Options) *Handler {
|
|
if logger == nil {
|
|
logger = log.NewNopLogger()
|
|
}
|
|
|
|
m := newMetrics(o.Registerer)
|
|
router := route.New().
|
|
WithInstrumentation(m.instrumentHandler).
|
|
WithInstrumentation(setPathWithPrefix(""))
|
|
|
|
cwd, err := os.Getwd()
|
|
if err != nil {
|
|
cwd = "<error retrieving current working directory>"
|
|
}
|
|
|
|
h := &Handler{
|
|
logger: logger,
|
|
|
|
gatherer: o.Gatherer,
|
|
metrics: m,
|
|
|
|
router: router,
|
|
quitCh: make(chan struct{}),
|
|
reloadCh: make(chan chan error),
|
|
options: o,
|
|
versionInfo: o.Version,
|
|
birth: time.Now().UTC(),
|
|
cwd: cwd,
|
|
flagsMap: o.Flags,
|
|
|
|
context: o.Context,
|
|
scrapeManager: o.ScrapeManager,
|
|
ruleManager: o.RuleManager,
|
|
queryEngine: o.QueryEngine,
|
|
lookbackDelta: o.LookbackDelta,
|
|
storage: o.Storage,
|
|
localStorage: o.LocalStorage,
|
|
notifier: o.Notifier,
|
|
|
|
now: model.Now,
|
|
}
|
|
h.ready.Store(0)
|
|
|
|
factoryTr := func(_ context.Context) api_v1.TargetRetriever { return h.scrapeManager }
|
|
factoryAr := func(_ context.Context) api_v1.AlertmanagerRetriever { return h.notifier }
|
|
FactoryRr := func(_ context.Context) api_v1.RulesRetriever { return h.ruleManager }
|
|
|
|
h.apiV1 = api_v1.NewAPI(h.queryEngine, h.storage, factoryTr, factoryAr,
|
|
func() config.Config {
|
|
h.mtx.RLock()
|
|
defer h.mtx.RUnlock()
|
|
return *h.config
|
|
},
|
|
o.Flags,
|
|
api_v1.GlobalURLOptions{
|
|
ListenAddress: o.ListenAddress,
|
|
Host: o.ExternalURL.Host,
|
|
Scheme: o.ExternalURL.Scheme,
|
|
},
|
|
h.testReady,
|
|
h.options.LocalStorage,
|
|
h.options.TSDBDir,
|
|
h.options.EnableAdminAPI,
|
|
logger,
|
|
FactoryRr,
|
|
h.options.RemoteReadSampleLimit,
|
|
h.options.RemoteReadConcurrencyLimit,
|
|
h.options.RemoteReadBytesInFrame,
|
|
h.options.CORSOrigin,
|
|
h.runtimeInfo,
|
|
h.versionInfo,
|
|
)
|
|
|
|
if o.RoutePrefix != "/" {
|
|
// If the prefix is missing for the root path, prepend it.
|
|
router.Get("/", func(w http.ResponseWriter, r *http.Request) {
|
|
http.Redirect(w, r, o.RoutePrefix, http.StatusFound)
|
|
})
|
|
router = router.WithPrefix(o.RoutePrefix)
|
|
}
|
|
|
|
readyf := h.testReady
|
|
|
|
router.Get("/", func(w http.ResponseWriter, r *http.Request) {
|
|
http.Redirect(w, r, path.Join(o.ExternalURL.Path, "/graph"), http.StatusFound)
|
|
})
|
|
|
|
router.Get("/alerts", readyf(h.alerts))
|
|
router.Get("/graph", readyf(h.graph))
|
|
router.Get("/status", readyf(h.status))
|
|
router.Get("/flags", readyf(h.flags))
|
|
router.Get("/config", readyf(h.serveConfig))
|
|
router.Get("/rules", readyf(h.rules))
|
|
router.Get("/targets", readyf(h.targets))
|
|
router.Get("/version", readyf(h.version))
|
|
router.Get("/service-discovery", readyf(h.serviceDiscovery))
|
|
|
|
router.Get("/metrics", promhttp.Handler().ServeHTTP)
|
|
|
|
router.Get("/federate", readyf(httputil.CompressionHandler{
|
|
Handler: http.HandlerFunc(h.federation),
|
|
}.ServeHTTP))
|
|
|
|
router.Get("/consoles/*filepath", readyf(h.consoles))
|
|
|
|
router.Get("/static/*filepath", func(w http.ResponseWriter, r *http.Request) {
|
|
r.URL.Path = path.Join("/static", route.Param(r.Context(), "filepath"))
|
|
fs := server.StaticFileServer(ui.Assets)
|
|
fs.ServeHTTP(w, r)
|
|
})
|
|
|
|
// Make sure that "<path-prefix>/new" is redirected to "<path-prefix>/new/" and
|
|
// not just the naked "/new/", which would be the default behavior of the router
|
|
// with the "RedirectTrailingSlash" option (https://godoc.org/github.com/julienschmidt/httprouter#Router.RedirectTrailingSlash),
|
|
// and which breaks users with a --web.route-prefix that deviates from the path derived
|
|
// from the external URL.
|
|
router.Get("/new", func(w http.ResponseWriter, r *http.Request) {
|
|
http.Redirect(w, r, path.Join(o.ExternalURL.Path, "new")+"/", http.StatusFound)
|
|
})
|
|
|
|
router.Get("/new/*filepath", func(w http.ResponseWriter, r *http.Request) {
|
|
p := route.Param(r.Context(), "filepath")
|
|
|
|
// For paths that the React/Reach router handles, we want to serve the
|
|
// index.html, but with replaced path prefix placeholder.
|
|
for _, rp := range reactRouterPaths {
|
|
if p != rp {
|
|
continue
|
|
}
|
|
|
|
f, err := ui.Assets.Open("/static/react/index.html")
|
|
if err != nil {
|
|
w.WriteHeader(http.StatusInternalServerError)
|
|
fmt.Fprintf(w, "Error opening React index.html: %v", err)
|
|
return
|
|
}
|
|
idx, err := ioutil.ReadAll(f)
|
|
if err != nil {
|
|
w.WriteHeader(http.StatusInternalServerError)
|
|
fmt.Fprintf(w, "Error reading React index.html: %v", err)
|
|
return
|
|
}
|
|
replacedIdx := bytes.ReplaceAll(idx, []byte("PATH_PREFIX_PLACEHOLDER"), []byte(o.ExternalURL.Path))
|
|
replacedIdx = bytes.ReplaceAll(replacedIdx, []byte("CONSOLES_LINK_PLACEHOLDER"), []byte(h.consolesPath()))
|
|
replacedIdx = bytes.ReplaceAll(replacedIdx, []byte("TITLE_PLACEHOLDER"), []byte(h.options.PageTitle))
|
|
w.Write(replacedIdx)
|
|
return
|
|
}
|
|
|
|
// For all other paths, serve auxiliary assets.
|
|
r.URL.Path = path.Join("/static/react/", p)
|
|
fs := server.StaticFileServer(ui.Assets)
|
|
fs.ServeHTTP(w, r)
|
|
})
|
|
|
|
if o.UserAssetsPath != "" {
|
|
router.Get("/user/*filepath", route.FileServe(o.UserAssetsPath))
|
|
}
|
|
|
|
if o.EnableLifecycle {
|
|
router.Post("/-/quit", h.quit)
|
|
router.Put("/-/quit", h.quit)
|
|
router.Post("/-/reload", h.reload)
|
|
router.Put("/-/reload", h.reload)
|
|
} else {
|
|
forbiddenAPINotEnabled := func(w http.ResponseWriter, _ *http.Request) {
|
|
w.WriteHeader(http.StatusForbidden)
|
|
w.Write([]byte("Lifecycle API is not enabled."))
|
|
}
|
|
router.Post("/-/quit", forbiddenAPINotEnabled)
|
|
router.Put("/-/quit", forbiddenAPINotEnabled)
|
|
router.Post("/-/reload", forbiddenAPINotEnabled)
|
|
router.Put("/-/reload", forbiddenAPINotEnabled)
|
|
}
|
|
router.Get("/-/quit", func(w http.ResponseWriter, _ *http.Request) {
|
|
w.WriteHeader(http.StatusMethodNotAllowed)
|
|
w.Write([]byte("Only POST or PUT requests allowed"))
|
|
})
|
|
router.Get("/-/reload", func(w http.ResponseWriter, _ *http.Request) {
|
|
w.WriteHeader(http.StatusMethodNotAllowed)
|
|
w.Write([]byte("Only POST or PUT requests allowed"))
|
|
})
|
|
|
|
router.Get("/debug/*subpath", serveDebug)
|
|
router.Post("/debug/*subpath", serveDebug)
|
|
|
|
router.Get("/-/healthy", func(w http.ResponseWriter, r *http.Request) {
|
|
w.WriteHeader(http.StatusOK)
|
|
fmt.Fprintf(w, "Prometheus is Healthy.\n")
|
|
})
|
|
router.Get("/-/ready", readyf(func(w http.ResponseWriter, r *http.Request) {
|
|
w.WriteHeader(http.StatusOK)
|
|
fmt.Fprintf(w, "Prometheus is Ready.\n")
|
|
}))
|
|
|
|
return h
|
|
}
|
|
|
|
func serveDebug(w http.ResponseWriter, req *http.Request) {
|
|
ctx := req.Context()
|
|
subpath := route.Param(ctx, "subpath")
|
|
|
|
if subpath == "/pprof" {
|
|
http.Redirect(w, req, req.URL.Path+"/", http.StatusMovedPermanently)
|
|
return
|
|
}
|
|
|
|
if !strings.HasPrefix(subpath, "/pprof/") {
|
|
http.NotFound(w, req)
|
|
return
|
|
}
|
|
subpath = strings.TrimPrefix(subpath, "/pprof/")
|
|
|
|
switch subpath {
|
|
case "cmdline":
|
|
pprof.Cmdline(w, req)
|
|
case "profile":
|
|
pprof.Profile(w, req)
|
|
case "symbol":
|
|
pprof.Symbol(w, req)
|
|
case "trace":
|
|
pprof.Trace(w, req)
|
|
default:
|
|
req.URL.Path = "/debug/pprof/" + subpath
|
|
pprof.Index(w, req)
|
|
}
|
|
}
|
|
|
|
// Ready sets Handler to be ready.
|
|
func (h *Handler) Ready() {
|
|
h.ready.Store(1)
|
|
}
|
|
|
|
// Verifies whether the server is ready or not.
|
|
func (h *Handler) isReady() bool {
|
|
return h.ready.Load() > 0
|
|
}
|
|
|
|
// Checks if server is ready, calls f if it is, returns 503 if it is not.
|
|
func (h *Handler) testReady(f http.HandlerFunc) http.HandlerFunc {
|
|
return func(w http.ResponseWriter, r *http.Request) {
|
|
if h.isReady() {
|
|
f(w, r)
|
|
} else {
|
|
w.WriteHeader(http.StatusServiceUnavailable)
|
|
fmt.Fprintf(w, "Service Unavailable")
|
|
}
|
|
}
|
|
}
|
|
|
|
// Checks if server is ready, calls f if it is, returns 503 if it is not.
|
|
func (h *Handler) testReadyHandler(f http.Handler) http.HandlerFunc {
|
|
return h.testReady(f.ServeHTTP)
|
|
}
|
|
|
|
// Quit returns the receive-only quit channel.
|
|
func (h *Handler) Quit() <-chan struct{} {
|
|
return h.quitCh
|
|
}
|
|
|
|
// Reload returns the receive-only channel that signals configuration reload requests.
|
|
func (h *Handler) Reload() <-chan chan error {
|
|
return h.reloadCh
|
|
}
|
|
|
|
// Run serves the HTTP endpoints.
|
|
func (h *Handler) Run(ctx context.Context) error {
|
|
level.Info(h.logger).Log("msg", "Start listening for connections", "address", h.options.ListenAddress)
|
|
|
|
listener, err := net.Listen("tcp", h.options.ListenAddress)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
listener = netutil.LimitListener(listener, h.options.MaxConnections)
|
|
|
|
// Monitor incoming connections with conntrack.
|
|
listener = conntrack.NewListener(listener,
|
|
conntrack.TrackWithName("http"),
|
|
conntrack.TrackWithTracing())
|
|
|
|
var (
|
|
m = cmux.New(listener)
|
|
// See https://github.com/grpc/grpc-go/issues/2636 for why we need to use MatchWithWriters().
|
|
grpcl = m.MatchWithWriters(cmux.HTTP2MatchHeaderFieldSendSettings("content-type", "application/grpc"))
|
|
httpl = m.Match(cmux.HTTP1Fast())
|
|
grpcSrv = grpc.NewServer()
|
|
)
|
|
|
|
// Prevent open connections to block the shutdown of the handler.
|
|
m.SetReadTimeout(h.options.ReadTimeout)
|
|
|
|
av2 := api_v2.New(
|
|
h.options.LocalStorage,
|
|
h.options.TSDBDir,
|
|
h.options.EnableAdminAPI,
|
|
)
|
|
av2.RegisterGRPC(grpcSrv)
|
|
|
|
hh, err := av2.HTTPHandler(ctx, h.options.ListenAddress)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
hhFunc := h.testReadyHandler(hh)
|
|
|
|
operationName := nethttp.OperationNameFunc(func(r *http.Request) string {
|
|
return fmt.Sprintf("%s %s", r.Method, r.URL.Path)
|
|
})
|
|
mux := http.NewServeMux()
|
|
mux.Handle("/", h.router)
|
|
|
|
apiPath := "/api"
|
|
if h.options.RoutePrefix != "/" {
|
|
apiPath = h.options.RoutePrefix + apiPath
|
|
level.Info(h.logger).Log("msg", "Router prefix", "prefix", h.options.RoutePrefix)
|
|
}
|
|
av1 := route.New().
|
|
WithInstrumentation(h.metrics.instrumentHandlerWithPrefix("/api/v1")).
|
|
WithInstrumentation(setPathWithPrefix(apiPath + "/v1"))
|
|
h.apiV1.Register(av1)
|
|
|
|
mux.Handle(apiPath+"/v1/", http.StripPrefix(apiPath+"/v1", av1))
|
|
|
|
mux.Handle(apiPath+"/", http.StripPrefix(apiPath,
|
|
http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
httputil.SetCORS(w, h.options.CORSOrigin, r)
|
|
hhFunc(w, r)
|
|
}),
|
|
))
|
|
|
|
errlog := stdlog.New(log.NewStdlibAdapter(level.Error(h.logger)), "", 0)
|
|
|
|
httpSrv := &http.Server{
|
|
Handler: withStackTracer(nethttp.Middleware(opentracing.GlobalTracer(), mux, operationName), h.logger),
|
|
ErrorLog: errlog,
|
|
ReadTimeout: h.options.ReadTimeout,
|
|
}
|
|
|
|
errCh := make(chan error)
|
|
go func() {
|
|
errCh <- httpSrv.Serve(httpl)
|
|
}()
|
|
go func() {
|
|
errCh <- grpcSrv.Serve(grpcl)
|
|
}()
|
|
go func() {
|
|
errCh <- m.Serve()
|
|
}()
|
|
|
|
select {
|
|
case e := <-errCh:
|
|
return e
|
|
case <-ctx.Done():
|
|
httpSrv.Shutdown(ctx)
|
|
stopGRPCSrv(grpcSrv)
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// stopGRPCSrv stops a given GRPC server. An attempt to stop the server
|
|
// gracefully is made first. After 15s, the server to forced to stop.
|
|
func stopGRPCSrv(srv *grpc.Server) {
|
|
stop := make(chan struct{})
|
|
go func() {
|
|
srv.GracefulStop()
|
|
close(stop)
|
|
}()
|
|
|
|
select {
|
|
case <-time.After(15 * time.Second):
|
|
srv.Stop()
|
|
case <-stop:
|
|
}
|
|
}
|
|
|
|
func (h *Handler) alerts(w http.ResponseWriter, r *http.Request) {
|
|
|
|
var groups []*rules.Group
|
|
for _, group := range h.ruleManager.RuleGroups() {
|
|
if group.HasAlertingRules() {
|
|
groups = append(groups, group)
|
|
}
|
|
}
|
|
|
|
alertStatus := AlertStatus{
|
|
Groups: groups,
|
|
AlertStateToRowClass: map[rules.AlertState]string{
|
|
rules.StateInactive: "success",
|
|
rules.StatePending: "warning",
|
|
rules.StateFiring: "danger",
|
|
},
|
|
Counts: alertCounts(groups),
|
|
}
|
|
h.executeTemplate(w, "alerts.html", alertStatus)
|
|
}
|
|
|
|
func alertCounts(groups []*rules.Group) AlertByStateCount {
|
|
result := AlertByStateCount{}
|
|
|
|
for _, group := range groups {
|
|
for _, alert := range group.AlertingRules() {
|
|
switch alert.State() {
|
|
case rules.StateInactive:
|
|
result.Inactive++
|
|
case rules.StatePending:
|
|
result.Pending++
|
|
case rules.StateFiring:
|
|
result.Firing++
|
|
}
|
|
}
|
|
}
|
|
return result
|
|
}
|
|
|
|
func (h *Handler) consoles(w http.ResponseWriter, r *http.Request) {
|
|
ctx := r.Context()
|
|
name := route.Param(ctx, "filepath")
|
|
|
|
file, err := http.Dir(h.options.ConsoleTemplatesPath).Open(name)
|
|
if err != nil {
|
|
http.Error(w, err.Error(), http.StatusNotFound)
|
|
return
|
|
}
|
|
defer file.Close()
|
|
text, err := ioutil.ReadAll(file)
|
|
if err != nil {
|
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
|
return
|
|
}
|
|
|
|
ctx = httputil.ContextFromRequest(ctx, r)
|
|
|
|
// Provide URL parameters as a map for easy use. Advanced users may have need for
|
|
// parameters beyond the first, so provide RawParams.
|
|
rawParams, err := url.ParseQuery(r.URL.RawQuery)
|
|
if err != nil {
|
|
http.Error(w, err.Error(), http.StatusBadRequest)
|
|
return
|
|
}
|
|
params := map[string]string{}
|
|
for k, v := range rawParams {
|
|
params[k] = v[0]
|
|
}
|
|
|
|
externalLabels := map[string]string{}
|
|
h.mtx.RLock()
|
|
els := h.config.GlobalConfig.ExternalLabels
|
|
h.mtx.RUnlock()
|
|
for _, el := range els {
|
|
externalLabels[el.Name] = el.Value
|
|
}
|
|
|
|
// Inject some convenience variables that are easier to remember for users
|
|
// who are not used to Go's templating system.
|
|
defs := []string{
|
|
"{{$rawParams := .RawParams }}",
|
|
"{{$params := .Params}}",
|
|
"{{$path := .Path}}",
|
|
"{{$externalLabels := .ExternalLabels}}",
|
|
}
|
|
|
|
data := struct {
|
|
RawParams url.Values
|
|
Params map[string]string
|
|
Path string
|
|
ExternalLabels map[string]string
|
|
}{
|
|
RawParams: rawParams,
|
|
Params: params,
|
|
Path: strings.TrimLeft(name, "/"),
|
|
ExternalLabels: externalLabels,
|
|
}
|
|
|
|
tmpl := template.NewTemplateExpander(
|
|
ctx,
|
|
strings.Join(append(defs, string(text)), ""),
|
|
"__console_"+name,
|
|
data,
|
|
h.now(),
|
|
template.QueryFunc(rules.EngineQueryFunc(h.queryEngine, h.storage)),
|
|
h.options.ExternalURL,
|
|
)
|
|
filenames, err := filepath.Glob(h.options.ConsoleLibrariesPath + "/*.lib")
|
|
if err != nil {
|
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
|
return
|
|
}
|
|
result, err := tmpl.ExpandHTML(filenames)
|
|
if err != nil {
|
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
|
return
|
|
}
|
|
io.WriteString(w, result)
|
|
}
|
|
|
|
func (h *Handler) graph(w http.ResponseWriter, r *http.Request) {
|
|
h.executeTemplate(w, "graph.html", nil)
|
|
}
|
|
|
|
func (h *Handler) status(w http.ResponseWriter, r *http.Request) {
|
|
status := struct {
|
|
Birth time.Time
|
|
CWD string
|
|
Version *PrometheusVersion
|
|
Alertmanagers []*url.URL
|
|
GoroutineCount int
|
|
GOMAXPROCS int
|
|
GOGC string
|
|
GODEBUG string
|
|
CorruptionCount int64
|
|
ChunkCount int64
|
|
TimeSeriesCount int64
|
|
LastConfigTime time.Time
|
|
ReloadConfigSuccess bool
|
|
StorageRetention string
|
|
NumSeries uint64
|
|
MaxTime int64
|
|
MinTime int64
|
|
Stats *index.PostingsStats
|
|
Duration string
|
|
}{
|
|
Birth: h.birth,
|
|
CWD: h.cwd,
|
|
Version: h.versionInfo,
|
|
Alertmanagers: h.notifier.Alertmanagers(),
|
|
GoroutineCount: runtime.NumGoroutine(),
|
|
GOMAXPROCS: runtime.GOMAXPROCS(0),
|
|
GOGC: os.Getenv("GOGC"),
|
|
GODEBUG: os.Getenv("GODEBUG"),
|
|
}
|
|
|
|
if h.options.TSDBRetentionDuration != 0 {
|
|
status.StorageRetention = h.options.TSDBRetentionDuration.String()
|
|
}
|
|
if h.options.TSDBMaxBytes != 0 {
|
|
if status.StorageRetention != "" {
|
|
status.StorageRetention = status.StorageRetention + " or "
|
|
}
|
|
status.StorageRetention = status.StorageRetention + h.options.TSDBMaxBytes.String()
|
|
}
|
|
|
|
metrics, err := h.gatherer.Gather()
|
|
if err != nil {
|
|
http.Error(w, fmt.Sprintf("error gathering runtime status: %s", err), http.StatusInternalServerError)
|
|
return
|
|
}
|
|
for _, mF := range metrics {
|
|
switch *mF.Name {
|
|
case "prometheus_tsdb_head_chunks":
|
|
status.ChunkCount = int64(toFloat64(mF))
|
|
case "prometheus_tsdb_head_series":
|
|
status.TimeSeriesCount = int64(toFloat64(mF))
|
|
case "prometheus_tsdb_wal_corruptions_total":
|
|
status.CorruptionCount = int64(toFloat64(mF))
|
|
case "prometheus_config_last_reload_successful":
|
|
status.ReloadConfigSuccess = toFloat64(mF) != 0
|
|
case "prometheus_config_last_reload_success_timestamp_seconds":
|
|
status.LastConfigTime = time.Unix(int64(toFloat64(mF)), 0).UTC()
|
|
}
|
|
}
|
|
|
|
startTime := time.Now().UnixNano()
|
|
s, err := h.localStorage.Stats("__name__")
|
|
if err != nil {
|
|
if errors.Cause(err) == tsdb.ErrNotReady {
|
|
http.Error(w, tsdb.ErrNotReady.Error(), http.StatusServiceUnavailable)
|
|
return
|
|
}
|
|
http.Error(w, fmt.Sprintf("error gathering local storage statistics: %s", err), http.StatusInternalServerError)
|
|
return
|
|
}
|
|
status.Duration = fmt.Sprintf("%.3f", float64(time.Now().UnixNano()-startTime)/float64(1e9))
|
|
status.Stats = s.IndexPostingStats
|
|
status.NumSeries = s.NumSeries
|
|
status.MaxTime = s.MaxTime
|
|
status.MinTime = s.MinTime
|
|
|
|
h.executeTemplate(w, "status.html", status)
|
|
}
|
|
|
|
func (h *Handler) runtimeInfo() (api_v1.RuntimeInfo, error) {
|
|
status := api_v1.RuntimeInfo{
|
|
StartTime: h.birth,
|
|
CWD: h.cwd,
|
|
GoroutineCount: runtime.NumGoroutine(),
|
|
GOMAXPROCS: runtime.GOMAXPROCS(0),
|
|
GOGC: os.Getenv("GOGC"),
|
|
GODEBUG: os.Getenv("GODEBUG"),
|
|
}
|
|
|
|
if h.options.TSDBRetentionDuration != 0 {
|
|
status.StorageRetention = h.options.TSDBRetentionDuration.String()
|
|
}
|
|
if h.options.TSDBMaxBytes != 0 {
|
|
if status.StorageRetention != "" {
|
|
status.StorageRetention = status.StorageRetention + " or "
|
|
}
|
|
status.StorageRetention = status.StorageRetention + h.options.TSDBMaxBytes.String()
|
|
}
|
|
|
|
metrics, err := h.gatherer.Gather()
|
|
if err != nil {
|
|
return status, errors.Errorf("error gathering runtime status: %s", err)
|
|
}
|
|
for _, mF := range metrics {
|
|
switch *mF.Name {
|
|
case "prometheus_tsdb_head_chunks":
|
|
status.ChunkCount = int64(toFloat64(mF))
|
|
case "prometheus_tsdb_head_series":
|
|
status.TimeSeriesCount = int64(toFloat64(mF))
|
|
case "prometheus_tsdb_wal_corruptions_total":
|
|
status.CorruptionCount = int64(toFloat64(mF))
|
|
case "prometheus_config_last_reload_successful":
|
|
status.ReloadConfigSuccess = toFloat64(mF) != 0
|
|
case "prometheus_config_last_reload_success_timestamp_seconds":
|
|
status.LastConfigTime = time.Unix(int64(toFloat64(mF)), 0).UTC()
|
|
}
|
|
}
|
|
return status, nil
|
|
}
|
|
|
|
func toFloat64(f *io_prometheus_client.MetricFamily) float64 {
|
|
m := *f.Metric[0]
|
|
if m.Gauge != nil {
|
|
return m.Gauge.GetValue()
|
|
}
|
|
if m.Counter != nil {
|
|
return m.Counter.GetValue()
|
|
}
|
|
if m.Untyped != nil {
|
|
return m.Untyped.GetValue()
|
|
}
|
|
return math.NaN()
|
|
}
|
|
|
|
func (h *Handler) flags(w http.ResponseWriter, r *http.Request) {
|
|
h.executeTemplate(w, "flags.html", h.flagsMap)
|
|
}
|
|
|
|
func (h *Handler) serveConfig(w http.ResponseWriter, r *http.Request) {
|
|
h.mtx.RLock()
|
|
defer h.mtx.RUnlock()
|
|
|
|
h.executeTemplate(w, "config.html", h.config.String())
|
|
}
|
|
|
|
func (h *Handler) rules(w http.ResponseWriter, r *http.Request) {
|
|
h.executeTemplate(w, "rules.html", h.ruleManager)
|
|
}
|
|
|
|
func (h *Handler) serviceDiscovery(w http.ResponseWriter, r *http.Request) {
|
|
var index []string
|
|
targets := h.scrapeManager.TargetsAll()
|
|
for job := range targets {
|
|
index = append(index, job)
|
|
}
|
|
sort.Strings(index)
|
|
scrapeConfigData := struct {
|
|
Index []string
|
|
Targets map[string][]*scrape.Target
|
|
Active []int
|
|
Dropped []int
|
|
Total []int
|
|
}{
|
|
Index: index,
|
|
Targets: make(map[string][]*scrape.Target),
|
|
Active: make([]int, len(index)),
|
|
Dropped: make([]int, len(index)),
|
|
Total: make([]int, len(index)),
|
|
}
|
|
for i, job := range scrapeConfigData.Index {
|
|
scrapeConfigData.Targets[job] = make([]*scrape.Target, 0, len(targets[job]))
|
|
scrapeConfigData.Total[i] = len(targets[job])
|
|
for _, target := range targets[job] {
|
|
// Do not display more than 100 dropped targets per job to avoid
|
|
// returning too much data to the clients.
|
|
if target.Labels().Len() == 0 {
|
|
scrapeConfigData.Dropped[i]++
|
|
if scrapeConfigData.Dropped[i] > 100 {
|
|
continue
|
|
}
|
|
} else {
|
|
scrapeConfigData.Active[i]++
|
|
}
|
|
scrapeConfigData.Targets[job] = append(scrapeConfigData.Targets[job], target)
|
|
}
|
|
}
|
|
|
|
h.executeTemplate(w, "service-discovery.html", scrapeConfigData)
|
|
}
|
|
|
|
func (h *Handler) targets(w http.ResponseWriter, r *http.Request) {
|
|
tps := h.scrapeManager.TargetsActive()
|
|
for _, targets := range tps {
|
|
sort.Slice(targets, func(i, j int) bool {
|
|
iJobLabel := targets[i].Labels().Get(model.JobLabel)
|
|
jJobLabel := targets[j].Labels().Get(model.JobLabel)
|
|
if iJobLabel == jJobLabel {
|
|
return targets[i].Labels().Get(model.InstanceLabel) < targets[j].Labels().Get(model.InstanceLabel)
|
|
}
|
|
return iJobLabel < jJobLabel
|
|
})
|
|
}
|
|
|
|
h.executeTemplate(w, "targets.html", struct {
|
|
TargetPools map[string][]*scrape.Target
|
|
}{
|
|
TargetPools: tps,
|
|
})
|
|
}
|
|
|
|
func (h *Handler) version(w http.ResponseWriter, r *http.Request) {
|
|
dec := json.NewEncoder(w)
|
|
if err := dec.Encode(h.versionInfo); err != nil {
|
|
http.Error(w, fmt.Sprintf("error encoding JSON: %s", err), http.StatusInternalServerError)
|
|
}
|
|
}
|
|
|
|
func (h *Handler) quit(w http.ResponseWriter, r *http.Request) {
|
|
fmt.Fprintf(w, "Requesting termination... Goodbye!")
|
|
close(h.quitCh)
|
|
}
|
|
|
|
func (h *Handler) reload(w http.ResponseWriter, r *http.Request) {
|
|
rc := make(chan error)
|
|
h.reloadCh <- rc
|
|
if err := <-rc; err != nil {
|
|
http.Error(w, fmt.Sprintf("failed to reload config: %s", err), http.StatusInternalServerError)
|
|
}
|
|
}
|
|
|
|
func (h *Handler) consolesPath() string {
|
|
if _, err := os.Stat(h.options.ConsoleTemplatesPath + "/index.html"); !os.IsNotExist(err) {
|
|
return h.options.ExternalURL.Path + "/consoles/index.html"
|
|
}
|
|
if h.options.UserAssetsPath != "" {
|
|
if _, err := os.Stat(h.options.UserAssetsPath + "/index.html"); !os.IsNotExist(err) {
|
|
return h.options.ExternalURL.Path + "/user/index.html"
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func tmplFuncs(consolesPath string, opts *Options) template_text.FuncMap {
|
|
return template_text.FuncMap{
|
|
"since": func(t time.Time) time.Duration {
|
|
return time.Since(t) / time.Millisecond * time.Millisecond
|
|
},
|
|
"unixToTime": func(i int64) time.Time {
|
|
t := time.Unix(i/int64(time.Microsecond), 0).UTC()
|
|
return t
|
|
},
|
|
"consolesPath": func() string { return consolesPath },
|
|
"pathPrefix": func() string { return opts.ExternalURL.Path },
|
|
"pageTitle": func() string { return opts.PageTitle },
|
|
"buildVersion": func() string { return opts.Version.Revision },
|
|
"globalURL": func(u *url.URL) *url.URL {
|
|
host, port, err := net.SplitHostPort(u.Host)
|
|
if err != nil {
|
|
return u
|
|
}
|
|
for _, lhr := range api_v1.LocalhostRepresentations {
|
|
if host == lhr {
|
|
_, ownPort, err := net.SplitHostPort(opts.ListenAddress)
|
|
if err != nil {
|
|
return u
|
|
}
|
|
|
|
if port == ownPort {
|
|
// Only in the case where the target is on localhost and its port is
|
|
// the same as the one we're listening on, we know for sure that
|
|
// we're monitoring our own process and that we need to change the
|
|
// scheme, hostname, and port to the externally reachable ones as
|
|
// well. We shouldn't need to touch the path at all, since if a
|
|
// path prefix is defined, the path under which we scrape ourselves
|
|
// should already contain the prefix.
|
|
u.Scheme = opts.ExternalURL.Scheme
|
|
u.Host = opts.ExternalURL.Host
|
|
} else {
|
|
// Otherwise, we only know that localhost is not reachable
|
|
// externally, so we replace only the hostname by the one in the
|
|
// external URL. It could be the wrong hostname for the service on
|
|
// this port, but it's still the best possible guess.
|
|
host, _, err := net.SplitHostPort(opts.ExternalURL.Host)
|
|
if err != nil {
|
|
return u
|
|
}
|
|
u.Host = host + ":" + port
|
|
}
|
|
break
|
|
}
|
|
}
|
|
return u
|
|
},
|
|
"numHealthy": func(pool []*scrape.Target) int {
|
|
alive := len(pool)
|
|
for _, p := range pool {
|
|
if p.Health() != scrape.HealthGood {
|
|
alive--
|
|
}
|
|
}
|
|
|
|
return alive
|
|
},
|
|
"targetHealthToClass": func(th scrape.TargetHealth) string {
|
|
switch th {
|
|
case scrape.HealthUnknown:
|
|
return "warning"
|
|
case scrape.HealthGood:
|
|
return "success"
|
|
default:
|
|
return "danger"
|
|
}
|
|
},
|
|
"ruleHealthToClass": func(rh rules.RuleHealth) string {
|
|
switch rh {
|
|
case rules.HealthUnknown:
|
|
return "warning"
|
|
case rules.HealthGood:
|
|
return "success"
|
|
default:
|
|
return "danger"
|
|
}
|
|
},
|
|
"alertStateToClass": func(as rules.AlertState) string {
|
|
switch as {
|
|
case rules.StateInactive:
|
|
return "success"
|
|
case rules.StatePending:
|
|
return "warning"
|
|
case rules.StateFiring:
|
|
return "danger"
|
|
default:
|
|
panic("unknown alert state")
|
|
}
|
|
},
|
|
}
|
|
}
|
|
|
|
func (h *Handler) getTemplate(name string) (string, error) {
|
|
var tmpl string
|
|
|
|
appendf := func(name string) error {
|
|
f, err := ui.Assets.Open(path.Join("/templates", name))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer f.Close()
|
|
b, err := ioutil.ReadAll(f)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
tmpl += string(b)
|
|
return nil
|
|
}
|
|
|
|
err := appendf("_base.html")
|
|
if err != nil {
|
|
return "", errors.Wrap(err, "error reading base template")
|
|
}
|
|
err = appendf(name)
|
|
if err != nil {
|
|
return "", errors.Wrapf(err, "error reading page template %s", name)
|
|
}
|
|
|
|
return tmpl, nil
|
|
}
|
|
|
|
func (h *Handler) executeTemplate(w http.ResponseWriter, name string, data interface{}) {
|
|
text, err := h.getTemplate(name)
|
|
if err != nil {
|
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
|
}
|
|
|
|
tmpl := template.NewTemplateExpander(
|
|
h.context,
|
|
text,
|
|
name,
|
|
data,
|
|
h.now(),
|
|
template.QueryFunc(rules.EngineQueryFunc(h.queryEngine, h.storage)),
|
|
h.options.ExternalURL,
|
|
)
|
|
tmpl.Funcs(tmplFuncs(h.consolesPath(), h.options))
|
|
|
|
result, err := tmpl.ExpandHTML(nil)
|
|
if err != nil {
|
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
|
return
|
|
}
|
|
io.WriteString(w, result)
|
|
}
|
|
|
|
// AlertStatus bundles alerting rules and the mapping of alert states to row classes.
|
|
type AlertStatus struct {
|
|
Groups []*rules.Group
|
|
AlertStateToRowClass map[rules.AlertState]string
|
|
Counts AlertByStateCount
|
|
}
|
|
|
|
type AlertByStateCount struct {
|
|
Inactive int32
|
|
Pending int32
|
|
Firing int32
|
|
}
|
|
|
|
func setPathWithPrefix(prefix string) func(handlerName string, handler http.HandlerFunc) http.HandlerFunc {
|
|
return func(handlerName string, handler http.HandlerFunc) http.HandlerFunc {
|
|
return func(w http.ResponseWriter, r *http.Request) {
|
|
handler(w, r.WithContext(httputil.ContextWithPath(r.Context(), prefix+r.URL.Path)))
|
|
}
|
|
}
|
|
}
|