Limit concurrent remote reads. (#4656)

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>
This commit is contained in:
Tom Wilkie 2018-09-25 20:07:34 +01:00 committed by GitHub
parent d3a1ff1abf
commit 4c52400708
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 108 additions and 74 deletions

View file

@ -169,7 +169,10 @@ func main() {
Default("1m").PlaceHolder("<duration>").SetValue(&cfg.RemoteFlushDeadline)
a.Flag("storage.remote.read-sample-limit", "Maximum overall number of samples to return via the remote read interface, in a single query. 0 means no limit.").
Default("5e7").IntVar(&cfg.web.RemoteReadLimit)
Default("5e7").IntVar(&cfg.web.RemoteReadSampleLimit)
a.Flag("storage.remote.read-concurrent-limit", "Maximum number of concurrent remote read calls. 0 means no limit.").
Default("10").IntVar(&cfg.web.RemoteReadConcurrencyLimit)
a.Flag("rules.alert.for-outage-tolerance", "Max time to tolerate prometheus outage for restoring 'for' state of alert.").
Default("1h").SetValue(&cfg.outageTolerance)

48
pkg/gate/gate.go Normal file
View file

@ -0,0 +1,48 @@
// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package gate
import "context"
// A Gate controls the maximum number of concurrently running and waiting queries.
type Gate struct {
ch chan struct{}
}
// NewGate returns a query gate that limits the number of queries
// being concurrently executed.
func New(length int) *Gate {
return &Gate{
ch: make(chan struct{}, length),
}
}
// Start blocks until the gate has a free spot or the context is done.
func (g *Gate) Start(ctx context.Context) error {
select {
case <-ctx.Done():
return ctx.Err()
case g.ch <- struct{}{}:
return nil
}
}
// Done releases a single spot in the gate.
func (g *Gate) Done() {
select {
case <-g.ch:
default:
panic("gate.Done: more operations done than started")
}
}

View file

@ -30,6 +30,7 @@ import (
opentracing "github.com/opentracing/opentracing-go"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/pkg/gate"
"github.com/prometheus/prometheus/pkg/labels"
"github.com/prometheus/prometheus/pkg/timestamp"
"github.com/prometheus/prometheus/pkg/value"
@ -148,7 +149,13 @@ func (q *query) Exec(ctx context.Context) *Result {
func contextDone(ctx context.Context, env string) error {
select {
case <-ctx.Done():
err := ctx.Err()
return contextErr(ctx.Err(), env)
default:
return nil
}
}
func contextErr(err error, env string) error {
switch err {
case context.Canceled:
return ErrQueryCanceled(env)
@ -157,9 +164,6 @@ func contextDone(ctx context.Context, env string) error {
default:
return err
}
default:
return nil
}
}
// Engine handles the lifetime of queries from beginning to end.
@ -168,7 +172,7 @@ type Engine struct {
logger log.Logger
metrics *engineMetrics
timeout time.Duration
gate *queryGate
gate *gate.Gate
}
// NewEngine returns a new engine.
@ -232,7 +236,7 @@ func NewEngine(logger log.Logger, reg prometheus.Registerer, maxConcurrent int,
)
}
return &Engine{
gate: newQueryGate(maxConcurrent),
gate: gate.New(maxConcurrent),
timeout: timeout,
logger: logger,
metrics: metrics,
@ -317,7 +321,7 @@ func (ng *Engine) exec(ctx context.Context, q *query) (Value, error) {
queueSpanTimer, _ := q.stats.GetSpanTimer(ctx, stats.ExecQueueTime, ng.metrics.queryQueueTime)
if err := ng.gate.Start(ctx); err != nil {
return nil, err
return nil, contextErr(err, "query queue")
}
defer ng.gate.Done()
@ -1714,38 +1718,6 @@ func shouldDropMetricName(op ItemType) bool {
// series is considered stale.
var LookbackDelta = 5 * time.Minute
// A queryGate controls the maximum number of concurrently running and waiting queries.
type queryGate struct {
ch chan struct{}
}
// newQueryGate returns a query gate that limits the number of queries
// being concurrently executed.
func newQueryGate(length int) *queryGate {
return &queryGate{
ch: make(chan struct{}, length),
}
}
// Start blocks until the gate has a free spot or the context is done.
func (g *queryGate) Start(ctx context.Context) error {
select {
case <-ctx.Done():
return contextDone(ctx, "query queue")
case g.ch <- struct{}{}:
return nil
}
}
// Done releases a single spot in the gate.
func (g *queryGate) Done() {
select {
case <-g.ch:
default:
panic("engine.queryGate.Done: more operations done than started")
}
}
// documentedType returns the internal type to the equivalent
// user facing terminology as defined in the documentation.
func documentedType(t ValueType) string {

View file

@ -36,6 +36,7 @@ import (
"github.com/prometheus/tsdb"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/pkg/gate"
"github.com/prometheus/prometheus/pkg/labels"
"github.com/prometheus/prometheus/pkg/textparse"
"github.com/prometheus/prometheus/pkg/timestamp"
@ -134,7 +135,8 @@ type API struct {
db func() *tsdb.DB
enableAdmin bool
logger log.Logger
remoteReadLimit int
remoteReadSampleLimit int
remoteReadGate *gate.Gate
}
// NewAPI returns an initialized API type.
@ -150,7 +152,8 @@ func NewAPI(
enableAdmin bool,
logger log.Logger,
rr rulesRetriever,
remoteReadLimit int,
remoteReadSampleLimit int,
remoteReadConcurrencyLimit int,
) *API {
return &API{
QueryEngine: qe,
@ -165,7 +168,8 @@ func NewAPI(
db: db,
enableAdmin: enableAdmin,
rulesRetriever: rr,
remoteReadLimit: remoteReadLimit,
remoteReadSampleLimit: remoteReadSampleLimit,
remoteReadGate: gate.New(remoteReadConcurrencyLimit),
logger: logger,
}
}
@ -751,6 +755,9 @@ func (api *API) serveFlags(r *http.Request) (interface{}, *apiError, func()) {
}
func (api *API) remoteRead(w http.ResponseWriter, r *http.Request) {
api.remoteReadGate.Start(r.Context())
defer api.remoteReadGate.Done()
req, err := remote.DecodeReadRequest(r)
if err != nil {
http.Error(w, err.Error(), http.StatusBadRequest)
@ -798,7 +805,7 @@ func (api *API) remoteRead(w http.ResponseWriter, r *http.Request) {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
resp.Results[i], err = remote.ToQueryResult(set, api.remoteReadLimit)
resp.Results[i], err = remote.ToQueryResult(set, api.remoteReadSampleLimit)
if err != nil {
if httpErr, ok := err.(remote.HTTPError); ok {
http.Error(w, httpErr.Error(), httpErr.Status())

View file

@ -39,6 +39,7 @@ import (
"github.com/prometheus/common/route"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/pkg/gate"
"github.com/prometheus/prometheus/pkg/labels"
"github.com/prometheus/prometheus/pkg/timestamp"
"github.com/prometheus/prometheus/prompb"
@ -834,7 +835,8 @@ func TestReadEndpoint(t *testing.T) {
},
}
},
remoteReadLimit: 1e6,
remoteReadSampleLimit: 1e6,
remoteReadGate: gate.New(1),
}
// Encode the request.

View file

@ -167,7 +167,8 @@ type Options struct {
ConsoleLibrariesPath string
EnableLifecycle bool
EnableAdminAPI bool
RemoteReadLimit int
RemoteReadSampleLimit int
RemoteReadConcurrencyLimit int
}
func instrumentHandler(handlerName string, handler http.HandlerFunc) http.HandlerFunc {
@ -228,7 +229,8 @@ func New(logger log.Logger, o *Options) *Handler {
h.options.EnableAdminAPI,
logger,
h.ruleManager,
h.options.RemoteReadLimit,
h.options.RemoteReadSampleLimit,
h.options.RemoteReadConcurrencyLimit,
)
if o.RoutePrefix != "/" {