Merge pull request #1354 from prometheus/beorn7/storage
Rework the way to communicate backpressure (AKA suspended ingestion)
Commit: 9ea3897ea7
@@ -107,7 +107,7 @@ func init() {
 	)
 	cfg.fs.IntVar(
 		&cfg.storage.MemoryChunks, "storage.local.memory-chunks", 1024*1024,
-		"How many chunks to keep in memory. While the size of a chunk is 1kiB, the total memory usage will be significantly higher than this value * 1kiB. Furthermore, for various reasons, more chunks might have to be kept in memory temporarily.",
+		"How many chunks to keep in memory. While the size of a chunk is 1kiB, the total memory usage will be significantly higher than this value * 1kiB. Furthermore, for various reasons, more chunks might have to be kept in memory temporarily. Sample ingestion will be throttled if the configured value is exceeded by more than 10%.",
 	)
 	cfg.fs.DurationVar(
 		&cfg.storage.PersistenceRetentionPeriod, "storage.local.retention", 15*24*time.Hour,
@@ -115,7 +115,7 @@ func init() {
 	)
 	cfg.fs.IntVar(
 		&cfg.storage.MaxChunksToPersist, "storage.local.max-chunks-to-persist", 512*1024,
-		"How many chunks can be waiting for persistence before sample ingestion will stop. Many chunks waiting to be persisted will increase the checkpoint size.",
+		"How many chunks can be waiting for persistence before sample ingestion will be throttled. Many chunks waiting to be persisted will increase the checkpoint size.",
 	)
 	cfg.fs.DurationVar(
 		&cfg.storage.CheckpointInterval, "storage.local.checkpoint-interval", 5*time.Minute,
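The new help text ties -storage.local.memory-chunks to throttling: ingestion is throttled once the configured value is exceeded by more than 10%, which matches the toleranceFactorMemChunks of 1.1 introduced further down in this change. A minimal sketch of the resulting thresholds, using the default flag values shown above (the variable names are illustrative, not the real config fields):

package main

import "fmt"

func main() {
	// Defaults from the flags above; the real values live in cfg.storage.
	memoryChunks := 1024 * 1024      // -storage.local.memory-chunks
	maxChunksToPersist := 512 * 1024 // -storage.local.max-chunks-to-persist
	toleranceFactor := 1.1           // matches toleranceFactorMemChunks below

	// Throttling starts once either limit is breached.
	memThreshold := int(float64(memoryChunks) * toleranceFactor)
	fmt.Printf("throttle when memory chunks > %d or chunks waiting for persistence > %d\n",
		memThreshold, maxChunksToPersist)
}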
@@ -14,8 +14,6 @@
 package retrieval

 import (
-	"time"
-
 	"github.com/prometheus/common/model"

 	"github.com/prometheus/prometheus/config"
@@ -26,14 +24,13 @@ type nopAppender struct{}
 func (a nopAppender) Append(*model.Sample) {
 }

-type slowAppender struct{}
-
-func (a slowAppender) Append(*model.Sample) {
-	time.Sleep(time.Millisecond)
+func (a nopAppender) NeedsThrottling() bool {
+	return false
 }

 type collectResultAppender struct {
 	result model.Samples
+	throttled bool
 }

 func (a *collectResultAppender) Append(s *model.Sample) {
@@ -45,6 +42,10 @@ func (a *collectResultAppender) Append(s *model.Sample) {
 	a.result = append(a.result, s)
 }

+func (a *collectResultAppender) NeedsThrottling() bool {
+	return a.throttled
+}
+
 // fakeTargetProvider implements a TargetProvider and allows manual injection
 // of TargetGroups through the update channel.
 type fakeTargetProvider struct {
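With NeedsThrottling now part of the appender contract, every test double in this package has to provide it; collectResultAppender gains a throttled flag so tests can force the skip path. A small illustrative test sketch (the test name is made up, the types are the helpers from this diff):

package retrieval

import "testing"

// Illustrative only: checks that the throttled flag is surfaced through
// NeedsThrottling, which is what RunScraper consults before each scrape.
func TestCollectResultAppenderThrottling(t *testing.T) {
	app := &collectResultAppender{throttled: true}
	if !app.NeedsThrottling() {
		t.Fatal("expected NeedsThrottling to return true")
	}
	app.throttled = false
	if app.NeedsThrottling() {
		t.Fatal("expected NeedsThrottling to return false")
	}
}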
@@ -48,7 +48,7 @@ const (
 )

 var (
-	errIngestChannelFull = errors.New("ingestion channel full")
+	errSkippedScrape = errors.New("scrape skipped due to throttled ingestion")

 	targetIntervalLength = prometheus.NewSummaryVec(
 		prometheus.SummaryOpts{
@@ -59,10 +59,19 @@ var (
 		},
 		[]string{interval},
 	)
+	targetSkippedScrapes = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: namespace,
+			Name:      "target_skipped_scrapes_total",
+			Help:      "Total number of scrapes that were skipped because the metric storage was throttled.",
+		},
+		[]string{interval},
+	)
 )

 func init() {
 	prometheus.MustRegister(targetIntervalLength)
+	prometheus.MustRegister(targetSkippedScrapes)
 }

 // TargetHealth describes the health state of a target.
@@ -151,8 +160,6 @@ type Target struct {
 	scraperStopping chan struct{}
 	// Closing scraperStopped signals that scraping has been stopped.
 	scraperStopped chan struct{}
-	// Channel to buffer ingested samples.
-	ingestedSamples chan model.Vector

 	// Mutex protects the members below.
 	sync.RWMutex
@@ -166,8 +173,6 @@ type Target struct {
 	baseLabels model.LabelSet
 	// Internal labels, such as scheme.
 	internalLabels model.LabelSet
-	// What is the deadline for the HTTP or HTTPS against this endpoint.
-	deadline time.Duration
 	// The time between two scrapes.
 	scrapeInterval time.Duration
 	// Whether the target's labels have precedence over the base labels
@@ -237,7 +242,6 @@ func (t *Target) Update(cfg *config.ScrapeConfig, baseLabels, metaLabels model.L
 	t.url.RawQuery = params.Encode()

 	t.scrapeInterval = time.Duration(cfg.ScrapeInterval)
-	t.deadline = time.Duration(cfg.ScrapeTimeout)

 	t.honorLabels = cfg.HonorLabels
 	t.metaLabels = metaLabels
@@ -361,6 +365,11 @@ func (t *Target) RunScraper(sampleAppender storage.SampleAppender) {
 			targetIntervalLength.WithLabelValues(intervalStr).Observe(
 				float64(took) / float64(time.Second), // Sub-second precision.
 			)
+			if sampleAppender.NeedsThrottling() {
+				targetSkippedScrapes.WithLabelValues(intervalStr).Inc()
+				t.status.setLastError(errSkippedScrape)
+				continue
+			}
 			t.scrape(sampleAppender)
 		}
 	}
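The scrape loop now asks the appender once per interval whether the storage wants throttling and, if so, skips the whole scrape, records the skip in targetSkippedScrapes, and surfaces errSkippedScrape as the target's last error. The same check-once-per-batch contract applies to any caller of a SampleAppender; a hedged, self-contained sketch (only the two interface methods come from this change, the rest is illustrative):

package main

import "github.com/prometheus/common/model"

// SampleAppender mirrors the two-method interface introduced in this change.
type SampleAppender interface {
	Append(*model.Sample)
	NeedsThrottling() bool
}

// appendBatch applies the caller-side contract: the batch is either appended
// completely or not at all. It reports whether the batch was appended.
func appendBatch(app SampleAppender, batch model.Vector) bool {
	if app.NeedsThrottling() { // one potentially expensive check per batch
		return false
	}
	for _, s := range batch {
		app.Append(s)
	}
	return true
}

func main() {}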
@@ -377,26 +386,6 @@ func (t *Target) StopScraper() {
 	log.Debugf("Scraper for target %v stopped.", t)
 }

-func (t *Target) ingest(s model.Vector) error {
-	t.RLock()
-	deadline := t.deadline
-	t.RUnlock()
-	// Since the regular case is that ingestedSamples is ready to receive,
-	// first try without setting a timeout so that we don't need to allocate
-	// a timer most of the time.
-	select {
-	case t.ingestedSamples <- s:
-		return nil
-	default:
-		select {
-		case t.ingestedSamples <- s:
-			return nil
-		case <-time.After(deadline / 10):
-			return errIngestChannelFull
-		}
-	}
-}
-
 const acceptHeader = `application/vnd.google.protobuf;proto=io.prometheus.client.MetricFamily;encoding=delimited;q=0.7,text/plain;version=0.0.4;q=0.3,application/json;schema="prometheus/telemetry";version=0.0.2;q=0.2,*/*;q=0.1`

 func (t *Target) scrape(appender storage.SampleAppender) (err error) {
@@ -414,20 +403,20 @@ func (t *Target) scrape(appender storage.SampleAppender) (err error) {
 	// so the relabeling rules are applied to the correct label set.
 	if len(t.metricRelabelConfigs) > 0 {
 		appender = relabelAppender{
-			app:         appender,
+			SampleAppender: appender,
 			relabelings: t.metricRelabelConfigs,
 		}
 	}

 	if t.honorLabels {
 		appender = honorLabelsAppender{
-			app:    appender,
+			SampleAppender: appender,
 			labels: baseLabels,
 		}
 	} else {
 		appender = ruleLabelsAppender{
-			app:    appender,
+			SampleAppender: appender,
 			labels: baseLabels,
 		}
 	}
@@ -460,27 +449,11 @@ func (t *Target) scrape(appender storage.SampleAppender) (err error) {
 		},
 	}

-	t.ingestedSamples = make(chan model.Vector, ingestedSamplesCap)
-
-	go func() {
-		for {
-			// TODO(fabxc): Change the SampleAppender interface to return an error
-			// so we can proceed based on the status and don't leak goroutines trying
-			// to append a single sample after dropping all the other ones.
-			//
-			// This will also allow use to reuse this vector and save allocations.
-			var samples model.Vector
-			if err = sdec.Decode(&samples); err != nil {
-				break
-			}
-			if err = t.ingest(samples); err != nil {
-				break
-			}
-		}
-		close(t.ingestedSamples)
-	}()
-
-	for samples := range t.ingestedSamples {
+	var samples model.Vector
+	for {
+		if err = sdec.Decode(&samples); err != nil {
+			break
+		}
 		for _, s := range samples {
 			appender.Append(s)
 		}
@@ -495,7 +468,7 @@ func (t *Target) scrape(appender storage.SampleAppender) (err error) {
 // Merges the ingested sample's metric with the label set. On a collision the
 // value of the ingested label is stored in a label prefixed with 'exported_'.
 type ruleLabelsAppender struct {
-	app    storage.SampleAppender
+	storage.SampleAppender
 	labels model.LabelSet
 }
@@ -507,11 +480,11 @@ func (app ruleLabelsAppender) Append(s *model.Sample) {
 		s.Metric[ln] = lv
 	}

-	app.app.Append(s)
+	app.SampleAppender.Append(s)
 }

 type honorLabelsAppender struct {
-	app    storage.SampleAppender
+	storage.SampleAppender
 	labels model.LabelSet
 }
@@ -525,13 +498,13 @@ func (app honorLabelsAppender) Append(s *model.Sample) {
 		}
 	}

-	app.app.Append(s)
+	app.SampleAppender.Append(s)
 }

 // Applies a set of relabel configurations to the sample's metric
 // before actually appending it.
 type relabelAppender struct {
-	app         storage.SampleAppender
+	storage.SampleAppender
 	relabelings []*config.RelabelConfig
 }
@@ -547,7 +520,7 @@ func (app relabelAppender) Append(s *model.Sample) {
 	}
 	s.Metric = model.Metric(labels)

-	app.app.Append(s)
+	app.SampleAppender.Append(s)
 }

 // URL returns a copy of the target's URL.
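Switching the wrapper appenders from a named app field to an embedded storage.SampleAppender is what lets them satisfy the grown interface without further changes: NeedsThrottling is promoted from the wrapped appender, and only Append is overridden. A stripped-down sketch of the mechanism (wrapperAppender and noopAppender are hypothetical stand-ins):

package main

import (
	"fmt"

	"github.com/prometheus/common/model"
)

type SampleAppender interface {
	Append(*model.Sample)
	NeedsThrottling() bool
}

// wrapperAppender embeds the wrapped appender; NeedsThrottling is promoted
// automatically, mirroring relabelAppender, ruleLabelsAppender and
// honorLabelsAppender in this change.
type wrapperAppender struct {
	SampleAppender
}

func (w wrapperAppender) Append(s *model.Sample) {
	// ...transform s here, then delegate to the wrapped appender...
	w.SampleAppender.Append(s)
}

type noopAppender struct{}

func (noopAppender) Append(*model.Sample)  {}
func (noopAppender) NeedsThrottling() bool { return true }

func main() {
	w := wrapperAppender{SampleAppender: noopAppender{}}
	fmt.Println(w.NeedsThrottling()) // true, promoted from the embedded appender
}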
@@ -139,12 +139,12 @@ func TestTargetScrapeUpdatesState(t *testing.T) {
 	}
 }

-func TestTargetScrapeWithFullChannel(t *testing.T) {
+func TestTargetScrapeWithThrottledStorage(t *testing.T) {
 	server := httptest.NewServer(
 		http.HandlerFunc(
 			func(w http.ResponseWriter, r *http.Request) {
 				w.Header().Set("Content-Type", `text/plain; version=0.0.4`)
-				for i := 0; i < 2*ingestedSamplesCap; i++ {
+				for i := 0; i < 10; i++ {
 					w.Write([]byte(
 						fmt.Sprintf("test_metric_%d{foo=\"bar\"} 123.456\n", i),
 					))
@@ -155,15 +155,21 @@ func TestTargetScrapeWithThrottledStorage(t *testing.T) {
 	defer server.Close()

 	testTarget := newTestTarget(server.URL, time.Second, model.LabelSet{"dings": "bums"})
-	// Affects full channel but not HTTP fetch
-	testTarget.deadline = 0

-	testTarget.scrape(slowAppender{})
+	go testTarget.RunScraper(&collectResultAppender{throttled: true})
+
+	// Enough time for a scrape to happen.
+	time.Sleep(20 * time.Millisecond)
+
+	testTarget.StopScraper()
+	// Wait for it to take effect.
+	time.Sleep(20 * time.Millisecond)

 	if testTarget.status.Health() != HealthBad {
 		t.Errorf("Expected target state %v, actual: %v", HealthBad, testTarget.status.Health())
 	}
-	if testTarget.status.LastError() != errIngestChannelFull {
-		t.Errorf("Expected target error %q, actual: %q", errIngestChannelFull, testTarget.status.LastError())
+	if testTarget.status.LastError() != errSkippedScrape {
+		t.Errorf("Expected target error %q, actual: %q", errSkippedScrape, testTarget.status.LastError())
 	}
 }
@@ -450,7 +456,6 @@ func newTestTarget(targetURL string, deadline time.Duration, baseLabels model.La
 			Host: strings.TrimLeft(targetURL, "http://"),
 			Path: "/metrics",
 		},
-		deadline:       deadline,
 		status:         &TargetStatus{},
 		scrapeInterval: 1 * time.Millisecond,
 		httpClient:     c,
@@ -165,6 +165,7 @@ func (tm *TargetManager) Run() {
 	})

 	tm.running = true
+	log.Info("Target manager started.")
 }

 // handleUpdates receives target group updates and handles them in the
@@ -66,9 +66,19 @@ var (
 	iterationDuration = prometheus.NewSummary(prometheus.SummaryOpts{
 		Namespace:  namespace,
 		Name:       "evaluator_duration_seconds",
-		Help:       "The duration for all evaluations to execute.",
+		Help:       "The duration of rule group evaluations.",
 		Objectives: map[float64]float64{0.01: 0.001, 0.05: 0.005, 0.5: 0.05, 0.90: 0.01, 0.99: 0.001},
 	})
+	iterationsSkipped = prometheus.NewCounter(prometheus.CounterOpts{
+		Namespace: namespace,
+		Name:      "evaluator_iterations_skipped_total",
+		Help:      "The total number of rule group evaluations skipped due to throttled metric storage.",
+	})
+	iterationsScheduled = prometheus.NewCounter(prometheus.CounterOpts{
+		Namespace: namespace,
+		Name:      "evaluator_iterations_total",
+		Help:      "The total number of scheduled rule group evaluations, whether skipped or executed.",
+	})
 )

 func init() {
@@ -78,6 +88,7 @@ func init() {
 	evalFailures.WithLabelValues(string(ruleTypeRecording))

 	prometheus.MustRegister(iterationDuration)
+	prometheus.MustRegister(iterationsSkipped)
 	prometheus.MustRegister(evalFailures)
 	prometheus.MustRegister(evalDuration)
 }
@@ -133,6 +144,11 @@ func (g *Group) run() {
 	}

 	iter := func() {
+		iterationsScheduled.Inc()
+		if g.opts.SampleAppender.NeedsThrottling() {
+			iterationsSkipped.Inc()
+			return
+		}
 		start := time.Now()
 		g.eval()

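Rule evaluation follows the same pattern as scraping: count the scheduled iteration first, then bail out before doing any work if the storage asks for throttling, so a rule group is always evaluated completely or not at all. A hedged helper extracting that guard (the function name and counter names are illustrative, not part of the change):

package main

import "github.com/prometheus/client_golang/prometheus"

// guardedRun mirrors the shape of the iter closure in Group.run: always count
// the scheduled iteration, skip the work (and count the skip) when throttled.
func guardedRun(needsThrottling func() bool, scheduled, skipped prometheus.Counter, work func()) {
	scheduled.Inc()
	if needsThrottling() {
		skipped.Inc()
		return
	}
	work()
}

func main() {
	scheduled := prometheus.NewCounter(prometheus.CounterOpts{Name: "demo_iterations_total"})
	skipped := prometheus.NewCounter(prometheus.CounterOpts{Name: "demo_iterations_skipped_total"})
	guardedRun(func() bool { return false }, scheduled, skipped, func() {})
}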
@@ -34,6 +34,9 @@ type Storage interface {
 	// from the provided Sample as those labels are considered equivalent to
 	// a label not present at all.
 	Append(*model.Sample)
+	// NeedsThrottling returns true if the Storage has too many chunks in memory
+	// already or has too many chunks waiting for persistence.
+	NeedsThrottling() bool
 	// NewPreloader returns a new Preloader which allows preloading and pinning
 	// series data into memory for use within a query.
 	NewPreloader() Preloader
@@ -47,9 +47,9 @@
 	persintenceUrgencyScoreForLeavingRushedMode = 0.7

 	// This factor times -storage.local.memory-chunks is the number of
-	// memory chunks we tolerate before suspending ingestion (TODO!). It is
-	// also a basis for calculating the persistenceUrgencyScore.
-	toleranceFactorForMemChunks = 1.1
+	// memory chunks we tolerate before throttling the storage. It is also a
+	// basis for calculating the persistenceUrgencyScore.
+	toleranceFactorMemChunks = 1.1
 	// This factor times -storage.local.max-chunks-to-persist is the minimum
 	// required number of chunks waiting for persistence before the number
 	// of chunks in memory may influence the persistenceUrgencyScore. (In
@@ -121,9 +121,10 @@ type syncStrategy func() bool

 type memorySeriesStorage struct {
 	// numChunksToPersist has to be aligned for atomic operations.
 	numChunksToPersist int64 // The number of chunks waiting for persistence.
-	maxChunksToPersist int   // If numChunksToPersist reaches this threshold, ingestion will stall.
+	maxChunksToPersist int   // If numChunksToPersist reaches this threshold, ingestion will be throttled.
 	rushed             bool  // Whether the storage is in rushed mode.
+	throttled          chan struct{} // This chan is sent to whenever NeedsThrottling() returns true (for logging).

 	fpLocker   *fingerprintLocker
 	fpToSeries *seriesMap
@@ -180,6 +181,7 @@ func NewMemorySeriesStorage(o *MemorySeriesStorageOptions) Storage {

 		loopStopping:       make(chan struct{}),
 		loopStopped:        make(chan struct{}),
+		throttled:          make(chan struct{}, 1),
 		maxMemoryChunks:    o.MemoryChunks,
 		dropAfter:          o.PersistenceRetentionPeriod,
 		checkpointInterval: o.CheckpointInterval,
@@ -306,6 +308,7 @@ func (s *memorySeriesStorage) Start() (err error) {
 	}

 	go s.handleEvictList()
+	go s.logThrottling()
 	go s.loop()

 	return nil
@@ -571,16 +574,6 @@ func (s *memorySeriesStorage) Append(sample *model.Sample) {
 			delete(sample.Metric, ln)
 		}
 	}
-	if s.getNumChunksToPersist() >= s.maxChunksToPersist {
-		log.Warnf(
-			"%d chunks waiting for persistence, sample ingestion suspended.",
-			s.getNumChunksToPersist(),
-		)
-		for s.getNumChunksToPersist() >= s.maxChunksToPersist {
-			time.Sleep(time.Second)
-		}
-		log.Warn("Sample ingestion resumed.")
-	}
 	rawFP := sample.Metric.FastFingerprint()
 	s.fpLocker.Lock(rawFP)
 	fp, err := s.mapper.mapFP(rawFP, sample.Metric)
@@ -616,6 +609,57 @@ func (s *memorySeriesStorage) Append(sample *model.Sample) {
 		s.incNumChunksToPersist(completedChunksCount)
 	}

+// NeedsThrottling implements Storage.
+func (s *memorySeriesStorage) NeedsThrottling() bool {
+	if s.getNumChunksToPersist() > s.maxChunksToPersist ||
+		float64(atomic.LoadInt64(&numMemChunks)) > float64(s.maxMemoryChunks)*toleranceFactorMemChunks {
+		select {
+		case s.throttled <- struct{}{}:
+		default: // Do nothing, signal already pending.
+		}
+		return true
+	}
+	return false
+}
+
+// logThrottling handles logging of throttled events and has to be started as a
+// goroutine. It stops once s.loopStopping is closed.
+//
+// Logging strategy: Whenever NeedsThrottling() is called and returns true, a
+// signal is sent to s.throttled. If that happens for the first time, an Error
+// is logged that the storage is now throttled. As long as signals continue to
+// be sent via s.throttled at least once per minute, nothing else is logged.
+// Once no signal has arrived for a minute, an Info is logged that the storage
+// is not throttled anymore. This resets things to the initial state, i.e. once
+// a signal arrives again, the Error will be logged again.
+func (s *memorySeriesStorage) logThrottling() {
+	timer := time.NewTimer(time.Minute)
+	timer.Stop()
+
+	for {
+		select {
+		case <-s.throttled:
+			if !timer.Reset(time.Minute) {
+				log.
+					With("chunksToPersist", s.getNumChunksToPersist()).
+					With("maxChunksToPersist", s.maxChunksToPersist).
+					With("memoryChunks", atomic.LoadInt64(&numMemChunks)).
+					With("maxToleratedMemChunks", int(float64(s.maxMemoryChunks)*toleranceFactorMemChunks)).
+					Error("Storage needs throttling. Scrapes and rule evaluations will be skipped.")
+			}
+		case <-timer.C:
+			log.
+				With("chunksToPersist", s.getNumChunksToPersist()).
+				With("maxChunksToPersist", s.maxChunksToPersist).
+				With("memoryChunks", atomic.LoadInt64(&numMemChunks)).
+				With("maxToleratedMemChunks", int(float64(s.maxMemoryChunks)*toleranceFactorMemChunks)).
+				Info("Storage does not need throttling anymore.")
+		case <-s.loopStopping:
+			return
+		}
+	}
+}
+
 func (s *memorySeriesStorage) getOrCreateSeries(fp model.Fingerprint, m model.Metric) *memorySeries {
 	series, ok := s.fpToSeries.get(fp)
 	if !ok {
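Two small patterns replace the old blocking behaviour: NeedsThrottling signals a logging goroutine through a non-blocking send on a buffered channel, so the hot path never waits, and logThrottling re-arms a timer on every signal, logging one Error when throttling starts and one Info after a quiet minute. A stripped-down, self-contained sketch of the signalling half (all names here are illustrative):

package main

import (
	"fmt"
	"time"
)

func main() {
	throttled := make(chan struct{}, 1) // buffered, like memorySeriesStorage.throttled
	quiet := time.NewTimer(time.Minute)
	quiet.Stop() // not running until the first signal arrives

	// Non-blocking send: if a signal is already pending, drop the new one.
	signal := func() {
		select {
		case throttled <- struct{}{}:
		default: // a signal is already pending; nothing to do
		}
	}

	signal()
	signal() // does not block even though nobody has consumed the first signal

	select {
	case <-throttled:
		// Reset returns false if the timer was not running, i.e. this is the
		// first signal after a quiet period, so log the Error exactly once.
		if !quiet.Reset(time.Minute) {
			fmt.Println("storage needs throttling")
		}
	case <-quiet.C:
		fmt.Println("storage does not need throttling anymore")
	}
}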
@@ -1210,7 +1254,7 @@ func (s *memorySeriesStorage) calculatePersistenceUrgencyScore() float64 {
 	if chunksToPersist > maxChunksToPersist*factorMinChunksToPersist {
 		score = math.Max(
 			score,
-			(memChunks/maxMemChunks-1)/(toleranceFactorForMemChunks-1),
+			(memChunks/maxMemChunks-1)/(toleranceFactorMemChunks-1),
 		)
 	}
 	if score > 1 {
@@ -1230,11 +1274,11 @@ func (s *memorySeriesStorage) calculatePersistenceUrgencyScore() float64 {
 		s.rushedMode.Set(0)
 		log.
 			With("urgencyScore", score).
-			With("chunksToPersist", chunksToPersist).
-			With("maxChunksToPersist", maxChunksToPersist).
-			With("memoryChunks", memChunks).
-			With("maxMemoryChunks", maxMemChunks).
-			Warn("Storage has left rushed mode.")
+			With("chunksToPersist", int(chunksToPersist)).
+			With("maxChunksToPersist", int(maxChunksToPersist)).
+			With("memoryChunks", int(memChunks)).
+			With("maxMemoryChunks", int(maxMemChunks)).
+			Info("Storage has left rushed mode.")
 		return score
 	}
 	if score > persintenceUrgencyScoreForEnteringRushedMode {
@@ -1243,10 +1287,10 @@ func (s *memorySeriesStorage) calculatePersistenceUrgencyScore() float64 {
 		s.rushedMode.Set(1)
 		log.
 			With("urgencyScore", score).
-			With("chunksToPersist", chunksToPersist).
-			With("maxChunksToPersist", maxChunksToPersist).
-			With("memoryChunks", memChunks).
-			With("maxMemoryChunks", maxMemChunks).
+			With("chunksToPersist", int(chunksToPersist)).
+			With("maxChunksToPersist", int(maxChunksToPersist)).
+			With("memoryChunks", int(memChunks)).
+			With("maxMemoryChunks", int(maxMemChunks)).
 			Warn("Storage has entered rushed mode.")
 		return 1
 	}
@@ -132,8 +132,7 @@ func NewStorageQueueManager(tsdb StorageClient, queueCapacity int) *StorageQueue
 }

 // Append queues a sample to be sent to the remote storage. It drops the
-// sample on the floor if the queue is full. It implements
-// storage.SampleAppender.
+// sample on the floor if the queue is full.
 func (t *StorageQueueManager) Append(s *model.Sample) {
 	select {
 	case t.queue <- s:
@@ -124,6 +124,13 @@ func (s *Storage) Append(smpl *model.Sample) {
 	}
 }

+// NeedsThrottling implements storage.SampleAppender. It will always return
+// false as a remote storage drops samples on the floor if backlogging instead
+// of asking for throttling.
+func (s *Storage) NeedsThrottling() bool {
+	return false
+}
+
 // Describe implements prometheus.Collector.
 func (s *Storage) Describe(ch chan<- *prometheus.Desc) {
 	for _, q := range s.queues {
@@ -18,9 +18,32 @@ import (
 )

 // SampleAppender is the interface to append samples to both, local and remote
-// storage.
+// storage. All methods are goroutine-safe.
 type SampleAppender interface {
+	// Append appends a sample to the underlying storage. Depending on the
+	// storage implementation, there are different guarantees for the fate
+	// of the sample after Append has returned. Remote storage
+	// implementations will simply drop samples if they cannot keep up with
+	// sending samples. Local storage implementations will only drop metrics
+	// upon unrecoverable errors. Reporting any errors is done via metrics
+	// and logs and not the concern of the caller.
 	Append(*model.Sample)
+	// NeedsThrottling returns true if the underlying storage wishes to not
+	// receive any more samples. Append will still work but might lead to
+	// undue resource usage. It is recommended to call NeedsThrottling once
+	// before an upcoming batch of Append calls (e.g. a full scrape of a
+	// target or the evaluation of a rule group) and only proceed with the
+	// batch if NeedsThrottling returns false. In that way, the result of a
+	// scrape or of an evaluation of a rule group will always be appended
+	// completely or not at all, and the work of scraping or evaluation will
+	// not be performed in vain. Also, a call of NeedsThrottling is
+	// potentially expensive, so limiting the number of calls is reasonable.
+	//
+	// Only SampleAppenders for which it is considered critical to receive
+	// each and every sample should ever return true. SampleAppenders that
+	// tolerate not receiving all samples should always return false and
+	// instead drop samples as they see fit to avoid overload.
+	NeedsThrottling() bool
 }

 // Fanout is a SampleAppender that appends every sample to each SampleAppender
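The comment spells out the contract: only appenders that must see every sample should ever return true; anything that can tolerate loss should shed load itself and return false, which is exactly what the remote-storage queue does. A minimal sketch of such a tolerant appender (the type is hypothetical):

package main

import "github.com/prometheus/common/model"

// droppingAppender tolerates losing samples, so per the contract above it
// never asks for throttling; it sheds load by dropping when its queue is full.
type droppingAppender struct {
	queue chan *model.Sample
}

func (a droppingAppender) Append(s *model.Sample) {
	select {
	case a.queue <- s:
	default: // queue full: drop the sample rather than block or throttle
	}
}

func (a droppingAppender) NeedsThrottling() bool { return false }

func main() {
	a := droppingAppender{queue: make(chan *model.Sample, 1024)}
	a.Append(&model.Sample{Value: 1})
}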
@@ -35,3 +58,14 @@ func (f Fanout) Append(s *model.Sample) {
 		a.Append(s)
 	}
 }
+
+// NeedsThrottling returns true if at least one of the SampleAppenders in the
+// Fanout slice is throttled.
+func (f Fanout) NeedsThrottling() bool {
+	for _, a := range f {
+		if a.NeedsThrottling() {
+			return true
+		}
+	}
+	return false
+}
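Because Fanout reports throttling as soon as any member does, fanning out to local and remote storage keeps the local storage's backpressure visible to scrapers and the rule evaluator, while the remote side (which always returns false) never contributes to it. A hedged usage sketch with stand-in appenders:

package main

import (
	"fmt"

	"github.com/prometheus/common/model"
)

type SampleAppender interface {
	Append(*model.Sample)
	NeedsThrottling() bool
}

// Fanout mirrors storage.Fanout: append everywhere, throttle if any member asks.
type Fanout []SampleAppender

func (f Fanout) Append(s *model.Sample) {
	for _, a := range f {
		a.Append(s)
	}
}

func (f Fanout) NeedsThrottling() bool {
	for _, a := range f {
		if a.NeedsThrottling() {
			return true
		}
	}
	return false
}

// Stand-ins for local and remote storage.
type fakeLocal struct{ throttled bool }

func (l fakeLocal) Append(*model.Sample)  {}
func (l fakeLocal) NeedsThrottling() bool { return l.throttled }

type fakeRemote struct{}

func (fakeRemote) Append(*model.Sample)  {}
func (fakeRemote) NeedsThrottling() bool { return false } // remote drops instead

func main() {
	app := Fanout{fakeLocal{throttled: true}, fakeRemote{}}
	fmt.Println(app.NeedsThrottling()) // true: the local member asks for throttling
}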