mirror of
https://github.com/prometheus/prometheus.git
synced 2025-02-21 03:16:00 -08:00
promql: Fix limiting of extrapolation to negative values
This is a bit tough to explain, but I'll try: `rate` & friends have a sophisticated extrapolation algorithm. Usually, we extrapolate the result to the total interval specified in the range selector. However, if the first sample within the range is too far away from the beginning of the interval, or if the last sample within the range is too far away from the end of the interval, we assume the series has just started half a sampling interval before the first sample or after the last sample, respectively, and shorten the extrapolation interval correspondingly. We calculate the sampling interval by looking at the average time between samples within the range, and we define "too far away" as "more than 110% of that sampling interval". However, if this algorithm leads to an extrapolated starting value that is negative, we limit the start of the extrapolation interval to the point where the extrapolated starting value is zero. At least that was the intention. What we actually implemented is the following: If extrapolating all the way to the beginning of the total interval would lead to an extrapolated negative value, we would only extrapolate to the zero point as above, even if the algorithm above would have selected a starting point that is just half a sampling interval before the first sample and that starting point would not have an extrapolated negative value. In other word: What was meant as a _limitation_ of the extrapolation interval yielded a _longer_ extrapolation interval in this case. There is an exception to the case just described: If the increase of the extrapolation interval is more than 110% of the sampling interval, we suddenly drop back to only extrapolate to half a sampling interval. This behavior can be nicely seen in the testcounter_zero_cutoff test, where the rate goes up all the way to 0.7 and then jumps back to 0.6. This commit changes the behavior to what was (presumably) intended from the beginning: The extension of the extrapolation interval is only limited if actually needed to prevent extrapolation to negative values, but the "limitation" never leads to _more_ extrapolation anymore. The difference is subtle, and probably it never bothered anyone. However, if you calculate a rate of a classic histograms, the old behavior might create non-monotonic histograms as a result (because of the jumps you can see nicely in the old version of the testcounter_zero_cutoff test). With this fix, that doesn't happen anymore. Signed-off-by: beorn7 <beorn@grafana.com>
This commit is contained in:
parent
6fd7c0c297
commit
7f912db15a
|
@ -128,20 +128,6 @@ func extrapolatedRate(vals []parser.Value, args parser.Expressions, enh *EvalNod
|
|||
sampledInterval := float64(lastT-firstT) / 1000
|
||||
averageDurationBetweenSamples := sampledInterval / float64(numSamplesMinusOne)
|
||||
|
||||
// TODO(beorn7): Do this for histograms, too.
|
||||
if isCounter && resultFloat > 0 && len(samples.Floats) > 0 && samples.Floats[0].F >= 0 {
|
||||
// Counters cannot be negative. If we have any slope at all
|
||||
// (i.e. resultFloat went up), we can extrapolate the zero point
|
||||
// of the counter. If the duration to the zero point is shorter
|
||||
// than the durationToStart, we take the zero point as the start
|
||||
// of the series, thereby avoiding extrapolation to negative
|
||||
// counter values.
|
||||
durationToZero := sampledInterval * (samples.Floats[0].F / resultFloat)
|
||||
if durationToZero < durationToStart {
|
||||
durationToStart = durationToZero
|
||||
}
|
||||
}
|
||||
|
||||
// If the first/last samples are close to the boundaries of the range,
|
||||
// extrapolate the result. This is as we expect that another sample
|
||||
// will exist given the spacing between samples we've seen thus far,
|
||||
|
@ -149,16 +135,29 @@ func extrapolatedRate(vals []parser.Value, args parser.Expressions, enh *EvalNod
|
|||
extrapolationThreshold := averageDurationBetweenSamples * 1.1
|
||||
extrapolateToInterval := sampledInterval
|
||||
|
||||
if durationToStart < extrapolationThreshold {
|
||||
extrapolateToInterval += durationToStart
|
||||
} else {
|
||||
extrapolateToInterval += averageDurationBetweenSamples / 2
|
||||
if durationToStart >= extrapolationThreshold {
|
||||
durationToStart = averageDurationBetweenSamples / 2
|
||||
}
|
||||
if durationToEnd < extrapolationThreshold {
|
||||
extrapolateToInterval += durationToEnd
|
||||
} else {
|
||||
extrapolateToInterval += averageDurationBetweenSamples / 2
|
||||
if isCounter && resultFloat > 0 && len(samples.Floats) > 0 && samples.Floats[0].F >= 0 {
|
||||
// Counters cannot be negative. If we have any slope at all
|
||||
// (i.e. resultFloat went up), we can extrapolate the zero point
|
||||
// of the counter. If the duration to the zero point is shorter
|
||||
// than the durationToStart, we take the zero point as the start
|
||||
// of the series, thereby avoiding extrapolation to negative
|
||||
// counter values.
|
||||
// TODO(beorn7): Do this for histograms, too.
|
||||
durationToZero := sampledInterval * (samples.Floats[0].F / resultFloat)
|
||||
if durationToZero < durationToStart {
|
||||
durationToStart = durationToZero
|
||||
}
|
||||
}
|
||||
extrapolateToInterval += durationToStart
|
||||
|
||||
if durationToEnd >= extrapolationThreshold {
|
||||
durationToEnd = averageDurationBetweenSamples / 2
|
||||
}
|
||||
extrapolateToInterval += durationToEnd
|
||||
|
||||
factor := extrapolateToInterval / sampledInterval
|
||||
if isRate {
|
||||
factor /= ms.Range.Seconds()
|
||||
|
|
29
promql/testdata/functions.test
vendored
29
promql/testdata/functions.test
vendored
|
@ -71,15 +71,28 @@ clear
|
|||
load 5m
|
||||
http_requests{path="/foo"} 0+10x10
|
||||
http_requests{path="/bar"} 0+10x5 0+10x5
|
||||
http_requests{path="/dings"} 10+10x10
|
||||
http_requests{path="/bumms"} 1+10x10
|
||||
|
||||
# Tests for increase().
|
||||
eval instant at 50m increase(http_requests[50m])
|
||||
{path="/foo"} 100
|
||||
{path="/bar"} 90
|
||||
{path="/foo"} 100
|
||||
{path="/bar"} 90
|
||||
{path="/dings"} 100
|
||||
{path="/bumms"} 100
|
||||
|
||||
# "foo" and "bar" are already at value 0 at t=0, so no extrapolation
|
||||
# happens. "dings" has value 10 at t=0 and would reach 0 at t=-5m. The
|
||||
# normal extrapolation by half a sample interval only goes to
|
||||
# t=-2m30s, so that's not yet reaching a negative value and therefore
|
||||
# chosen. However, "bumms" has value 1 at t=0 and would reach 0 at
|
||||
# t=-30s. Here the extrapolation to t=-2m30s would reach a negative
|
||||
# value, and therefore the extrapolation happens only by 30s.
|
||||
eval instant at 50m increase(http_requests[100m])
|
||||
{path="/foo"} 100
|
||||
{path="/bar"} 90
|
||||
{path="/foo"} 100
|
||||
{path="/bar"} 90
|
||||
{path="/dings"} 105
|
||||
{path="/bumms"} 101
|
||||
|
||||
clear
|
||||
|
||||
|
@ -133,13 +146,15 @@ load 4m
|
|||
testcounter_zero_cutoff{start="4m"} 240+240x10
|
||||
testcounter_zero_cutoff{start="5m"} 300+240x10
|
||||
|
||||
# Zero cutoff for left-side extrapolation.
|
||||
# Zero cutoff for left-side extrapolation happens until we
|
||||
# reach half a sampling interval (2m). Beyond that, we only
|
||||
# extrapolate by half a sampling interval.
|
||||
eval instant at 10m rate(testcounter_zero_cutoff[20m])
|
||||
{start="0m"} 0.5
|
||||
{start="1m"} 0.55
|
||||
{start="2m"} 0.6
|
||||
{start="3m"} 0.65
|
||||
{start="4m"} 0.7
|
||||
{start="3m"} 0.6
|
||||
{start="4m"} 0.6
|
||||
{start="5m"} 0.6
|
||||
|
||||
# Normal half-interval cutoff for left-side extrapolation.
|
||||
|
|
Loading…
Reference in a new issue