mirror of https://github.com/prometheus/prometheus.git
remote_write: shard up more when backlogged
Change the coefficient from 1% to 5%, so that instead of aiming to clear the backlog in 100s we aim to clear it in 20s. Update the unit test to reflect the new behaviour.

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>
parent d588b14d9c
commit 6d01ce8c4d
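For scale: the old gain was integralGain = 0.1 per shard-update tick, which (assuming shardUpdateDuration keeps its 10s default) is 1% of the pending backlog per second, so a static backlog drains in roughly 100s; the new coefficient of 0.05 targets 5% per second, roughly 20s. A minimal sketch of that arithmetic, with the 10s tick as an assumption:

package main

import (
	"fmt"
	"time"
)

// Assumed default from queue_manager.go; only the diff below is authoritative.
const shardUpdateDuration = 10 * time.Second

func main() {
	oldGain := 0.1 / float64(shardUpdateDuration/time.Second) // 0.01, i.e. 1% of backlog per second
	newGain := 0.05                                           // 5% of backlog per second

	fmt.Printf("old: %.0f%%/s, backlog cleared in ~%.0fs\n", oldGain*100, 1/oldGain) // ~100s
	fmt.Printf("new: %.0f%%/s, backlog cleared in ~%.0fs\n", newGain*100, 1/newGain) // ~20s
}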
@@ -831,14 +831,12 @@ func (t *QueueManager) calculateDesiredShards() int {
 		return t.numShards
 	}
 
-	// When behind we will try to catch up on a proporation of samples per tick.
-	// This works similarly to an integral accumulator in that pending samples
-	// is the result of the error integral.
-	const integralGain = 0.1 / float64(shardUpdateDuration/time.Second)
-
 	var (
+		// When behind we will try to catch up on 5% of samples per second.
+		backlogCatchup = 0.05 * dataPending
+		// Calculate Time to send one sample, averaged across all sends done this tick.
 		timePerSample = dataOutDuration / dataOutRate
-		desiredShards = timePerSample * (dataInRate*dataKeptRatio + integralGain*dataPending)
+		desiredShards = timePerSample * (dataInRate*dataKeptRatio + backlogCatchup)
 	)
 	t.metrics.desiredNumShards.Set(desiredShards)
 	level.Debug(t.logger).Log("msg", "QueueManager.calculateDesiredShards",
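The changed lines reduce to a single expression: desiredShards = timePerSample * (dataInRate*dataKeptRatio + backlogCatchup), i.e. seconds-per-sample times the samples-per-second the queue must handle (steady ingestion plus 5% of the pending backlog), which comes out as a shard count. A sketch of that formula as a free function, with the rates passed in directly instead of being read from the QueueManager's EWMAs (names follow the diff; the wrapper itself is hypothetical):

package main

import "fmt"

// desiredShards restates the new calculation outside the QueueManager.
// dataOutDuration is send time in seconds accumulated per second across all shards.
func desiredShards(dataInRate, dataKeptRatio, dataOutRate, dataOutDuration, dataPending float64) float64 {
	// When behind, try to catch up on 5% of pending samples per second.
	backlogCatchup := 0.05 * dataPending
	// Average time to send one sample during the last tick.
	timePerSample := dataOutDuration / dataOutRate
	// (s/sample) * (samples/s): shards needed to keep up and work off the backlog.
	return timePerSample * (dataInRate*dataKeptRatio + backlogCatchup)
}

func main() {
	// With no backlog this is just "shards needed to keep up with ingestion".
	fmt.Println(desiredShards(10, 1, 10, 2, 0)) // (2/10) * 10 = 2
}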
@@ -990,25 +990,25 @@ func TestCalculateDesiredShardsDetail(t *testing.T) {
 			dataOut:         10,
 			dataOutDuration: 2,
 			backlog:         20,
-			expectedShards:  2, // ? should be trying to catch up
+			expectedShards:  4,
 		},
 		{
 			name:            "backlogged 90s",
-			prevShards:      2,
+			prevShards:      4,
 			dataIn:          10,
 			dataOut:         10,
-			dataOutDuration: 2,
+			dataOutDuration: 4,
 			backlog:         90,
-			expectedShards:  4, // ?! should be trying to go much faster
+			expectedShards:  22,
 		},
 		{
 			name:            "backlog reduced",
-			prevShards:      4,
+			prevShards:      22,
 			dataIn:          10,
 			dataOut:         20,
 			dataOutDuration: 4,
 			backlog:         10,
-			expectedShards:  3, // ?! shouldn't downshard from 4 to 3: less than 30% change
+			expectedShards:  3,
 		},
 		{
 			name:            "backlog eliminated",
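The new expectedShards values follow directly from that formula. A quick check of the three updated rows, assuming (as the test setup implies but this hunk does not show) that backlog is a delay in seconds, that no samples are dropped, and that the final ceil is not overridden by the ±30% shard-change tolerance, since each result is far from prevShards:

package main

import (
	"fmt"
	"math"
)

// want reproduces the new desiredShards arithmetic for one test row.
// Assumption: pending samples = backlog (seconds of delay) * dataIn * keptRatio.
func want(dataIn, dataOut, dataOutDuration, backlog float64) int {
	keptRatio := 1.0 // no dropped samples in these rows
	timePerSample := dataOutDuration / dataOut
	backlogCatchup := 0.05 * backlog * dataIn * keptRatio
	return int(math.Ceil(timePerSample * (dataIn*keptRatio + backlogCatchup)))
}

func main() {
	fmt.Println(want(10, 10, 2, 20)) // "backlogged 20s":  0.2 * (10 + 10) ->  4
	fmt.Println(want(10, 10, 4, 90)) // "backlogged 90s":  0.4 * (10 + 45) -> 22
	fmt.Println(want(10, 20, 4, 10)) // "backlog reduced": 0.2 * (10 + 5)  ->  3
}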