From 6286c10df0efc3591493c298cefd4b5979734132 Mon Sep 17 00:00:00 2001 From: David King Date: Mon, 16 Apr 2018 09:03:04 +0100 Subject: [PATCH] Fix OOM when a large K is used in topk queries (#4087) This attempts to close #3973. Handles cases where the length of the input vector to an aggregate topk / bottomk function is less than the K parameter. The change updates Prometheus to allocate a result vector the same length as the input vector in these cases. Previously Prometheus would out-of-memory panic for large K values. This change makes that unlikely unless the size of the input vector is equally large. Signed-off-by: David King --- promql/engine.go | 9 +++++++-- promql/testdata/aggregators.test | 10 ++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/promql/engine.go b/promql/engine.go index 2c5973ec9..38389f99c 100644 --- a/promql/engine.go +++ b/promql/engine.go @@ -1333,14 +1333,19 @@ func (ev *evaluator) aggregation(op ItemType, grouping []string, without bool, p valuesSquaredSum: s.V * s.V, groupCount: 1, } + input_vec_len := int64(len(vec)) + result_size := k + if k > input_vec_len { + result_size = input_vec_len + } if op == itemTopK || op == itemQuantile { - result[groupingKey].heap = make(vectorByValueHeap, 0, k) + result[groupingKey].heap = make(vectorByValueHeap, 0, result_size) heap.Push(&result[groupingKey].heap, &Sample{ Point: Point{V: s.V}, Metric: s.Metric, }) } else if op == itemBottomK { - result[groupingKey].reverseHeap = make(vectorByReverseValueHeap, 0, k) + result[groupingKey].reverseHeap = make(vectorByReverseValueHeap, 0, result_size) heap.Push(&result[groupingKey].reverseHeap, &Sample{ Point: Point{V: s.V}, Metric: s.Metric, diff --git a/promql/testdata/aggregators.test b/promql/testdata/aggregators.test index 40b66d606..0cbcd5590 100644 --- a/promql/testdata/aggregators.test +++ b/promql/testdata/aggregators.test @@ -184,6 +184,16 @@ eval_ordered instant at 50m bottomk(3, http_requests{job="api-server",group="pro 
http_requests{job="api-server", instance="1", group="production"} 200 http_requests{job="api-server", instance="2", group="production"} NaN +# Test topk and bottomk allocate min(k, input_vector) for results vector +eval_ordered instant at 50m bottomk(9999999999, http_requests{job="app-server",group="canary"}) + http_requests{group="canary", instance="0", job="app-server"} 700 + http_requests{group="canary", instance="1", job="app-server"} 800 + +eval_ordered instant at 50m topk(9999999999, http_requests{job="api-server",group="production"}) + http_requests{job="api-server", instance="1", group="production"} 200 + http_requests{job="api-server", instance="0", group="production"} 100 + http_requests{job="api-server", instance="2", group="production"} NaN + clear # Tests for count_values.