From 34230bb17293535b65a53e33cbaf626663a679c3 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Mon, 27 Nov 2023 17:20:27 +0000 Subject: [PATCH 001/137] tsdb/wlog: close segment files sooner 'defer' runs at the end of the whole function; we should close each segment file as soon as we finished reading it. Signed-off-by: Bryan Boreham --- tsdb/wlog/watcher.go | 5 +++-- tsdb/wlog/watcher_test.go | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tsdb/wlog/watcher.go b/tsdb/wlog/watcher.go index 1c76e3887..56cd0cc4e 100644 --- a/tsdb/wlog/watcher.go +++ b/tsdb/wlog/watcher.go @@ -730,10 +730,11 @@ func (w *Watcher) readCheckpoint(checkpointDir string, readFn segmentReadFn) err if err != nil { return fmt.Errorf("unable to open segment: %w", err) } - defer sr.Close() r := NewLiveReader(w.logger, w.readerMetrics, sr) - if err := readFn(w, r, index, false); err != nil && !errors.Is(err, io.EOF) { + err = readFn(w, r, index, false) + sr.Close() + if err != nil && !errors.Is(err, io.EOF) { return fmt.Errorf("readSegment: %w", err) } diff --git a/tsdb/wlog/watcher_test.go b/tsdb/wlog/watcher_test.go index b30dce91a..2686d3bc9 100644 --- a/tsdb/wlog/watcher_test.go +++ b/tsdb/wlog/watcher_test.go @@ -218,11 +218,11 @@ func TestTailSamples(t *testing.T) { for i := first; i <= last; i++ { segment, err := OpenReadSegment(SegmentName(watcher.walDir, i)) require.NoError(t, err) - defer segment.Close() reader := NewLiveReader(nil, NewLiveReaderMetrics(nil), segment) // Use tail true so we can ensure we got the right number of samples. watcher.readSegment(reader, i, true) + require.NoError(t, segment.Close()) } expectedSeries := seriesCount From b5389192583bac5171716e6467a370f29bda3c43 Mon Sep 17 00:00:00 2001 From: tyltr Date: Wed, 31 Jan 2024 20:27:23 +0800 Subject: [PATCH 002/137] remove redundant code Signed-off-by: tyltr --- discovery/legacymanager/manager_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/discovery/legacymanager/manager_test.go b/discovery/legacymanager/manager_test.go index 6fbecabc2..767532168 100644 --- a/discovery/legacymanager/manager_test.go +++ b/discovery/legacymanager/manager_test.go @@ -1091,7 +1091,6 @@ func TestCoordinationWithReceiver(t *testing.T) { } for _, tc := range testCases { - tc := tc t.Run(tc.title, func(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() From fb6a45f06bbe474096162d33c6ac8afcbbb71f45 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Wed, 17 Jan 2024 18:28:06 +0100 Subject: [PATCH 003/137] tsdb/wlog: Only treat unknown record types as failure Signed-off-by: Arve Knudsen --- CHANGELOG.md | 1 + tsdb/wlog/watcher.go | 10 ++++------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1dfcc5c33..e17124abe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ * [ENHANCEMENT] OTLP: Improve remote write format translation performance by using label set hashes for metric identifiers instead of string based ones. #14006 #13991 * [BUGFIX] OTLP: Don't generate target_info unless at least one identifying label is defined. #13991 * [BUGFIX] OTLP: Don't generate target_info unless there are metrics. #13991 +* [BUGFIX] tsdb/wlog.Watcher.readSegmentForGC: Only count unknown record types against record_decode_failures_total metric. 
#14042 ## 2.52.0-rc.1 / 2024-05-03 diff --git a/tsdb/wlog/watcher.go b/tsdb/wlog/watcher.go index 8ebd9249a..fd4f5f20f 100644 --- a/tsdb/wlog/watcher.go +++ b/tsdb/wlog/watcher.go @@ -685,14 +685,12 @@ func (w *Watcher) readSegmentForGC(r *LiveReader, segmentNum int, _ bool) error } w.writer.UpdateSeriesSegment(series, segmentNum) - // Ignore these; we're only interested in series. - case record.Samples: - case record.Exemplars: - case record.Tombstones: - - default: + case record.Unknown: // Could be corruption, or reading from a WAL from a newer Prometheus. w.recordDecodeFailsMetric.Inc() + + default: + // We're only interested in series. } } if err := r.Err(); err != nil { From 694f717dc44849592b439fce6ffa5fbcdf7957a6 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Tue, 28 May 2024 15:23:50 +0200 Subject: [PATCH 004/137] Watcher.readSegment: Only consider unknown rec types failures Signed-off-by: Arve Knudsen --- tsdb/wlog/watcher.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tsdb/wlog/watcher.go b/tsdb/wlog/watcher.go index fd4f5f20f..5a73acdd4 100644 --- a/tsdb/wlog/watcher.go +++ b/tsdb/wlog/watcher.go @@ -625,6 +625,7 @@ func (w *Watcher) readSegment(r *LiveReader, segmentNum int, tail bool) error { w.writer.AppendHistograms(histogramsToSend) histogramsToSend = histogramsToSend[:0] } + case record.FloatHistogramSamples: // Skip if experimental "histograms over remote write" is not enabled. if !w.sendHistograms { @@ -652,11 +653,13 @@ func (w *Watcher) readSegment(r *LiveReader, segmentNum int, tail bool) error { w.writer.AppendFloatHistograms(floatHistogramsToSend) floatHistogramsToSend = floatHistogramsToSend[:0] } - case record.Tombstones: - default: + case record.Unknown: // Could be corruption, or reading from a WAL from a newer Prometheus. w.recordDecodeFailsMetric.Inc() + + default: + // We're not interested in other types of records. } } if err := r.Err(); err != nil { From 9a837b7f3c1b6d5ef75cb36808bea9189eced911 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 4 Jul 2024 15:51:41 +0200 Subject: [PATCH 005/137] promql: Make groupedAggregation.groupCount a float64 It's always used as such. Let's avoid the countless conversions. Signed-off-by: beorn7 --- promql/engine.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/promql/engine.go b/promql/engine.go index bf19aac8b..7c84a0a27 100644 --- a/promql/engine.go +++ b/promql/engine.go @@ -2779,7 +2779,7 @@ type groupedAggregation struct { floatValue float64 histogramValue *histogram.FloatHistogram floatMean float64 // Mean, or "compensating value" for Kahan summation. - groupCount int + groupCount float64 groupAggrComplete bool // Used by LIMITK to short-cut series loop when we've reached K elem on every group heap vectorByValueHeap } @@ -2855,8 +2855,8 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix if h != nil { group.hasHistogram = true if group.histogramValue != nil { - left := h.Copy().Div(float64(group.groupCount)) - right := group.histogramValue.Copy().Div(float64(group.groupCount)) + left := h.Copy().Div(group.groupCount) + right := group.histogramValue.Copy().Div(group.groupCount) toAdd, err := left.Sub(right) if err != nil { handleAggregationError(err, e, inputMatrix[si].Metric.Get(model.MetricNameLabel), &annos) @@ -2889,7 +2889,7 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix } } // Divide each side of the `-` by `group.groupCount` to avoid float64 overflows. 
- group.floatMean += f/float64(group.groupCount) - group.floatMean/float64(group.groupCount) + group.floatMean += f/group.groupCount - group.floatMean/group.groupCount } case parser.GROUP: @@ -2912,7 +2912,7 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix if h == nil { // Ignore native histograms. group.groupCount++ delta := f - group.floatMean - group.floatMean += delta / float64(group.groupCount) + group.floatMean += delta / group.groupCount group.floatValue += delta * (f - group.floatMean) } @@ -2945,13 +2945,13 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix } case parser.COUNT: - aggr.floatValue = float64(aggr.groupCount) + aggr.floatValue = aggr.groupCount case parser.STDVAR: - aggr.floatValue /= float64(aggr.groupCount) + aggr.floatValue /= aggr.groupCount case parser.STDDEV: - aggr.floatValue = math.Sqrt(aggr.floatValue / float64(aggr.groupCount)) + aggr.floatValue = math.Sqrt(aggr.floatValue / aggr.groupCount) case parser.QUANTILE: aggr.floatValue = quantile(q, aggr.heap) From 44d8c1d1828e063a8d695681826f1f6e13f0f88c Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 4 Jul 2024 15:52:48 +0200 Subject: [PATCH 006/137] nit: add period at end of sentence Signed-off-by: beorn7 --- promql/engine.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/promql/engine.go b/promql/engine.go index 7c84a0a27..06a26e377 100644 --- a/promql/engine.go +++ b/promql/engine.go @@ -2780,7 +2780,7 @@ type groupedAggregation struct { histogramValue *histogram.FloatHistogram floatMean float64 // Mean, or "compensating value" for Kahan summation. groupCount float64 - groupAggrComplete bool // Used by LIMITK to short-cut series loop when we've reached K elem on every group + groupAggrComplete bool // Used by LIMITK to short-cut series loop when we've reached K elem on every group. heap vectorByValueHeap } From b5b04ddbe3733a296a531f2a54462ab5529062a7 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 4 Jul 2024 18:58:41 +0200 Subject: [PATCH 007/137] promql: add avg aggregation benchmark Signed-off-by: beorn7 --- promql/bench_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/promql/bench_test.go b/promql/bench_test.go index bd6728029..33523b2db 100644 --- a/promql/bench_test.go +++ b/promql/bench_test.go @@ -165,6 +165,9 @@ func rangeQueryCases() []benchCase { { expr: "sum(a_X)", }, + { + expr: "avg(a_X)", + }, { expr: "sum without (l)(h_X)", }, From c46074f4dda17dda1e93d79ed8e6e015bb222bdb Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 4 Jul 2024 15:13:35 +0200 Subject: [PATCH 008/137] promql: make avg aggregation more precise and less expensive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The basic idea here is that the previous code was always doing incremental calculation of the mean value, which is more costly and can be less precise. It protects against overflows, but in most cases, an overflow doesn't happen anyway. The other idea applied here is to expand on #14074, where Kahan summation was applied to sum(). With this commit, the average is calculated in a conventional way (adding everything up and divide in the end) as long as the sum isn't overflowing float64. This is combined with Kahan summation so that the avg aggregation, in most cases, is really equivalent to the sum aggregation with a following division (which is the user's expectation as avg is supposed to be syntactic sugar for sum with a following divison). 
If the sum hits ±Inf, the calculation reverts to incremental calculation of the mean value. Kahan summation is also applied here, although it cannot fully compensate for the numerical errors introduced by the incremental mean calculation. (The tests added in this commit would fail if incremental mean calculation was always used.) Signed-off-by: beorn7 --- promql/engine.go | 59 +++++++++++++++------ promql/promqltest/testdata/aggregators.test | 20 ++++++- 2 files changed, 61 insertions(+), 18 deletions(-) diff --git a/promql/engine.go b/promql/engine.go index 06a26e377..8a0aa23f6 100644 --- a/promql/engine.go +++ b/promql/engine.go @@ -2773,15 +2773,19 @@ func vectorElemBinop(op parser.ItemType, lhs, rhs float64, hlhs, hrhs *histogram } type groupedAggregation struct { + floatValue float64 + histogramValue *histogram.FloatHistogram + floatMean float64 + floatKahanC float64 // "Compensating value" for Kahan summation. + groupCount float64 + heap vectorByValueHeap + + // All bools together for better packing within the struct. seen bool // Was this output groups seen in the input at this timestamp. hasFloat bool // Has at least 1 float64 sample aggregated. hasHistogram bool // Has at least 1 histogram sample aggregated. - floatValue float64 - histogramValue *histogram.FloatHistogram - floatMean float64 // Mean, or "compensating value" for Kahan summation. - groupCount float64 groupAggrComplete bool // Used by LIMITK to short-cut series loop when we've reached K elem on every group. - heap vectorByValueHeap + incrementalMean bool // True after reverting to incremental calculation of the mean value. } // aggregation evaluates sum, avg, count, stdvar, stddev or quantile at one timestep on inputMatrix. @@ -2807,13 +2811,11 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix *group = groupedAggregation{ seen: true, floatValue: f, + floatMean: f, groupCount: 1, } switch op { - case parser.AVG: - group.floatMean = f - fallthrough - case parser.SUM: + case parser.AVG, parser.SUM: if h == nil { group.hasFloat = true } else { @@ -2821,7 +2823,6 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix group.hasHistogram = true } case parser.STDVAR, parser.STDDEV: - group.floatMean = f group.floatValue = 0 case parser.QUANTILE: group.heap = make(vectorByValueHeap, 1) @@ -2847,7 +2848,7 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix // point in copying the histogram in that case. } else { group.hasFloat = true - group.floatValue, group.floatMean = kahanSumInc(f, group.floatValue, group.floatMean) + group.floatValue, group.floatKahanC = kahanSumInc(f, group.floatValue, group.floatKahanC) } case parser.AVG: @@ -2871,6 +2872,22 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix // point in copying the histogram in that case. } else { group.hasFloat = true + if !group.incrementalMean { + newV, newC := kahanSumInc(f, group.floatValue, group.floatKahanC) + if !math.IsInf(newV, 0) { + // The sum doesn't overflow, so we propagate it to the + // group struct and continue with the regular + // calculation of the mean value. + group.floatValue, group.floatKahanC = newV, newC + break + } + // If we are here, we know that the sum _would_ overflow. So + // instead of continue to sum up, we revert to incremental + // calculation of the mean value from here on. 
+ group.incrementalMean = true + group.floatMean = group.floatValue / (group.groupCount - 1) + group.floatKahanC /= group.groupCount - 1 + } if math.IsInf(group.floatMean, 0) { if math.IsInf(f, 0) && (group.floatMean > 0) == (f > 0) { // The `floatMean` and `s.F` values are `Inf` of the same sign. They @@ -2888,8 +2905,13 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix break } } - // Divide each side of the `-` by `group.groupCount` to avoid float64 overflows. - group.floatMean += f/group.groupCount - group.floatMean/group.groupCount + currentMean := group.floatMean + group.floatKahanC + group.floatMean, group.floatKahanC = kahanSumInc( + // Divide each side of the `-` by `group.groupCount` to avoid float64 overflows. + f/group.groupCount-currentMean/group.groupCount, + group.floatMean, + group.floatKahanC, + ) } case parser.GROUP: @@ -2938,10 +2960,13 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix annos.Add(annotations.NewMixedFloatsHistogramsAggWarning(e.Expr.PositionRange())) continue } - if aggr.hasHistogram { + switch { + case aggr.hasHistogram: aggr.histogramValue = aggr.histogramValue.Compact(0) - } else { - aggr.floatValue = aggr.floatMean + case aggr.incrementalMean: + aggr.floatValue = aggr.floatMean + aggr.floatKahanC + default: + aggr.floatValue = (aggr.floatValue + aggr.floatKahanC) / aggr.groupCount } case parser.COUNT: @@ -2965,7 +2990,7 @@ func (ev *evaluator) aggregation(e *parser.AggregateExpr, q float64, inputMatrix if aggr.hasHistogram { aggr.histogramValue.Compact(0) } else { - aggr.floatValue += aggr.floatMean // Add Kahan summation compensating term. + aggr.floatValue += aggr.floatKahanC } default: // For other aggregations, we already have the right value. diff --git a/promql/promqltest/testdata/aggregators.test b/promql/promqltest/testdata/aggregators.test index cbb255a12..68d2e735b 100644 --- a/promql/promqltest/testdata/aggregators.test +++ b/promql/promqltest/testdata/aggregators.test @@ -503,7 +503,7 @@ eval instant at 1m avg(data{test="-big"}) eval instant at 1m avg(data{test="bigzero"}) {} 0 -# Test summing extreme values. +# Test summing and averaging extreme values. clear load 10s @@ -529,21 +529,39 @@ load 10s eval instant at 1m sum(data{test="ten"}) {} 10 +eval instant at 1m avg(data{test="ten"}) + {} 2.5 + eval instant at 1m sum by (group) (data{test="pos_inf"}) {group="1"} Inf {group="2"} Inf +eval instant at 1m avg by (group) (data{test="pos_inf"}) + {group="1"} Inf + {group="2"} Inf + eval instant at 1m sum by (group) (data{test="neg_inf"}) {group="1"} -Inf {group="2"} -Inf +eval instant at 1m avg by (group) (data{test="neg_inf"}) + {group="1"} -Inf + {group="2"} -Inf + eval instant at 1m sum(data{test="inf_inf"}) {} NaN +eval instant at 1m avg(data{test="inf_inf"}) + {} NaN + eval instant at 1m sum by (group) (data{test="nan"}) {group="1"} NaN {group="2"} NaN +eval instant at 1m avg by (group) (data{test="nan"}) + {group="1"} NaN + {group="2"} NaN + clear # Test that aggregations are deterministic. From 3a908d8e088763e6df7f7a5f74345ada30c9b0bb Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 4 Jul 2024 18:36:32 +0200 Subject: [PATCH 009/137] promql: Improve Kahan usage in avg_over_time The calculation of the mean value in avg_over_time is performed in an incremental fashion. This introduces additional numerical errors that even Kahan summation cannot compensate, but at least we can use the Kahan-corrected mean value when we use the intermediate mean value in the calculation. 
Signed-off-by: beorn7 --- promql/functions.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/promql/functions.go b/promql/functions.go index dcc2cd759..575f8302d 100644 --- a/promql/functions.go +++ b/promql/functions.go @@ -593,7 +593,8 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode continue } } - mean, c = kahanSumInc(f.F/count-mean/count, mean, c) + correctedMean := mean + c + mean, c = kahanSumInc(f.F/count-correctedMean/count, mean, c) } if math.IsInf(mean, 0) { From cff0429b1ada726887f1fa717f44b63f72d45877 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 4 Jul 2024 18:47:52 +0200 Subject: [PATCH 010/137] promql: make avg_over_time faster and more precise Same idea as for the avg aggregator before: Most of the time, there is no overflow, so we don't have to revert to the more expensive and less precise incremental calculation of the mean value. Signed-off-by: beorn7 --- promql/functions.go | 32 ++++++++++++++++++----- promql/promqltest/testdata/functions.test | 4 ++- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/promql/functions.go b/promql/functions.go index 575f8302d..ca987545d 100644 --- a/promql/functions.go +++ b/promql/functions.go @@ -573,9 +573,28 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode return vec, nil } return aggrOverTime(vals, enh, func(s Series) float64 { - var mean, count, c float64 + var ( + sum, mean, count, kahanC float64 + incrementalMean bool + ) for _, f := range s.Floats { count++ + if !incrementalMean { + newSum, newC := kahanSumInc(f.F, sum, kahanC) + // Perform regular mean calculation as long as + // the sum doesn't overflow and (in any case) + // for the first iteration (even if we start + // with ±Inf) to not run into division-by-zero + // problems below. + if count == 1 || !math.IsInf(newSum, 0) { + sum, kahanC = newSum, newC + continue + } + // Handle overflow by reverting to incremental calculation of the mean value. + incrementalMean = true + mean = sum / (count - 1) + kahanC /= count - 1 + } if math.IsInf(mean, 0) { if math.IsInf(f.F, 0) && (mean > 0) == (f.F > 0) { // The `mean` and `f.F` values are `Inf` of the same sign. They @@ -593,14 +612,13 @@ func funcAvgOverTime(vals []parser.Value, args parser.Expressions, enh *EvalNode continue } } - correctedMean := mean + c - mean, c = kahanSumInc(f.F/count-correctedMean/count, mean, c) + correctedMean := mean + kahanC + mean, kahanC = kahanSumInc(f.F/count-correctedMean/count, mean, kahanC) } - - if math.IsInf(mean, 0) { - return mean + if incrementalMean { + return mean + kahanC } - return mean + c + return (sum + kahanC) / count }), nil } diff --git a/promql/promqltest/testdata/functions.test b/promql/promqltest/testdata/functions.test index 718e001c3..290beb5b9 100644 --- a/promql/promqltest/testdata/functions.test +++ b/promql/promqltest/testdata/functions.test @@ -737,7 +737,6 @@ eval instant at 1m avg_over_time(metric6c[1m]) eval instant at 1m sum_over_time(metric6c[1m])/count_over_time(metric6c[1m]) {} NaN - eval instant at 1m avg_over_time(metric7[1m]) {} NaN @@ -772,6 +771,9 @@ load 10s eval instant at 1m sum_over_time(metric[1m]) {} 2 +eval instant at 1m avg_over_time(metric[1m]) + {} 0.5 + # Tests for stddev_over_time and stdvar_over_time. 
clear load 10s From c04924bc41982dfef9ea29939c1a2c6fe56333c7 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Wed, 24 Jul 2024 16:47:33 +0200 Subject: [PATCH 011/137] otlptranslator: Add tests for BuildCompliantName Signed-off-by: Arve Knudsen --- .../prometheus/normalize_label.go | 11 +- .../prometheus/normalize_name.go | 25 +-- .../prometheus/normalize_name_test.go | 203 ++++++++++++++++++ .../prometheus/testutils_test.go | 49 +++++ 4 files changed, 270 insertions(+), 18 deletions(-) create mode 100644 storage/remote/otlptranslator/prometheus/normalize_name_test.go create mode 100644 storage/remote/otlptranslator/prometheus/testutils_test.go diff --git a/storage/remote/otlptranslator/prometheus/normalize_label.go b/storage/remote/otlptranslator/prometheus/normalize_label.go index 6360aa976..a112b9bbc 100644 --- a/storage/remote/otlptranslator/prometheus/normalize_label.go +++ b/storage/remote/otlptranslator/prometheus/normalize_label.go @@ -21,15 +21,14 @@ import ( "unicode" ) -// Normalizes the specified label to follow Prometheus label names standard +// Normalizes the specified label to follow Prometheus label names standard. // -// See rules at https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels +// See rules at https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels. // -// Labels that start with non-letter rune will be prefixed with "key_" +// Labels that start with non-letter rune will be prefixed with "key_". // -// Exception is made for double-underscores which are allowed +// An exception is made for double-underscores which are allowed. func NormalizeLabel(label string) string { - // Trivial case if len(label) == 0 { return label @@ -48,7 +47,7 @@ func NormalizeLabel(label string) string { return label } -// Return '_' for anything non-alphanumeric +// Return '_' for anything non-alphanumeric. func sanitizeRune(r rune) rune { if unicode.IsLetter(r) || unicode.IsDigit(r) { return r diff --git a/storage/remote/otlptranslator/prometheus/normalize_name.go b/storage/remote/otlptranslator/prometheus/normalize_name.go index 71bba40e4..0f472b80a 100644 --- a/storage/remote/otlptranslator/prometheus/normalize_name.go +++ b/storage/remote/otlptranslator/prometheus/normalize_name.go @@ -76,14 +76,15 @@ var perUnitMap = map[string]string{ "y": "year", } -// BuildCompliantName builds a Prometheus-compliant metric name for the specified metric +// BuildCompliantName builds a Prometheus-compliant metric name for the specified metric. // // Metric name is prefixed with specified namespace and underscore (if any). // Namespace is not cleaned up. Make sure specified namespace follows Prometheus // naming convention. // -// See rules at https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels -// and https://prometheus.io/docs/practices/naming/#metric-and-label-naming +// See rules at https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels, +// https://prometheus.io/docs/practices/naming/#metric-and-label-naming +// and https://github.com/open-telemetry/opentelemetry-specification/blob/v1.33.0/specification/compatibility/prometheus_and_openmetrics.md#otlp-metric-points-to-prometheus. 
func BuildCompliantName(metric pmetric.Metric, namespace string, addMetricSuffixes bool) string { var metricName string @@ -110,7 +111,7 @@ func BuildCompliantName(metric pmetric.Metric, namespace string, addMetricSuffix // Build a normalized name for the specified metric func normalizeName(metric pmetric.Metric, namespace string) string { - // Split metric name in "tokens" (remove all non-alphanumeric) + // Split metric name into "tokens" (remove all non-alphanumerics) nameTokens := strings.FieldsFunc( metric.Name(), func(r rune) bool { return !unicode.IsLetter(r) && !unicode.IsDigit(r) }, @@ -122,9 +123,9 @@ func normalizeName(metric pmetric.Metric, namespace string) string { // Main unit // Append if not blank, doesn't contain '{}', and is not present in metric name already if len(unitTokens) > 0 { - mainUnitOtel := strings.TrimSpace(unitTokens[0]) - if mainUnitOtel != "" && !strings.ContainsAny(mainUnitOtel, "{}") { - mainUnitProm := CleanUpString(unitMapGetOrDefault(mainUnitOtel)) + mainUnitOTel := strings.TrimSpace(unitTokens[0]) + if mainUnitOTel != "" && !strings.ContainsAny(mainUnitOTel, "{}") { + mainUnitProm := CleanUpString(unitMapGetOrDefault(mainUnitOTel)) if mainUnitProm != "" && !contains(nameTokens, mainUnitProm) { nameTokens = append(nameTokens, mainUnitProm) } @@ -133,11 +134,11 @@ func normalizeName(metric pmetric.Metric, namespace string) string { // Per unit // Append if not blank, doesn't contain '{}', and is not present in metric name already if len(unitTokens) > 1 && unitTokens[1] != "" { - perUnitOtel := strings.TrimSpace(unitTokens[1]) - if perUnitOtel != "" && !strings.ContainsAny(perUnitOtel, "{}") { - perUnitProm := CleanUpString(perUnitMapGetOrDefault(perUnitOtel)) + perUnitOTel := strings.TrimSpace(unitTokens[1]) + if perUnitOTel != "" && !strings.ContainsAny(perUnitOTel, "{}") { + perUnitProm := CleanUpString(perUnitMapGetOrDefault(perUnitOTel)) if perUnitProm != "" && !contains(nameTokens, perUnitProm) { - nameTokens = append(append(nameTokens, "per"), perUnitProm) + nameTokens = append(nameTokens, "per", perUnitProm) } } } @@ -150,7 +151,7 @@ func normalizeName(metric pmetric.Metric, namespace string) string { } // Append _ratio for metrics with unit "1" - // Some Otel receivers improperly use unit "1" for counters of objects + // Some OTel receivers improperly use unit "1" for counters of objects // See https://github.com/open-telemetry/opentelemetry-collector-contrib/issues?q=is%3Aissue+some+metric+units+don%27t+follow+otel+semantic+conventions // Until these issues have been fixed, we're appending `_ratio` for gauges ONLY // Theoretically, counters could be ratios as well, but it's absurd (for mathematical reasons) diff --git a/storage/remote/otlptranslator/prometheus/normalize_name_test.go b/storage/remote/otlptranslator/prometheus/normalize_name_test.go new file mode 100644 index 000000000..ee25bb2df --- /dev/null +++ b/storage/remote/otlptranslator/prometheus/normalize_name_test.go @@ -0,0 +1,203 @@ +// Copyright 2024 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// Provenance-includes-location: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/95e8f8fdc2a9dc87230406c9a3cf02be4fd68bea/pkg/translator/prometheus/normalize_name_test.go +// Provenance-includes-license: Apache-2.0 +// Provenance-includes-copyright: Copyright The OpenTelemetry Authors. + +package prometheus + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/pdata/pmetric" +) + +func TestByte(t *testing.T) { + require.Equal(t, "system_filesystem_usage_bytes", normalizeName(createGauge("system.filesystem.usage", "By"), "")) +} + +func TestByteCounter(t *testing.T) { + require.Equal(t, "system_io_bytes_total", normalizeName(createCounter("system.io", "By"), "")) + require.Equal(t, "network_transmitted_bytes_total", normalizeName(createCounter("network_transmitted_bytes_total", "By"), "")) +} + +func TestWhiteSpaces(t *testing.T) { + require.Equal(t, "system_filesystem_usage_bytes", normalizeName(createGauge("\t system.filesystem.usage ", " By\t"), "")) +} + +func TestNonStandardUnit(t *testing.T) { + require.Equal(t, "system_network_dropped", normalizeName(createGauge("system.network.dropped", "{packets}"), "")) +} + +func TestNonStandardUnitCounter(t *testing.T) { + require.Equal(t, "system_network_dropped_total", normalizeName(createCounter("system.network.dropped", "{packets}"), "")) +} + +func TestBrokenUnit(t *testing.T) { + require.Equal(t, "system_network_dropped_packets", normalizeName(createGauge("system.network.dropped", "packets"), "")) + require.Equal(t, "system_network_packets_dropped", normalizeName(createGauge("system.network.packets.dropped", "packets"), "")) + require.Equal(t, "system_network_packets", normalizeName(createGauge("system.network.packets", "packets"), "")) +} + +func TestBrokenUnitCounter(t *testing.T) { + require.Equal(t, "system_network_dropped_packets_total", normalizeName(createCounter("system.network.dropped", "packets"), "")) + require.Equal(t, "system_network_packets_dropped_total", normalizeName(createCounter("system.network.packets.dropped", "packets"), "")) + require.Equal(t, "system_network_packets_total", normalizeName(createCounter("system.network.packets", "packets"), "")) +} + +func TestRatio(t *testing.T) { + require.Equal(t, "hw_gpu_memory_utilization_ratio", normalizeName(createGauge("hw.gpu.memory.utilization", "1"), "")) + require.Equal(t, "hw_fan_speed_ratio", normalizeName(createGauge("hw.fan.speed_ratio", "1"), "")) + require.Equal(t, "objects_total", normalizeName(createCounter("objects", "1"), "")) +} + +func TestHertz(t *testing.T) { + require.Equal(t, "hw_cpu_speed_limit_hertz", normalizeName(createGauge("hw.cpu.speed_limit", "Hz"), "")) +} + +func TestPer(t *testing.T) { + require.Equal(t, "broken_metric_speed_km_per_hour", normalizeName(createGauge("broken.metric.speed", "km/h"), "")) + require.Equal(t, "astro_light_speed_limit_meters_per_second", normalizeName(createGauge("astro.light.speed_limit", "m/s"), "")) +} + +func TestPercent(t *testing.T) { + require.Equal(t, "broken_metric_success_ratio_percent", normalizeName(createGauge("broken.metric.success_ratio", "%"), "")) + require.Equal(t, "broken_metric_success_percent", normalizeName(createGauge("broken.metric.success_percent", "%"), "")) +} + +func TestEmpty(t *testing.T) { + require.Equal(t, "test_metric_no_unit", normalizeName(createGauge("test.metric.no_unit", 
""), "")) + require.Equal(t, "test_metric_spaces", normalizeName(createGauge("test.metric.spaces", " \t "), "")) +} + +func TestUnsupportedRunes(t *testing.T) { + require.Equal(t, "unsupported_metric_temperature_F", normalizeName(createGauge("unsupported.metric.temperature", "°F"), "")) + require.Equal(t, "unsupported_metric_weird", normalizeName(createGauge("unsupported.metric.weird", "+=.:,!* & #"), "")) + require.Equal(t, "unsupported_metric_redundant_test_per_C", normalizeName(createGauge("unsupported.metric.redundant", "__test $/°C"), "")) +} + +func TestOTelReceivers(t *testing.T) { + require.Equal(t, "active_directory_ds_replication_network_io_bytes_total", normalizeName(createCounter("active_directory.ds.replication.network.io", "By"), "")) + require.Equal(t, "active_directory_ds_replication_sync_object_pending_total", normalizeName(createCounter("active_directory.ds.replication.sync.object.pending", "{objects}"), "")) + require.Equal(t, "active_directory_ds_replication_object_rate_per_second", normalizeName(createGauge("active_directory.ds.replication.object.rate", "{objects}/s"), "")) + require.Equal(t, "active_directory_ds_name_cache_hit_rate_percent", normalizeName(createGauge("active_directory.ds.name_cache.hit_rate", "%"), "")) + require.Equal(t, "active_directory_ds_ldap_bind_last_successful_time_milliseconds", normalizeName(createGauge("active_directory.ds.ldap.bind.last_successful.time", "ms"), "")) + require.Equal(t, "apache_current_connections", normalizeName(createGauge("apache.current_connections", "connections"), "")) + require.Equal(t, "apache_workers_connections", normalizeName(createGauge("apache.workers", "connections"), "")) + require.Equal(t, "apache_requests_total", normalizeName(createCounter("apache.requests", "1"), "")) + require.Equal(t, "bigip_virtual_server_request_count_total", normalizeName(createCounter("bigip.virtual_server.request.count", "{requests}"), "")) + require.Equal(t, "system_cpu_utilization_ratio", normalizeName(createGauge("system.cpu.utilization", "1"), "")) + require.Equal(t, "system_disk_operation_time_seconds_total", normalizeName(createCounter("system.disk.operation_time", "s"), "")) + require.Equal(t, "system_cpu_load_average_15m_ratio", normalizeName(createGauge("system.cpu.load_average.15m", "1"), "")) + require.Equal(t, "memcached_operation_hit_ratio_percent", normalizeName(createGauge("memcached.operation_hit_ratio", "%"), "")) + require.Equal(t, "mongodbatlas_process_asserts_per_second", normalizeName(createGauge("mongodbatlas.process.asserts", "{assertions}/s"), "")) + require.Equal(t, "mongodbatlas_process_journaling_data_files_mebibytes", normalizeName(createGauge("mongodbatlas.process.journaling.data_files", "MiBy"), "")) + require.Equal(t, "mongodbatlas_process_network_io_bytes_per_second", normalizeName(createGauge("mongodbatlas.process.network.io", "By/s"), "")) + require.Equal(t, "mongodbatlas_process_oplog_rate_gibibytes_per_hour", normalizeName(createGauge("mongodbatlas.process.oplog.rate", "GiBy/h"), "")) + require.Equal(t, "mongodbatlas_process_db_query_targeting_scanned_per_returned", normalizeName(createGauge("mongodbatlas.process.db.query_targeting.scanned_per_returned", "{scanned}/{returned}"), "")) + require.Equal(t, "nginx_requests", normalizeName(createGauge("nginx.requests", "requests"), "")) + require.Equal(t, "nginx_connections_accepted", normalizeName(createGauge("nginx.connections_accepted", "connections"), "")) + require.Equal(t, "nsxt_node_memory_usage_kilobytes", 
normalizeName(createGauge("nsxt.node.memory.usage", "KBy"), "")) + require.Equal(t, "redis_latest_fork_microseconds", normalizeName(createGauge("redis.latest_fork", "us"), "")) +} + +func TestTrimPromSuffixes(t *testing.T) { + assert.Equal(t, "active_directory_ds_replication_network_io", TrimPromSuffixes("active_directory_ds_replication_network_io_bytes_total", pmetric.MetricTypeSum, "bytes")) + assert.Equal(t, "active_directory_ds_name_cache_hit_rate", TrimPromSuffixes("active_directory_ds_name_cache_hit_rate_percent", pmetric.MetricTypeGauge, "percent")) + assert.Equal(t, "active_directory_ds_ldap_bind_last_successful_time", TrimPromSuffixes("active_directory_ds_ldap_bind_last_successful_time_milliseconds", pmetric.MetricTypeGauge, "milliseconds")) + assert.Equal(t, "apache_requests", TrimPromSuffixes("apache_requests_total", pmetric.MetricTypeSum, "1")) + assert.Equal(t, "system_cpu_utilization", TrimPromSuffixes("system_cpu_utilization_ratio", pmetric.MetricTypeGauge, "ratio")) + assert.Equal(t, "mongodbatlas_process_journaling_data_files", TrimPromSuffixes("mongodbatlas_process_journaling_data_files_mebibytes", pmetric.MetricTypeGauge, "mebibytes")) + assert.Equal(t, "mongodbatlas_process_network_io", TrimPromSuffixes("mongodbatlas_process_network_io_bytes_per_second", pmetric.MetricTypeGauge, "bytes_per_second")) + assert.Equal(t, "mongodbatlas_process_oplog_rate", TrimPromSuffixes("mongodbatlas_process_oplog_rate_gibibytes_per_hour", pmetric.MetricTypeGauge, "gibibytes_per_hour")) + assert.Equal(t, "nsxt_node_memory_usage", TrimPromSuffixes("nsxt_node_memory_usage_kilobytes", pmetric.MetricTypeGauge, "kilobytes")) + assert.Equal(t, "redis_latest_fork", TrimPromSuffixes("redis_latest_fork_microseconds", pmetric.MetricTypeGauge, "microseconds")) + assert.Equal(t, "up", TrimPromSuffixes("up", pmetric.MetricTypeGauge, "")) + + // These are not necessarily valid OM units, only tested for the sake of completeness. 
+ assert.Equal(t, "active_directory_ds_replication_sync_object_pending", TrimPromSuffixes("active_directory_ds_replication_sync_object_pending_total", pmetric.MetricTypeSum, "{objects}")) + assert.Equal(t, "apache_current", TrimPromSuffixes("apache_current_connections", pmetric.MetricTypeGauge, "connections")) + assert.Equal(t, "bigip_virtual_server_request_count", TrimPromSuffixes("bigip_virtual_server_request_count_total", pmetric.MetricTypeSum, "{requests}")) + assert.Equal(t, "mongodbatlas_process_db_query_targeting_scanned_per_returned", TrimPromSuffixes("mongodbatlas_process_db_query_targeting_scanned_per_returned", pmetric.MetricTypeGauge, "{scanned}/{returned}")) + assert.Equal(t, "nginx_connections_accepted", TrimPromSuffixes("nginx_connections_accepted", pmetric.MetricTypeGauge, "connections")) + assert.Equal(t, "apache_workers", TrimPromSuffixes("apache_workers_connections", pmetric.MetricTypeGauge, "connections")) + assert.Equal(t, "nginx", TrimPromSuffixes("nginx_requests", pmetric.MetricTypeGauge, "requests")) + + // Units shouldn't be trimmed if the unit is not a direct match with the suffix, i.e, a suffix "_seconds" shouldn't be removed if unit is "sec" or "s" + assert.Equal(t, "system_cpu_load_average_15m_ratio", TrimPromSuffixes("system_cpu_load_average_15m_ratio", pmetric.MetricTypeGauge, "1")) + assert.Equal(t, "mongodbatlas_process_asserts_per_second", TrimPromSuffixes("mongodbatlas_process_asserts_per_second", pmetric.MetricTypeGauge, "{assertions}/s")) + assert.Equal(t, "memcached_operation_hit_ratio_percent", TrimPromSuffixes("memcached_operation_hit_ratio_percent", pmetric.MetricTypeGauge, "%")) + assert.Equal(t, "active_directory_ds_replication_object_rate_per_second", TrimPromSuffixes("active_directory_ds_replication_object_rate_per_second", pmetric.MetricTypeGauge, "{objects}/s")) + assert.Equal(t, "system_disk_operation_time_seconds", TrimPromSuffixes("system_disk_operation_time_seconds_total", pmetric.MetricTypeSum, "s")) +} + +func TestNamespace(t *testing.T) { + require.Equal(t, "space_test", normalizeName(createGauge("test", ""), "space")) + require.Equal(t, "space_test", normalizeName(createGauge("#test", ""), "space")) +} + +func TestCleanUpString(t *testing.T) { + require.Equal(t, "", CleanUpString("")) + require.Equal(t, "a_b", CleanUpString("a b")) + require.Equal(t, "hello_world", CleanUpString("hello, world!")) + require.Equal(t, "hello_you_2", CleanUpString("hello you 2")) + require.Equal(t, "1000", CleanUpString("$1000")) + require.Equal(t, "", CleanUpString("*+$^=)")) +} + +func TestUnitMapGetOrDefault(t *testing.T) { + require.Equal(t, "", unitMapGetOrDefault("")) + require.Equal(t, "seconds", unitMapGetOrDefault("s")) + require.Equal(t, "invalid", unitMapGetOrDefault("invalid")) +} + +func TestPerUnitMapGetOrDefault(t *testing.T) { + require.Equal(t, "", perUnitMapGetOrDefault("")) + require.Equal(t, "second", perUnitMapGetOrDefault("s")) + require.Equal(t, "invalid", perUnitMapGetOrDefault("invalid")) +} + +func TestRemoveItem(t *testing.T) { + require.Equal(t, []string{}, removeItem([]string{}, "test")) + require.Equal(t, []string{}, removeItem([]string{}, "")) + require.Equal(t, []string{"a", "b", "c"}, removeItem([]string{"a", "b", "c"}, "d")) + require.Equal(t, []string{"a", "b", "c"}, removeItem([]string{"a", "b", "c"}, "")) + require.Equal(t, []string{"a", "b"}, removeItem([]string{"a", "b", "c"}, "c")) + require.Equal(t, []string{"a", "c"}, removeItem([]string{"a", "b", "c"}, "b")) + require.Equal(t, []string{"b", "c"}, 
removeItem([]string{"a", "b", "c"}, "a")) +} + +func TestBuildCompliantNameWithNormalize(t *testing.T) { + require.Equal(t, "system_io_bytes_total", BuildCompliantName(createCounter("system.io", "By"), "", true)) + require.Equal(t, "system_network_io_bytes_total", BuildCompliantName(createCounter("network.io", "By"), "system", true)) + require.Equal(t, "_3_14_digits", BuildCompliantName(createGauge("3.14 digits", ""), "", true)) + require.Equal(t, "envoy_rule_engine_zlib_buf_error", BuildCompliantName(createGauge("envoy__rule_engine_zlib_buf_error", ""), "", true)) + require.Equal(t, "foo_bar", BuildCompliantName(createGauge(":foo::bar", ""), "", true)) + require.Equal(t, "foo_bar_total", BuildCompliantName(createCounter(":foo::bar", ""), "", true)) + // Gauges with unit 1 are considered ratios. + require.Equal(t, "foo_bar_ratio", BuildCompliantName(createGauge("foo.bar", "1"), "", true)) + // Slashes in units are converted. + require.Equal(t, "system_io_foo_per_bar_total", BuildCompliantName(createCounter("system.io", "foo/bar"), "", true)) +} + +func TestBuildCompliantNameWithoutSuffixes(t *testing.T) { + require.Equal(t, "system_io", BuildCompliantName(createCounter("system.io", "By"), "", false)) + require.Equal(t, "system_network_io", BuildCompliantName(createCounter("network.io", "By"), "system", false)) + require.Equal(t, "system_network_I_O", BuildCompliantName(createCounter("network (I/O)", "By"), "system", false)) + require.Equal(t, "_3_14_digits", BuildCompliantName(createGauge("3.14 digits", "By"), "", false)) + require.Equal(t, "envoy__rule_engine_zlib_buf_error", BuildCompliantName(createGauge("envoy__rule_engine_zlib_buf_error", ""), "", false)) + require.Equal(t, ":foo::bar", BuildCompliantName(createGauge(":foo::bar", ""), "", false)) + require.Equal(t, ":foo::bar", BuildCompliantName(createCounter(":foo::bar", ""), "", false)) +} diff --git a/storage/remote/otlptranslator/prometheus/testutils_test.go b/storage/remote/otlptranslator/prometheus/testutils_test.go new file mode 100644 index 000000000..363328c57 --- /dev/null +++ b/storage/remote/otlptranslator/prometheus/testutils_test.go @@ -0,0 +1,49 @@ +// Copyright 2024 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// Provenance-includes-location: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/95e8f8fdc2a9dc87230406c9a3cf02be4fd68bea/pkg/translator/prometheus/testutils_test.go +// Provenance-includes-license: Apache-2.0 +// Provenance-includes-copyright: Copyright The OpenTelemetry Authors. 
+ +package prometheus + +import ( + "go.opentelemetry.io/collector/pdata/pmetric" +) + +var ilm pmetric.ScopeMetrics + +func init() { + + metrics := pmetric.NewMetrics() + resourceMetrics := metrics.ResourceMetrics().AppendEmpty() + ilm = resourceMetrics.ScopeMetrics().AppendEmpty() + +} + +// Returns a new Metric of type "Gauge" with specified name and unit +func createGauge(name string, unit string) pmetric.Metric { + gauge := ilm.Metrics().AppendEmpty() + gauge.SetName(name) + gauge.SetUnit(unit) + gauge.SetEmptyGauge() + return gauge +} + +// Returns a new Metric of type Monotonic Sum with specified name and unit +func createCounter(name string, unit string) pmetric.Metric { + counter := ilm.Metrics().AppendEmpty() + counter.SetEmptySum().SetIsMonotonic(true) + counter.SetName(name) + counter.SetUnit(unit) + return counter +} From a4a5994f69d9af423ec886b2c0e20a05453ac832 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Mon, 22 Jul 2024 15:07:12 -0700 Subject: [PATCH 012/137] clarify that 1.0 will eventually be deprecated, it is not yet deprecated Signed-off-by: Callum Styan --- config/config.go | 5 +++-- .../examples/remote_storage/example_write_adapter/README.md | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/config/config.go b/config/config.go index c924e3098..173689d6a 100644 --- a/config/config.go +++ b/config/config.go @@ -1085,8 +1085,9 @@ func (m RemoteWriteProtoMsgs) String() string { } var ( - // RemoteWriteProtoMsgV1 represents the deprecated `prometheus.WriteRequest` protobuf - // message introduced in the https://prometheus.io/docs/specs/remote_write_spec/. + // RemoteWriteProtoMsgV1 represents the `prometheus.WriteRequest` protobuf + // message introduced in the https://prometheus.io/docs/specs/remote_write_spec/, + // which will eventually be deprecated. // // NOTE: This string is used for both HTTP header values and config value, so don't change // this reference. 
diff --git a/documentation/examples/remote_storage/example_write_adapter/README.md b/documentation/examples/remote_storage/example_write_adapter/README.md index 739cf3be3..968d2b25c 100644 --- a/documentation/examples/remote_storage/example_write_adapter/README.md +++ b/documentation/examples/remote_storage/example_write_adapter/README.md @@ -19,7 +19,7 @@ remote_write: protobuf_message: "io.prometheus.write.v2.Request" ``` -or for deprecated Remote Write 1.0 message: +or for the eventually deprecated Remote Write 1.0 message: ```yaml remote_write: From 7b5897a46d3afe25912e97b73b89e599db66fde8 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Thu, 25 Jul 2024 17:51:29 +0100 Subject: [PATCH 013/137] Prepare release 2.54.0-rc.0 (#14498) Signed-off-by: Bryan Boreham --- CHANGELOG.md | 41 ++++++++++++++++++-- VERSION | 2 +- web/ui/module/codemirror-promql/package.json | 4 +- web/ui/module/lezer-promql/package.json | 2 +- web/ui/package-lock.json | 14 +++---- web/ui/package.json | 2 +- web/ui/react-app/package.json | 4 +- 7 files changed, 52 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e7314d041..02ffc5e4b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,44 @@ # Changelog -## unreleased +## 2.54.0-rc.0 / 2024-07-19 -* [FEATURE] Remote-Write: Add sender and receiver support for [Remote Write 2.0-rc.2](https://prometheus.io/docs/specs/remote_write_spec_2_0/) specification #14395 #14427 #14444 -* [ENHANCEMENT] Remote-Write: 1.x messages against Remote Write 2.x Receivers will have now correct values for `prometheus_storage__failed_total` in case of partial errors #14444 +Release 2.54 brings a release candidate of a major new version of [Remote Write: 2.0](https://prometheus.io/docs/specs/remote_write_spec_2_0/). +This is experimental at this time and may still change. +Remote-write v2 is enabled by default, but can be disabled via feature-flag `web.remote-write-receiver.accepted-protobuf-messages`. + +* [CHANGE] Remote-Write: `highest_timestamp_in_seconds` and `queue_highest_sent_timestamp_seconds` metrics now initialized to 0. #14437 +* [CHANGE] API: Split warnings from info annotations in API response. #14327 +* [FEATURE] Remote-Write: Version 2.0 experimental, plus metadata in WAL via feature flag `metadata-wal-records` (defaults on). #14395,#14427,#14444 +* [FEATURE] PromQL: add limitk() and limit_ratio() aggregation operators. #12503 +* [ENHANCEMENT] PromQL: Accept underscores in literal numbers, e.g. 1_000_000 for 1 million. #12821 +* [ENHANCEMENT] PromQL: float literal numbers and durations are now interchangeable (experimental). Example: `time() - my_timestamp > 10m`. #9138 +* [ENHANCEMENT] PromQL: use Kahan summation for sum(). #14074,#14362 +* [ENHANCEMENT] PromQL (experimental native histograms): Optimize `histogram_count` and `histogram_sum` functions. #14097 +* [ENHANCEMENT] TSDB: Better support for out-of-order experimental native histogram samples. #14438 +* [ENHANCEMENT] TSDB: Optimise seek within index. #14393 +* [ENHANCEMENT] TSDB: Optimise deletion of stale series. #14307 +* [ENHANCEMENT] TSDB: Reduce locking to optimise adding and removing series. #13286,#14286 +* [ENHANCEMENT] Regexps: Optimize patterns with multiple prefixes. #13843,#14368 +* [ENHANCEMENT] Regexps: Optimize patterns containing multiple literal strings. #14173 +* [ENHANCEMENT] AWS SD: expose Primary IPv6 addresses as __meta_ec2_primary_ipv6_addresses. #14156 +* [ENHANCEMENT] Docker SD: add MatchFirstNetwork for containers with multiple networks. 
#10490 +* [ENHANCEMENT] OpenStack SD: Use `flavor.original_name` if available. #14312 +* [ENHANCEMENT] UI (experimental native histograms): more accurate representation. #13680,#14430 +* [ENHANCEMENT] Agent: `out_of_order_time_window` config option now applies to agent. #14094 +* [ENHANCEMENT] Notifier: Send any outstanding Alertmanager notifications when shutting down. #14290 +* [ENHANCEMENT] Rules: Add label-matcher support to Rules API. #10194 +* [ENHANCEMENT] HTTP API: Add url to message logged on error while sending response. #14209 +* [BUGFIX] CLI: escape `|` characters when generating docs. #14420 +* [BUGFIX] PromQL (experimental native histograms): Fix some binary operators between native histogram values. #14454 +* [BUGFIX] TSDB: LabelNames API could fail during compaction. #14279 +* [BUGFIX] TSDB: Fix rare issue where pending OOO read can be left dangling if creating querier fails. #14341 +* [BUGFIX] TSDB: fix check for context cancellation in LabelNamesFor. #14302 +* [BUGFIX] Rules: Fix rare panic on reload. #14366 +* [BUGFIX] Config: In YAML marshalling, do not output a regexp field if it was never set. #14004 +* [BUGFIX] Remote-Write: reject samples with future timestamps. #14304 +* [BUGFIX] Remote-Write: Fix data corruption in remote write if max_sample_age is applied. #14078 +* [BUGFIX] Notifier: Fix Alertmanager discovery not updating under heavy load. #14174 +* [BUGFIX] Regexes: some Unicode characters were not matched by case-insensitive comparison. #14170,#14299 ## 2.53.1 / 2024-07-10 diff --git a/VERSION b/VERSION index f419e2c6f..69539c388 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.53.1 +2.54.0-rc.0 diff --git a/web/ui/module/codemirror-promql/package.json b/web/ui/module/codemirror-promql/package.json index ba924346f..02c1d2286 100644 --- a/web/ui/module/codemirror-promql/package.json +++ b/web/ui/module/codemirror-promql/package.json @@ -1,6 +1,6 @@ { "name": "@prometheus-io/codemirror-promql", - "version": "0.53.1", + "version": "0.54.0-rc.0", "description": "a CodeMirror mode for the PromQL language", "types": "dist/esm/index.d.ts", "module": "dist/esm/index.js", @@ -29,7 +29,7 @@ }, "homepage": "https://github.com/prometheus/prometheus/blob/main/web/ui/module/codemirror-promql/README.md", "dependencies": { - "@prometheus-io/lezer-promql": "0.53.1", + "@prometheus-io/lezer-promql": "0.54.0-rc.0", "lru-cache": "^7.18.3" }, "devDependencies": { diff --git a/web/ui/module/lezer-promql/package.json b/web/ui/module/lezer-promql/package.json index cbd03ae2b..af2fcae67 100644 --- a/web/ui/module/lezer-promql/package.json +++ b/web/ui/module/lezer-promql/package.json @@ -1,6 +1,6 @@ { "name": "@prometheus-io/lezer-promql", - "version": "0.53.1", + "version": "0.54.0-rc.0", "description": "lezer-based PromQL grammar", "main": "dist/index.cjs", "type": "module", diff --git a/web/ui/package-lock.json b/web/ui/package-lock.json index 62ac34e43..17bb0f272 100644 --- a/web/ui/package-lock.json +++ b/web/ui/package-lock.json @@ -1,12 +1,12 @@ { "name": "prometheus-io", - "version": "0.53.1", + "version": "0.54.0-rc.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "prometheus-io", - "version": "0.53.1", + "version": "0.54.0-rc.0", "workspaces": [ "react-app", "module/*" @@ -30,10 +30,10 @@ }, "module/codemirror-promql": { "name": "@prometheus-io/codemirror-promql", - "version": "0.53.1", + "version": "0.54.0-rc.0", "license": "Apache-2.0", "dependencies": { - "@prometheus-io/lezer-promql": "0.53.1", + "@prometheus-io/lezer-promql": "0.54.0-rc.0", 
"lru-cache": "^7.18.3" }, "devDependencies": { @@ -69,7 +69,7 @@ }, "module/lezer-promql": { "name": "@prometheus-io/lezer-promql", - "version": "0.53.1", + "version": "0.54.0-rc.0", "license": "Apache-2.0", "devDependencies": { "@lezer/generator": "^1.7.0", @@ -19332,7 +19332,7 @@ }, "react-app": { "name": "@prometheus-io/app", - "version": "0.53.1", + "version": "0.54.0-rc.0", "dependencies": { "@codemirror/autocomplete": "^6.17.0", "@codemirror/commands": "^6.6.0", @@ -19350,7 +19350,7 @@ "@lezer/lr": "^1.4.1", "@nexucis/fuzzy": "^0.4.1", "@nexucis/kvsearch": "^0.8.1", - "@prometheus-io/codemirror-promql": "0.53.1", + "@prometheus-io/codemirror-promql": "0.54.0-rc.0", "bootstrap": "^4.6.2", "css.escape": "^1.5.1", "downshift": "^9.0.6", diff --git a/web/ui/package.json b/web/ui/package.json index 693a73dec..80e8d815f 100644 --- a/web/ui/package.json +++ b/web/ui/package.json @@ -28,5 +28,5 @@ "ts-jest": "^29.2.2", "typescript": "^4.9.5" }, - "version": "0.53.1" + "version": "0.54.0-rc.0" } diff --git a/web/ui/react-app/package.json b/web/ui/react-app/package.json index c8002433a..df90049ce 100644 --- a/web/ui/react-app/package.json +++ b/web/ui/react-app/package.json @@ -1,6 +1,6 @@ { "name": "@prometheus-io/app", - "version": "0.53.1", + "version": "0.54.0-rc.0", "private": true, "dependencies": { "@codemirror/autocomplete": "^6.17.0", @@ -19,7 +19,7 @@ "@lezer/lr": "^1.4.1", "@nexucis/fuzzy": "^0.4.1", "@nexucis/kvsearch": "^0.8.1", - "@prometheus-io/codemirror-promql": "0.53.1", + "@prometheus-io/codemirror-promql": "0.54.0-rc.0", "bootstrap": "^4.6.2", "css.escape": "^1.5.1", "downshift": "^9.0.6", From be7a4c9b83a9f074f823d12e3d58338407fe76a1 Mon Sep 17 00:00:00 2001 From: Filip Petkovski Date: Fri, 26 Jul 2024 09:49:57 +0200 Subject: [PATCH 014/137] Ignore stale histograms for counter reset detection The histogram stats decoder keeps track of the last seen histogram sample in order to properly detect counter resets. We are seeing an issue where a histogram with UnknownResetHint gets treated as a counter reset when it follows a stale histogram sample. I believe that this is incorrect since stale samples should be completely ignored in PromQL. As a result, they should not be stored in the histogram stats iterator and the counter reset detection needs to be done against the last non-stale sample. 
Signed-off-by: Filip Petkovski --- promql/histogram_stats_iterator.go | 2 - promql/histogram_stats_iterator_test.go | 123 +++++++++++++++--------- tsdb/tsdbutil/histogram.go | 10 +- 3 files changed, 84 insertions(+), 51 deletions(-) diff --git a/promql/histogram_stats_iterator.go b/promql/histogram_stats_iterator.go index dfafea5f8..0a5f67ae7 100644 --- a/promql/histogram_stats_iterator.go +++ b/promql/histogram_stats_iterator.go @@ -48,7 +48,6 @@ func (f *histogramStatsIterator) AtHistogram(h *histogram.Histogram) (int64, *hi var t int64 t, f.currentH = f.Iterator.AtHistogram(f.currentH) if value.IsStaleNaN(f.currentH.Sum) { - f.setLastH(f.currentH) h = &histogram.Histogram{Sum: f.currentH.Sum} return t, h } @@ -77,7 +76,6 @@ func (f *histogramStatsIterator) AtFloatHistogram(fh *histogram.FloatHistogram) var t int64 t, f.currentFH = f.Iterator.AtFloatHistogram(f.currentFH) if value.IsStaleNaN(f.currentFH.Sum) { - f.setLastFH(f.currentFH) return t, &histogram.FloatHistogram{Sum: f.currentFH.Sum} } diff --git a/promql/histogram_stats_iterator_test.go b/promql/histogram_stats_iterator_test.go index b71a9d602..d5c081348 100644 --- a/promql/histogram_stats_iterator_test.go +++ b/promql/histogram_stats_iterator_test.go @@ -14,62 +14,99 @@ package promql import ( + "fmt" + "math" "testing" "github.com/stretchr/testify/require" "github.com/prometheus/prometheus/model/histogram" "github.com/prometheus/prometheus/model/labels" + "github.com/prometheus/prometheus/model/value" "github.com/prometheus/prometheus/tsdb/chunkenc" "github.com/prometheus/prometheus/tsdb/tsdbutil" ) func TestHistogramStatsDecoding(t *testing.T) { - histograms := []*histogram.Histogram{ - tsdbutil.GenerateTestHistogram(0), - tsdbutil.GenerateTestHistogram(1), - tsdbutil.GenerateTestHistogram(2), - tsdbutil.GenerateTestHistogram(2), - } - histograms[0].CounterResetHint = histogram.NotCounterReset - histograms[1].CounterResetHint = histogram.UnknownCounterReset - histograms[2].CounterResetHint = histogram.CounterReset - histograms[3].CounterResetHint = histogram.UnknownCounterReset - - expectedHints := []histogram.CounterResetHint{ - histogram.NotCounterReset, - histogram.NotCounterReset, - histogram.CounterReset, - histogram.NotCounterReset, + cases := []struct { + name string + histograms []*histogram.Histogram + expectedHints []histogram.CounterResetHint + }{ + { + name: "unknown counter reset triggers detection", + histograms: []*histogram.Histogram{ + tsdbutil.GenerateTestHistogramWithHint(0, histogram.NotCounterReset), + tsdbutil.GenerateTestHistogramWithHint(1, histogram.UnknownCounterReset), + tsdbutil.GenerateTestHistogramWithHint(2, histogram.CounterReset), + tsdbutil.GenerateTestHistogramWithHint(2, histogram.UnknownCounterReset), + }, + expectedHints: []histogram.CounterResetHint{ + histogram.NotCounterReset, + histogram.NotCounterReset, + histogram.CounterReset, + histogram.NotCounterReset, + }, + }, + { + name: "stale sample before unknown reset hint", + histograms: []*histogram.Histogram{ + tsdbutil.GenerateTestHistogramWithHint(0, histogram.NotCounterReset), + tsdbutil.GenerateTestHistogramWithHint(1, histogram.UnknownCounterReset), + {Sum: math.Float64frombits(value.StaleNaN)}, + tsdbutil.GenerateTestHistogramWithHint(1, histogram.UnknownCounterReset), + }, + expectedHints: []histogram.CounterResetHint{ + histogram.NotCounterReset, + histogram.NotCounterReset, + histogram.UnknownCounterReset, + histogram.NotCounterReset, + }, + }, } - t.Run("histogram_stats", func(t *testing.T) { - decodedStats := 
make([]*histogram.Histogram, 0) - statsIterator := NewHistogramStatsIterator(newHistogramSeries(histograms).Iterator(nil)) - for statsIterator.Next() != chunkenc.ValNone { - _, h := statsIterator.AtHistogram(nil) - decodedStats = append(decodedStats, h) - } - for i := 0; i < len(histograms); i++ { - require.Equal(t, expectedHints[i], decodedStats[i].CounterResetHint) - require.Equal(t, histograms[i].Count, decodedStats[i].Count) - require.Equal(t, histograms[i].Sum, decodedStats[i].Sum) - } - }) - t.Run("float_histogram_stats", func(t *testing.T) { - decodedStats := make([]*histogram.FloatHistogram, 0) - statsIterator := NewHistogramStatsIterator(newHistogramSeries(histograms).Iterator(nil)) - for statsIterator.Next() != chunkenc.ValNone { - _, h := statsIterator.AtFloatHistogram(nil) - decodedStats = append(decodedStats, h) - } - for i := 0; i < len(histograms); i++ { - fh := histograms[i].ToFloat(nil) - require.Equal(t, expectedHints[i], decodedStats[i].CounterResetHint) - require.Equal(t, fh.Count, decodedStats[i].Count) - require.Equal(t, fh.Sum, decodedStats[i].Sum) - } - }) + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Run("histogram_stats", func(t *testing.T) { + decodedStats := make([]*histogram.Histogram, 0) + statsIterator := NewHistogramStatsIterator(newHistogramSeries(tc.histograms).Iterator(nil)) + for statsIterator.Next() != chunkenc.ValNone { + _, h := statsIterator.AtHistogram(nil) + decodedStats = append(decodedStats, h) + } + for i := 0; i < len(tc.histograms); i++ { + require.Equal(t, tc.expectedHints[i], decodedStats[i].CounterResetHint, fmt.Sprintf("mismatch in counter reset hint for histogram %d", i)) + h := tc.histograms[i] + if value.IsStaleNaN(h.Sum) { + require.True(t, value.IsStaleNaN(decodedStats[i].Sum)) + require.Equal(t, uint64(0), decodedStats[i].Count) + } else { + require.Equal(t, tc.histograms[i].Count, decodedStats[i].Count) + require.Equal(t, tc.histograms[i].Sum, decodedStats[i].Sum) + } + } + }) + t.Run("float_histogram_stats", func(t *testing.T) { + decodedStats := make([]*histogram.FloatHistogram, 0) + statsIterator := NewHistogramStatsIterator(newHistogramSeries(tc.histograms).Iterator(nil)) + for statsIterator.Next() != chunkenc.ValNone { + _, h := statsIterator.AtFloatHistogram(nil) + decodedStats = append(decodedStats, h) + } + for i := 0; i < len(tc.histograms); i++ { + require.Equal(t, tc.expectedHints[i], decodedStats[i].CounterResetHint) + fh := tc.histograms[i].ToFloat(nil) + if value.IsStaleNaN(fh.Sum) { + require.True(t, value.IsStaleNaN(decodedStats[i].Sum)) + require.Equal(t, float64(0), decodedStats[i].Count) + } else { + require.Equal(t, fh.Count, decodedStats[i].Count) + require.Equal(t, fh.Sum, decodedStats[i].Sum) + } + } + }) + }) + } } type histogramSeries struct { diff --git a/tsdb/tsdbutil/histogram.go b/tsdb/tsdbutil/histogram.go index 3c7349cf7..ce934a638 100644 --- a/tsdb/tsdbutil/histogram.go +++ b/tsdb/tsdbutil/histogram.go @@ -30,12 +30,10 @@ func GenerateTestHistograms(n int) (r []*histogram.Histogram) { return r } -func GenerateTestHistogramsWithUnknownResetHint(n int) []*histogram.Histogram { - hs := GenerateTestHistograms(n) - for i := range hs { - hs[i].CounterResetHint = histogram.UnknownCounterReset - } - return hs +func GenerateTestHistogramWithHint(n int, hint histogram.CounterResetHint) *histogram.Histogram { + h := GenerateTestHistogram(n) + h.CounterResetHint = hint + return h } // GenerateTestHistogram but it is up to the user to set any known counter reset hint. 
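For illustration only, the rule that PATCH 014 enforces can be sketched as a small helper. The function below is assumed, not code from the patch; only the histogram and value package calls it uses (IsStaleNaN, DetectReset, the CounterResetHint constants) are real APIs. A stale marker is passed through unchanged and never recorded as the reference sample, so a later histogram with UnknownCounterReset is compared against the last non-stale sample.

package sketch

import (
	"github.com/prometheus/prometheus/model/histogram"
	"github.com/prometheus/prometheus/model/value"
)

// resolveHint returns the counter reset hint to report for cur, given the
// last non-stale histogram seen so far. Stale markers keep their own hint
// and must not become the reference point for reset detection.
func resolveHint(lastNonStale, cur *histogram.FloatHistogram) histogram.CounterResetHint {
	if value.IsStaleNaN(cur.Sum) {
		// Stale sample: return it as-is; the caller should not update lastNonStale.
		return cur.CounterResetHint
	}
	if cur.CounterResetHint != histogram.UnknownCounterReset {
		// An explicit hint is trusted.
		return cur.CounterResetHint
	}
	if lastNonStale != nil && cur.DetectReset(lastNonStale) {
		// Unknown hint: detect the reset against the last non-stale sample.
		return histogram.CounterReset
	}
	return histogram.NotCounterReset
}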
From 6e89250a5d937485a140c6ba6dcdb35d2db51cd0 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Fri, 26 Jul 2024 09:49:25 +0100 Subject: [PATCH 015/137] Revert "Chunked remote read: close the querier earlier" Believed to trigger segmentation faults due to memory-mapped block data still being accessed by iterators after the querier is closed. Signed-off-by: Bryan Boreham --- storage/remote/read_handler.go | 53 ++++++++++++++-------------------- 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/storage/remote/read_handler.go b/storage/remote/read_handler.go index 2a00ce897..ffc64c9c3 100644 --- a/storage/remote/read_handler.go +++ b/storage/remote/read_handler.go @@ -202,16 +202,34 @@ func (h *readHandler) remoteReadStreamedXORChunks(ctx context.Context, w http.Re return err } - chunks := h.getChunkSeriesSet(ctx, query, filteredMatchers) - if err := chunks.Err(); err != nil { + querier, err := h.queryable.ChunkQuerier(query.StartTimestampMs, query.EndTimestampMs) + if err != nil { return err } + defer func() { + if err := querier.Close(); err != nil { + level.Warn(h.logger).Log("msg", "Error on chunk querier close", "err", err.Error()) + } + }() + + var hints *storage.SelectHints + if query.Hints != nil { + hints = &storage.SelectHints{ + Start: query.Hints.StartMs, + End: query.Hints.EndMs, + Step: query.Hints.StepMs, + Func: query.Hints.Func, + Grouping: query.Hints.Grouping, + Range: query.Hints.RangeMs, + By: query.Hints.By, + } + } ws, err := StreamChunkedReadResponses( NewChunkedWriter(w, f), int64(i), // The streaming API has to provide the series sorted. - chunks, + querier.Select(ctx, true, hints, filteredMatchers...), sortedExternalLabels, h.remoteReadMaxBytesInFrame, h.marshalPool, @@ -236,35 +254,6 @@ func (h *readHandler) remoteReadStreamedXORChunks(ctx context.Context, w http.Re } } -// getChunkSeriesSet executes a query to retrieve a ChunkSeriesSet, -// encapsulating the operation in its own function to ensure timely release of -// the querier resources. -func (h *readHandler) getChunkSeriesSet(ctx context.Context, query *prompb.Query, filteredMatchers []*labels.Matcher) storage.ChunkSeriesSet { - querier, err := h.queryable.ChunkQuerier(query.StartTimestampMs, query.EndTimestampMs) - if err != nil { - return storage.ErrChunkSeriesSet(err) - } - defer func() { - if err := querier.Close(); err != nil { - level.Warn(h.logger).Log("msg", "Error on chunk querier close", "err", err.Error()) - } - }() - - var hints *storage.SelectHints - if query.Hints != nil { - hints = &storage.SelectHints{ - Start: query.Hints.StartMs, - End: query.Hints.EndMs, - Step: query.Hints.StepMs, - Func: query.Hints.Func, - Grouping: query.Hints.Grouping, - Range: query.Hints.RangeMs, - By: query.Hints.By, - } - } - return querier.Select(ctx, true, hints, filteredMatchers...) -} - // filterExtLabelsFromMatchers change equality matchers which match external labels // to a matcher that looks for an empty label, // as that label should not be present in the storage. 
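The lifetime rule that this revert restores can be shown with a short sketch. The streamAll helper below is assumed and not part of the patch; it only uses the storage.ChunkQuerier API that the diff itself calls. Select returns a lazy series set, so the querier must stay open until that set has been fully drained; closing it earlier can leave iterators pointing at memory-mapped block data that Close has already released.

package sketch

import (
	"context"

	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/storage"
)

// streamAll drains the chunk series set before the querier is closed.
// The deferred Close runs only after the loop finishes, which is the
// ordering the revert re-establishes for the remote read handler.
func streamAll(ctx context.Context, q storage.ChunkQuerier, matchers ...*labels.Matcher) error {
	defer q.Close()
	set := q.Select(ctx, true, nil, matchers...)
	for set.Next() {
		_ = set.At() // safe: the querier backing this series is still open
	}
	return set.Err()
}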
From 4fb2183437728b5107b27323ff1520de0fa21203 Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Fri, 26 Jul 2024 11:21:58 +0200 Subject: [PATCH 016/137] Test a couple more cases without suffix gen Signed-off-by: Arve Knudsen --- storage/remote/otlptranslator/prometheus/normalize_name_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/storage/remote/otlptranslator/prometheus/normalize_name_test.go b/storage/remote/otlptranslator/prometheus/normalize_name_test.go index ee25bb2df..07b9b0a78 100644 --- a/storage/remote/otlptranslator/prometheus/normalize_name_test.go +++ b/storage/remote/otlptranslator/prometheus/normalize_name_test.go @@ -200,4 +200,6 @@ func TestBuildCompliantNameWithoutSuffixes(t *testing.T) { require.Equal(t, "envoy__rule_engine_zlib_buf_error", BuildCompliantName(createGauge("envoy__rule_engine_zlib_buf_error", ""), "", false)) require.Equal(t, ":foo::bar", BuildCompliantName(createGauge(":foo::bar", ""), "", false)) require.Equal(t, ":foo::bar", BuildCompliantName(createCounter(":foo::bar", ""), "", false)) + require.Equal(t, "foo_bar", BuildCompliantName(createGauge("foo.bar", "1"), "", false)) + require.Equal(t, "system_io", BuildCompliantName(createCounter("system.io", "foo/bar"), "", false)) } From 9caba4be7d0f9e4bf2b7945a65ded197ba7acdc1 Mon Sep 17 00:00:00 2001 From: Sergey Date: Fri, 26 Jul 2024 15:32:11 +0300 Subject: [PATCH 017/137] chore: use HumanizeDuration and ConvertToFloat from prometheus/common Signed-off-by: Sergey --- template/template.go | 45 +++++--------------------------------------- 1 file changed, 5 insertions(+), 40 deletions(-) diff --git a/template/template.go b/template/template.go index dbe1607cf..c507dbe74 100644 --- a/template/template.go +++ b/template/template.go @@ -23,7 +23,6 @@ import ( "net" "net/url" "sort" - "strconv" "strings" text_template "text/template" "time" @@ -106,25 +105,6 @@ func query(ctx context.Context, q string, ts time.Time, queryFn QueryFunc) (quer return result, nil } -func convertToFloat(i interface{}) (float64, error) { - switch v := i.(type) { - case float64: - return v, nil - case string: - return strconv.ParseFloat(v, 64) - case int: - return float64(v), nil - case uint: - return float64(v), nil - case int64: - return float64(v), nil - case uint64: - return float64(v), nil - default: - return 0, fmt.Errorf("can't convert %T to float", v) - } -} - // Expander executes templates in text or HTML mode with a common set of Prometheus template functions. 
type Expander struct { text string @@ -219,7 +199,7 @@ func NewTemplateExpander( return host }, "humanize": func(i interface{}) (string, error) { - v, err := convertToFloat(i) + v, err := common_templates.ConvertToFloat(i) if err != nil { return "", err } @@ -248,7 +228,7 @@ func NewTemplateExpander( return fmt.Sprintf("%.4g%s", v, prefix), nil }, "humanize1024": func(i interface{}) (string, error) { - v, err := convertToFloat(i) + v, err := common_templates.ConvertToFloat(i) if err != nil { return "", err } @@ -267,30 +247,15 @@ func NewTemplateExpander( }, "humanizeDuration": common_templates.HumanizeDuration, "humanizePercentage": func(i interface{}) (string, error) { - v, err := convertToFloat(i) + v, err := common_templates.ConvertToFloat(i) if err != nil { return "", err } return fmt.Sprintf("%.4g%%", v*100), nil }, - "humanizeTimestamp": func(i interface{}) (string, error) { - v, err := convertToFloat(i) - if err != nil { - return "", err - } - - tm, err := floatToTime(v) - switch { - case errors.Is(err, errNaNOrInf): - return fmt.Sprintf("%.4g", v), nil - case err != nil: - return "", err - } - - return fmt.Sprint(tm), nil - }, + "humanizeTimestamp": common_templates.HumanizeTimestamp, "toTime": func(i interface{}) (*time.Time, error) { - v, err := convertToFloat(i) + v, err := common_templates.ConvertToFloat(i) if err != nil { return nil, err } From d4f098ae80fb276153efc757e373c813163da0e8 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 26 Jul 2024 14:55:39 +0200 Subject: [PATCH 018/137] Fix relabel.Regexp zero value marshalling (#14517) Signed-off-by: Marco Pracucci --- model/relabel/relabel.go | 4 ++++ model/relabel/relabel_test.go | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/model/relabel/relabel.go b/model/relabel/relabel.go index 4f33edda4..a88046596 100644 --- a/model/relabel/relabel.go +++ b/model/relabel/relabel.go @@ -213,6 +213,10 @@ func (re Regexp) IsZero() bool { // String returns the original string used to compile the regular expression. func (re Regexp) String() string { + if re.Regexp == nil { + return "" + } + str := re.Regexp.String() // Trim the anchor `^(?:` prefix and `)$` suffix. 
return str[4 : len(str)-2] diff --git a/model/relabel/relabel_test.go b/model/relabel/relabel_test.go index 0f11f7068..fc9952134 100644 --- a/model/relabel/relabel_test.go +++ b/model/relabel/relabel_test.go @@ -900,3 +900,16 @@ action: replace }) } } + +func TestRegexp_ShouldMarshalAndUnmarshalZeroValue(t *testing.T) { + var zero Regexp + + marshalled, err := yaml.Marshal(&zero) + require.NoError(t, err) + require.Equal(t, "null\n", string(marshalled)) + + var unmarshalled Regexp + err = yaml.Unmarshal(marshalled, &unmarshalled) + require.NoError(t, err) + require.Nil(t, unmarshalled.Regexp) +} From fe12924638d433c99b51f9acb1d7ebb9c1f40881 Mon Sep 17 00:00:00 2001 From: Kushal shukla <85934954+kushalShukla-web@users.noreply.github.com> Date: Mon, 29 Jul 2024 07:28:08 -0400 Subject: [PATCH 019/137] promtool: JUnit-Format XML Test Results (#14506) * Junit compatible output Signed-off-by: Kushal Shukla --- cmd/promtool/main.go | 7 ++- cmd/promtool/unittest.go | 40 +++++++++++++---- cmd/promtool/unittest_test.go | 50 +++++++++++++++++++++ docs/command-line/promtool.md | 9 ++++ util/junitxml/junitxml.go | 81 ++++++++++++++++++++++++++++++++++ util/junitxml/junitxml_test.go | 66 +++++++++++++++++++++++++++ 6 files changed, 243 insertions(+), 10 deletions(-) create mode 100644 util/junitxml/junitxml.go create mode 100644 util/junitxml/junitxml_test.go diff --git a/cmd/promtool/main.go b/cmd/promtool/main.go index e1d275e97..1c8e1dd1c 100644 --- a/cmd/promtool/main.go +++ b/cmd/promtool/main.go @@ -204,6 +204,7 @@ func main() { pushMetricsHeaders := pushMetricsCmd.Flag("header", "Prometheus remote write header.").StringMap() testCmd := app.Command("test", "Unit testing.") + junitOutFile := testCmd.Flag("junit", "File path to store JUnit XML test results.").OpenFile(os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o644) testRulesCmd := testCmd.Command("rules", "Unit tests for rules.") testRulesRun := testRulesCmd.Flag("run", "If set, will only run test groups whose names match the regular expression. Can be specified multiple times.").Strings() testRulesFiles := testRulesCmd.Arg( @@ -378,7 +379,11 @@ func main() { os.Exit(QueryLabels(serverURL, httpRoundTripper, *queryLabelsMatch, *queryLabelsName, *queryLabelsBegin, *queryLabelsEnd, p)) case testRulesCmd.FullCommand(): - os.Exit(RulesUnitTest( + results := io.Discard + if *junitOutFile != nil { + results = *junitOutFile + } + os.Exit(RulesUnitTestResult(results, promqltest.LazyLoaderOpts{ EnableAtModifier: true, EnableNegativeOffset: true, diff --git a/cmd/promtool/unittest.go b/cmd/promtool/unittest.go index 5451c5296..7030635d1 100644 --- a/cmd/promtool/unittest.go +++ b/cmd/promtool/unittest.go @@ -18,6 +18,7 @@ import ( "encoding/json" "errors" "fmt" + "io" "os" "path/filepath" "sort" @@ -29,9 +30,10 @@ import ( "github.com/google/go-cmp/cmp" "github.com/grafana/regexp" "github.com/nsf/jsondiff" - "github.com/prometheus/common/model" "gopkg.in/yaml.v2" + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/model/histogram" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/promql" @@ -39,12 +41,18 @@ import ( "github.com/prometheus/prometheus/promql/promqltest" "github.com/prometheus/prometheus/rules" "github.com/prometheus/prometheus/storage" + "github.com/prometheus/prometheus/util/junitxml" ) // RulesUnitTest does unit testing of rules based on the unit testing files provided. // More info about the file format can be found in the docs. 
func RulesUnitTest(queryOpts promqltest.LazyLoaderOpts, runStrings []string, diffFlag bool, files ...string) int { + return RulesUnitTestResult(io.Discard, queryOpts, runStrings, diffFlag, files...) +} + +func RulesUnitTestResult(results io.Writer, queryOpts promqltest.LazyLoaderOpts, runStrings []string, diffFlag bool, files ...string) int { failed := false + junit := &junitxml.JUnitXML{} var run *regexp.Regexp if runStrings != nil { @@ -52,7 +60,7 @@ func RulesUnitTest(queryOpts promqltest.LazyLoaderOpts, runStrings []string, dif } for _, f := range files { - if errs := ruleUnitTest(f, queryOpts, run, diffFlag); errs != nil { + if errs := ruleUnitTest(f, queryOpts, run, diffFlag, junit.Suite(f)); errs != nil { fmt.Fprintln(os.Stderr, " FAILED:") for _, e := range errs { fmt.Fprintln(os.Stderr, e.Error()) @@ -64,25 +72,30 @@ func RulesUnitTest(queryOpts promqltest.LazyLoaderOpts, runStrings []string, dif } fmt.Println() } + err := junit.WriteXML(results) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to write JUnit XML: %s\n", err) + } if failed { return failureExitCode } return successExitCode } -func ruleUnitTest(filename string, queryOpts promqltest.LazyLoaderOpts, run *regexp.Regexp, diffFlag bool) []error { - fmt.Println("Unit Testing: ", filename) - +func ruleUnitTest(filename string, queryOpts promqltest.LazyLoaderOpts, run *regexp.Regexp, diffFlag bool, ts *junitxml.TestSuite) []error { b, err := os.ReadFile(filename) if err != nil { + ts.Abort(err) return []error{err} } var unitTestInp unitTestFile if err := yaml.UnmarshalStrict(b, &unitTestInp); err != nil { + ts.Abort(err) return []error{err} } if err := resolveAndGlobFilepaths(filepath.Dir(filename), &unitTestInp); err != nil { + ts.Abort(err) return []error{err} } @@ -91,29 +104,38 @@ func ruleUnitTest(filename string, queryOpts promqltest.LazyLoaderOpts, run *reg } evalInterval := time.Duration(unitTestInp.EvaluationInterval) - + ts.Settime(time.Now().Format("2006-01-02T15:04:05")) // Giving number for groups mentioned in the file for ordering. // Lower number group should be evaluated before higher number group. groupOrderMap := make(map[string]int) for i, gn := range unitTestInp.GroupEvalOrder { if _, ok := groupOrderMap[gn]; ok { - return []error{fmt.Errorf("group name repeated in evaluation order: %s", gn)} + err := fmt.Errorf("group name repeated in evaluation order: %s", gn) + ts.Abort(err) + return []error{err} } groupOrderMap[gn] = i } // Testing. var errs []error - for _, t := range unitTestInp.Tests { + for i, t := range unitTestInp.Tests { if !matchesRun(t.TestGroupName, run) { continue } - + testname := t.TestGroupName + if testname == "" { + testname = fmt.Sprintf("unnamed#%d", i) + } + tc := ts.Case(testname) if t.Interval == 0 { t.Interval = unitTestInp.EvaluationInterval } ers := t.test(evalInterval, groupOrderMap, queryOpts, diffFlag, unitTestInp.RuleFiles...) if ers != nil { + for _, e := range ers { + tc.Fail(e.Error()) + } errs = append(errs, ers...) 
} } diff --git a/cmd/promtool/unittest_test.go b/cmd/promtool/unittest_test.go index 2dbd5a4e5..9bbac28e9 100644 --- a/cmd/promtool/unittest_test.go +++ b/cmd/promtool/unittest_test.go @@ -14,11 +14,15 @@ package main import ( + "bytes" + "encoding/xml" + "fmt" "testing" "github.com/stretchr/testify/require" "github.com/prometheus/prometheus/promql/promqltest" + "github.com/prometheus/prometheus/util/junitxml" ) func TestRulesUnitTest(t *testing.T) { @@ -125,13 +129,59 @@ func TestRulesUnitTest(t *testing.T) { want: 0, }, } + reuseFiles := []string{} + reuseCount := [2]int{} for _, tt := range tests { + if (tt.queryOpts == promqltest.LazyLoaderOpts{ + EnableNegativeOffset: true, + } || tt.queryOpts == promqltest.LazyLoaderOpts{ + EnableAtModifier: true, + }) { + reuseFiles = append(reuseFiles, tt.args.files...) + reuseCount[tt.want] += len(tt.args.files) + } t.Run(tt.name, func(t *testing.T) { if got := RulesUnitTest(tt.queryOpts, nil, false, tt.args.files...); got != tt.want { t.Errorf("RulesUnitTest() = %v, want %v", got, tt.want) } }) } + t.Run("Junit xml output ", func(t *testing.T) { + var buf bytes.Buffer + if got := RulesUnitTestResult(&buf, promqltest.LazyLoaderOpts{}, nil, false, reuseFiles...); got != 1 { + t.Errorf("RulesUnitTestResults() = %v, want 1", got) + } + var test junitxml.JUnitXML + output := buf.Bytes() + err := xml.Unmarshal(output, &test) + if err != nil { + fmt.Println("error in decoding XML:", err) + return + } + var total int + var passes int + var failures int + var cases int + total = len(test.Suites) + if total != len(reuseFiles) { + t.Errorf("JUnit output had %d testsuite elements; expected %d\n", total, len(reuseFiles)) + } + + for _, i := range test.Suites { + if i.FailureCount == 0 { + passes++ + } else { + failures++ + } + cases += len(i.Cases) + } + if total != passes+failures { + t.Errorf("JUnit output mismatch: Total testsuites (%d) does not equal the sum of passes (%d) and failures (%d).", total, passes, failures) + } + if cases < total { + t.Errorf("JUnit output had %d suites without test cases\n", total-cases) + } + }) } func TestRulesUnitTestRun(t *testing.T) { diff --git a/docs/command-line/promtool.md b/docs/command-line/promtool.md index 443cd3f0c..6bb80169a 100644 --- a/docs/command-line/promtool.md +++ b/docs/command-line/promtool.md @@ -442,6 +442,15 @@ Unit testing. +#### Flags + +| Flag | Description | +| --- | --- | +| --junit | File path to store JUnit XML test results. | + + + + ##### `promtool test rules` Unit tests for rules. diff --git a/util/junitxml/junitxml.go b/util/junitxml/junitxml.go new file mode 100644 index 000000000..14e4b6dba --- /dev/null +++ b/util/junitxml/junitxml.go @@ -0,0 +1,81 @@ +// Copyright 2024 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package junitxml + +import ( + "encoding/xml" + "io" +) + +type JUnitXML struct { + XMLName xml.Name `xml:"testsuites"` + Suites []*TestSuite `xml:"testsuite"` +} + +type TestSuite struct { + Name string `xml:"name,attr"` + TestCount int `xml:"tests,attr"` + FailureCount int `xml:"failures,attr"` + ErrorCount int `xml:"errors,attr"` + SkippedCount int `xml:"skipped,attr"` + Timestamp string `xml:"timestamp,attr"` + Cases []*TestCase `xml:"testcase"` +} +type TestCase struct { + Name string `xml:"name,attr"` + Failures []string `xml:"failure,omitempty"` + Error string `xml:"error,omitempty"` +} + +func (j *JUnitXML) WriteXML(h io.Writer) error { + return xml.NewEncoder(h).Encode(j) +} + +func (j *JUnitXML) Suite(name string) *TestSuite { + ts := &TestSuite{Name: name} + j.Suites = append(j.Suites, ts) + return ts +} + +func (ts *TestSuite) Fail(f string) { + ts.FailureCount++ + curt := ts.lastCase() + curt.Failures = append(curt.Failures, f) +} + +func (ts *TestSuite) lastCase() *TestCase { + if len(ts.Cases) == 0 { + ts.Case("unknown") + } + return ts.Cases[len(ts.Cases)-1] +} + +func (ts *TestSuite) Case(name string) *TestSuite { + j := &TestCase{ + Name: name, + } + ts.Cases = append(ts.Cases, j) + ts.TestCount++ + return ts +} + +func (ts *TestSuite) Settime(name string) { + ts.Timestamp = name +} + +func (ts *TestSuite) Abort(e error) { + ts.ErrorCount++ + curt := ts.lastCase() + curt.Error = e.Error() +} diff --git a/util/junitxml/junitxml_test.go b/util/junitxml/junitxml_test.go new file mode 100644 index 000000000..ad4d0293d --- /dev/null +++ b/util/junitxml/junitxml_test.go @@ -0,0 +1,66 @@ +// Copyright 2024 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package junitxml + +import ( + "bytes" + "encoding/xml" + "errors" + "testing" +) + +func TestJunitOutput(t *testing.T) { + var buf bytes.Buffer + var test JUnitXML + x := FakeTestSuites() + if err := x.WriteXML(&buf); err != nil { + t.Fatalf("Failed to encode XML: %v", err) + } + + output := buf.Bytes() + + err := xml.Unmarshal(output, &test) + if err != nil { + t.Errorf("Unmarshal failed with error: %v", err) + } + var total int + var cases int + total = len(test.Suites) + if total != 3 { + t.Errorf("JUnit output had %d testsuite elements; expected 3\n", total) + } + for _, i := range test.Suites { + cases += len(i.Cases) + } + + if cases != 7 { + t.Errorf("JUnit output had %d testcase; expected 7\n", cases) + } +} + +func FakeTestSuites() *JUnitXML { + ju := &JUnitXML{} + good := ju.Suite("all good") + good.Case("alpha") + good.Case("beta") + good.Case("gamma") + mixed := ju.Suite("mixed") + mixed.Case("good") + bad := mixed.Case("bad") + bad.Fail("once") + bad.Fail("twice") + mixed.Case("ugly").Abort(errors.New("buggy")) + ju.Suite("fast").Fail("fail early") + return ju +} From 2cd97c61e02ac9cf50e0fa4a72bbc61f8e128b8b Mon Sep 17 00:00:00 2001 From: Filip Petkovski Date: Mon, 29 Jul 2024 14:53:32 +0200 Subject: [PATCH 020/137] Add more test cases Signed-off-by: Filip Petkovski --- promql/histogram_stats_iterator_test.go | 33 +++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/promql/histogram_stats_iterator_test.go b/promql/histogram_stats_iterator_test.go index d5c081348..7a2953d3e 100644 --- a/promql/histogram_stats_iterator_test.go +++ b/promql/histogram_stats_iterator_test.go @@ -63,6 +63,39 @@ func TestHistogramStatsDecoding(t *testing.T) { histogram.NotCounterReset, }, }, + { + name: "unknown counter reset at the beginning", + histograms: []*histogram.Histogram{ + tsdbutil.GenerateTestHistogramWithHint(1, histogram.UnknownCounterReset), + }, + expectedHints: []histogram.CounterResetHint{ + histogram.NotCounterReset, + }, + }, + { + name: "detect real counter reset", + histograms: []*histogram.Histogram{ + tsdbutil.GenerateTestHistogramWithHint(2, histogram.UnknownCounterReset), + tsdbutil.GenerateTestHistogramWithHint(1, histogram.UnknownCounterReset), + }, + expectedHints: []histogram.CounterResetHint{ + histogram.NotCounterReset, + histogram.CounterReset, + }, + }, + { + name: "detect real counter reset after stale NaN", + histograms: []*histogram.Histogram{ + tsdbutil.GenerateTestHistogramWithHint(2, histogram.UnknownCounterReset), + {Sum: math.Float64frombits(value.StaleNaN)}, + tsdbutil.GenerateTestHistogramWithHint(1, histogram.UnknownCounterReset), + }, + expectedHints: []histogram.CounterResetHint{ + histogram.NotCounterReset, + histogram.UnknownCounterReset, + histogram.CounterReset, + }, + }, } for _, tc := range cases { From d186caead514dd81bf7f4d57629089b39ff2635a Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Mon, 29 Jul 2024 14:41:10 +0100 Subject: [PATCH 021/137] Merge pull request #14496 from bboreham/fix-nil-primary (#14509) [BUGFIX] Storage: errors from a single secondary querier should be warnings. This is a backport of #14496 to release-2.54 branch. #13434 introduced an unwanted change in behaviour: if there was no primary querier and a single secondary querier, the secondary would be treated like a primary. This PR restores the previous behaviour, that all secondary queriers report errors as warnings. 
In order to test this behaviour, I changed `TestMergeQuerierWithSecondaries_ErrorHandling` so it now calls `NewMergeQuerier` rather than creating the internal data structure directly. This in turn required all the data types to change, so I merged `mockGenericQuerier` into `mockQuerier`. Also replaced `unwrapMockGenericQuerier` with a visitor pattern. While I was there, I addressed the comment from https://github.com/prometheus/prometheus/pull/13434#pullrequestreview-2191058921 to short-circuit the merge of single querier with any number of no-op or nil queriers. Signed-off-by: Bryan Boreham --- storage/merge.go | 50 ++++--- storage/merge_test.go | 318 ++++++++++++++++++++++-------------------- 2 files changed, 204 insertions(+), 164 deletions(-) diff --git a/storage/merge.go b/storage/merge.go index 194494b6a..2424b26ab 100644 --- a/storage/merge.go +++ b/storage/merge.go @@ -45,25 +45,24 @@ type mergeGenericQuerier struct { // // In case of overlaps between the data given by primaries' and secondaries' Selects, merge function will be used. func NewMergeQuerier(primaries, secondaries []Querier, mergeFn VerticalSeriesMergeFunc) Querier { + primaries = filterQueriers(primaries) + secondaries = filterQueriers(secondaries) + switch { - case len(primaries)+len(secondaries) == 0: + case len(primaries) == 0 && len(secondaries) == 0: return noopQuerier{} case len(primaries) == 1 && len(secondaries) == 0: return primaries[0] case len(primaries) == 0 && len(secondaries) == 1: - return secondaries[0] + return &querierAdapter{newSecondaryQuerierFrom(secondaries[0])} } queriers := make([]genericQuerier, 0, len(primaries)+len(secondaries)) for _, q := range primaries { - if _, ok := q.(noopQuerier); !ok && q != nil { - queriers = append(queriers, newGenericQuerierFrom(q)) - } + queriers = append(queriers, newGenericQuerierFrom(q)) } for _, q := range secondaries { - if _, ok := q.(noopQuerier); !ok && q != nil { - queriers = append(queriers, newSecondaryQuerierFrom(q)) - } + queriers = append(queriers, newSecondaryQuerierFrom(q)) } concurrentSelect := false @@ -77,31 +76,40 @@ func NewMergeQuerier(primaries, secondaries []Querier, mergeFn VerticalSeriesMer }} } +func filterQueriers(qs []Querier) []Querier { + ret := make([]Querier, 0, len(qs)) + for _, q := range qs { + if _, ok := q.(noopQuerier); !ok && q != nil { + ret = append(ret, q) + } + } + return ret +} + // NewMergeChunkQuerier returns a new Chunk Querier that merges results of given primary and secondary chunk queriers. // See NewFanout commentary to learn more about primary vs secondary differences. // // In case of overlaps between the data given by primaries' and secondaries' Selects, merge function will be used. // TODO(bwplotka): Currently merge will compact overlapping chunks with bigger chunk, without limit. 
Split it: https://github.com/prometheus/tsdb/issues/670 func NewMergeChunkQuerier(primaries, secondaries []ChunkQuerier, mergeFn VerticalChunkSeriesMergeFunc) ChunkQuerier { + primaries = filterChunkQueriers(primaries) + secondaries = filterChunkQueriers(secondaries) + switch { case len(primaries) == 0 && len(secondaries) == 0: return noopChunkQuerier{} case len(primaries) == 1 && len(secondaries) == 0: return primaries[0] case len(primaries) == 0 && len(secondaries) == 1: - return secondaries[0] + return &chunkQuerierAdapter{newSecondaryQuerierFromChunk(secondaries[0])} } queriers := make([]genericQuerier, 0, len(primaries)+len(secondaries)) for _, q := range primaries { - if _, ok := q.(noopChunkQuerier); !ok && q != nil { - queriers = append(queriers, newGenericQuerierFromChunk(q)) - } + queriers = append(queriers, newGenericQuerierFromChunk(q)) } - for _, querier := range secondaries { - if _, ok := querier.(noopChunkQuerier); !ok && querier != nil { - queriers = append(queriers, newSecondaryQuerierFromChunk(querier)) - } + for _, q := range secondaries { + queriers = append(queriers, newSecondaryQuerierFromChunk(q)) } concurrentSelect := false @@ -115,6 +123,16 @@ func NewMergeChunkQuerier(primaries, secondaries []ChunkQuerier, mergeFn Vertica }} } +func filterChunkQueriers(qs []ChunkQuerier) []ChunkQuerier { + ret := make([]ChunkQuerier, 0, len(qs)) + for _, q := range qs { + if _, ok := q.(noopChunkQuerier); !ok && q != nil { + ret = append(ret, q) + } + } + return ret +} + // Select returns a set of series that matches the given label matchers. func (q *mergeGenericQuerier) Select(ctx context.Context, sortSeries bool, hints *SelectHints, matchers ...*labels.Matcher) genericSeriesSet { seriesSets := make([]genericSeriesSet, 0, len(q.queriers)) diff --git a/storage/merge_test.go b/storage/merge_test.go index 7619af3c1..b145743c8 100644 --- a/storage/merge_test.go +++ b/storage/merge_test.go @@ -912,9 +912,23 @@ func TestConcatenatingChunkIterator(t *testing.T) { } type mockQuerier struct { - LabelQuerier + mtx sync.Mutex - toReturn []Series + toReturn []Series // Response for Select. + + closed bool + labelNamesCalls int + labelNamesRequested []labelNameRequest + sortedSeriesRequested []bool + + resp []string // Response for LabelNames and LabelValues; turned into Select response if toReturn is not supplied. + warnings annotations.Annotations + err error +} + +type labelNameRequest struct { + name string + matchers []*labels.Matcher } type seriesByLabel []Series @@ -924,13 +938,47 @@ func (a seriesByLabel) Swap(i, j int) { a[i], a[j] = a[j], a[i] } func (a seriesByLabel) Less(i, j int) bool { return labels.Compare(a[i].Labels(), a[j].Labels()) < 0 } func (m *mockQuerier) Select(_ context.Context, sortSeries bool, _ *SelectHints, _ ...*labels.Matcher) SeriesSet { - cpy := make([]Series, len(m.toReturn)) - copy(cpy, m.toReturn) + m.mtx.Lock() + defer m.mtx.Unlock() + m.sortedSeriesRequested = append(m.sortedSeriesRequested, sortSeries) + + var ret []Series + if len(m.toReturn) > 0 { + ret = make([]Series, len(m.toReturn)) + copy(ret, m.toReturn) + } else if len(m.resp) > 0 { + ret = make([]Series, 0, len(m.resp)) + for _, l := range m.resp { + ret = append(ret, NewListSeries(labels.FromStrings("test", l), nil)) + } + } if sortSeries { - sort.Sort(seriesByLabel(cpy)) + sort.Sort(seriesByLabel(ret)) } - return NewMockSeriesSet(cpy...) 
+ return &mockSeriesSet{idx: -1, series: ret, warnings: m.warnings, err: m.err} +} + +func (m *mockQuerier) LabelValues(_ context.Context, name string, hints *LabelHints, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error) { + m.mtx.Lock() + m.labelNamesRequested = append(m.labelNamesRequested, labelNameRequest{ + name: name, + matchers: matchers, + }) + m.mtx.Unlock() + return m.resp, m.warnings, m.err +} + +func (m *mockQuerier) LabelNames(context.Context, *LabelHints, ...*labels.Matcher) ([]string, annotations.Annotations, error) { + m.mtx.Lock() + m.labelNamesCalls++ + m.mtx.Unlock() + return m.resp, m.warnings, m.err +} + +func (m *mockQuerier) Close() error { + m.closed = true + return nil } type mockChunkQuerier struct { @@ -960,6 +1008,9 @@ func (m *mockChunkQuerier) Select(_ context.Context, sortSeries bool, _ *SelectH type mockSeriesSet struct { idx int series []Series + + warnings annotations.Annotations + err error } func NewMockSeriesSet(series ...Series) SeriesSet { @@ -970,15 +1021,18 @@ func NewMockSeriesSet(series ...Series) SeriesSet { } func (m *mockSeriesSet) Next() bool { + if m.err != nil { + return false + } m.idx++ return m.idx < len(m.series) } func (m *mockSeriesSet) At() Series { return m.series[m.idx] } -func (m *mockSeriesSet) Err() error { return nil } +func (m *mockSeriesSet) Err() error { return m.err } -func (m *mockSeriesSet) Warnings() annotations.Annotations { return nil } +func (m *mockSeriesSet) Warnings() annotations.Annotations { return m.warnings } type mockChunkSeriesSet struct { idx int @@ -1336,105 +1390,44 @@ func BenchmarkMergeSeriesSet(b *testing.B) { } } -type mockGenericQuerier struct { - mtx sync.Mutex - - closed bool - labelNamesCalls int - labelNamesRequested []labelNameRequest - sortedSeriesRequested []bool - - resp []string - warnings annotations.Annotations - err error -} - -type labelNameRequest struct { - name string - matchers []*labels.Matcher -} - -func (m *mockGenericQuerier) Select(_ context.Context, b bool, _ *SelectHints, _ ...*labels.Matcher) genericSeriesSet { - m.mtx.Lock() - m.sortedSeriesRequested = append(m.sortedSeriesRequested, b) - m.mtx.Unlock() - return &mockGenericSeriesSet{resp: m.resp, warnings: m.warnings, err: m.err} -} - -func (m *mockGenericQuerier) LabelValues(_ context.Context, name string, hints *LabelHints, matchers ...*labels.Matcher) ([]string, annotations.Annotations, error) { - m.mtx.Lock() - m.labelNamesRequested = append(m.labelNamesRequested, labelNameRequest{ - name: name, - matchers: matchers, - }) - m.mtx.Unlock() - return m.resp, m.warnings, m.err -} - -func (m *mockGenericQuerier) LabelNames(context.Context, *LabelHints, ...*labels.Matcher) ([]string, annotations.Annotations, error) { - m.mtx.Lock() - m.labelNamesCalls++ - m.mtx.Unlock() - return m.resp, m.warnings, m.err -} - -func (m *mockGenericQuerier) Close() error { - m.closed = true - return nil -} - -type mockGenericSeriesSet struct { - resp []string - warnings annotations.Annotations - err error - - curr int -} - -func (m *mockGenericSeriesSet) Next() bool { - if m.err != nil { - return false +func visitMockQueriers(t *testing.T, qr Querier, f func(t *testing.T, q *mockQuerier)) int { + count := 0 + switch x := qr.(type) { + case *mockQuerier: + count++ + f(t, x) + case *querierAdapter: + count += visitMockQueriersInGenericQuerier(t, x.genericQuerier, f) } - if m.curr >= len(m.resp) { - return false + return count +} + +func visitMockQueriersInGenericQuerier(t *testing.T, g genericQuerier, f func(t *testing.T, q 
*mockQuerier)) int { + count := 0 + switch x := g.(type) { + case *mergeGenericQuerier: + for _, q := range x.queriers { + count += visitMockQueriersInGenericQuerier(t, q, f) + } + case *genericQuerierAdapter: + // Visitor for chunkQuerier not implemented. + count += visitMockQueriers(t, x.q, f) + case *secondaryQuerier: + count += visitMockQueriersInGenericQuerier(t, x.genericQuerier, f) } - m.curr++ - return true + return count } -func (m *mockGenericSeriesSet) Err() error { return m.err } -func (m *mockGenericSeriesSet) Warnings() annotations.Annotations { return m.warnings } - -func (m *mockGenericSeriesSet) At() Labels { - return mockLabels(m.resp[m.curr-1]) -} - -type mockLabels string - -func (l mockLabels) Labels() labels.Labels { - return labels.FromStrings("test", string(l)) -} - -func unwrapMockGenericQuerier(t *testing.T, qr genericQuerier) *mockGenericQuerier { - m, ok := qr.(*mockGenericQuerier) - if !ok { - s, ok := qr.(*secondaryQuerier) - require.True(t, ok, "expected secondaryQuerier got something else") - m, ok = s.genericQuerier.(*mockGenericQuerier) - require.True(t, ok, "expected mockGenericQuerier got something else") - } - return m -} - -func TestMergeGenericQuerierWithSecondaries_ErrorHandling(t *testing.T) { +func TestMergeQuerierWithSecondaries_ErrorHandling(t *testing.T) { var ( errStorage = errors.New("storage error") warnStorage = errors.New("storage warning") ctx = context.Background() ) for _, tcase := range []struct { - name string - queriers []genericQuerier + name string + primaries []Querier + secondaries []Querier expectedSelectsSeries []labels.Labels expectedLabels []string @@ -1443,10 +1436,8 @@ func TestMergeGenericQuerierWithSecondaries_ErrorHandling(t *testing.T) { expectedErrs [4]error }{ { - // NewMergeQuerier will not create a mergeGenericQuerier - // with just one querier inside, but we can test it anyway. 
- name: "one successful primary querier", - queriers: []genericQuerier{&mockGenericQuerier{resp: []string{"a", "b"}, warnings: nil, err: nil}}, + name: "one successful primary querier", + primaries: []Querier{&mockQuerier{resp: []string{"a", "b"}, warnings: nil, err: nil}}, expectedSelectsSeries: []labels.Labels{ labels.FromStrings("test", "a"), labels.FromStrings("test", "b"), @@ -1455,9 +1446,9 @@ func TestMergeGenericQuerierWithSecondaries_ErrorHandling(t *testing.T) { }, { name: "multiple successful primary queriers", - queriers: []genericQuerier{ - &mockGenericQuerier{resp: []string{"a", "b"}, warnings: nil, err: nil}, - &mockGenericQuerier{resp: []string{"b", "c"}, warnings: nil, err: nil}, + primaries: []Querier{ + &mockQuerier{resp: []string{"a", "b"}, warnings: nil, err: nil}, + &mockQuerier{resp: []string{"b", "c"}, warnings: nil, err: nil}, }, expectedSelectsSeries: []labels.Labels{ labels.FromStrings("test", "a"), @@ -1468,15 +1459,17 @@ func TestMergeGenericQuerierWithSecondaries_ErrorHandling(t *testing.T) { }, { name: "one failed primary querier", - queriers: []genericQuerier{&mockGenericQuerier{warnings: nil, err: errStorage}}, + primaries: []Querier{&mockQuerier{warnings: nil, err: errStorage}}, expectedErrs: [4]error{errStorage, errStorage, errStorage, errStorage}, }, { name: "one successful primary querier with successful secondaries", - queriers: []genericQuerier{ - &mockGenericQuerier{resp: []string{"a", "b"}, warnings: nil, err: nil}, - &secondaryQuerier{genericQuerier: &mockGenericQuerier{resp: []string{"b"}, warnings: nil, err: nil}}, - &secondaryQuerier{genericQuerier: &mockGenericQuerier{resp: []string{"c"}, warnings: nil, err: nil}}, + primaries: []Querier{ + &mockQuerier{resp: []string{"a", "b"}, warnings: nil, err: nil}, + }, + secondaries: []Querier{ + &mockQuerier{resp: []string{"b"}, warnings: nil, err: nil}, + &mockQuerier{resp: []string{"c"}, warnings: nil, err: nil}, }, expectedSelectsSeries: []labels.Labels{ labels.FromStrings("test", "a"), @@ -1487,10 +1480,12 @@ func TestMergeGenericQuerierWithSecondaries_ErrorHandling(t *testing.T) { }, { name: "one successful primary querier with empty response and successful secondaries", - queriers: []genericQuerier{ - &mockGenericQuerier{resp: []string{}, warnings: nil, err: nil}, - &secondaryQuerier{genericQuerier: &mockGenericQuerier{resp: []string{"b"}, warnings: nil, err: nil}}, - &secondaryQuerier{genericQuerier: &mockGenericQuerier{resp: []string{"c"}, warnings: nil, err: nil}}, + primaries: []Querier{ + &mockQuerier{resp: []string{}, warnings: nil, err: nil}, + }, + secondaries: []Querier{ + &mockQuerier{resp: []string{"b"}, warnings: nil, err: nil}, + &mockQuerier{resp: []string{"c"}, warnings: nil, err: nil}, }, expectedSelectsSeries: []labels.Labels{ labels.FromStrings("test", "b"), @@ -1500,19 +1495,42 @@ func TestMergeGenericQuerierWithSecondaries_ErrorHandling(t *testing.T) { }, { name: "one failed primary querier with successful secondaries", - queriers: []genericQuerier{ - &mockGenericQuerier{warnings: nil, err: errStorage}, - &secondaryQuerier{genericQuerier: &mockGenericQuerier{resp: []string{"b"}, warnings: nil, err: nil}}, - &secondaryQuerier{genericQuerier: &mockGenericQuerier{resp: []string{"c"}, warnings: nil, err: nil}}, + primaries: []Querier{ + &mockQuerier{warnings: nil, err: errStorage}, + }, + secondaries: []Querier{ + &mockQuerier{resp: []string{"b"}, warnings: nil, err: nil}, + &mockQuerier{resp: []string{"c"}, warnings: nil, err: nil}, }, expectedErrs: [4]error{errStorage, 
errStorage, errStorage, errStorage}, }, + { + name: "nil primary querier with failed secondary", + primaries: nil, + secondaries: []Querier{ + &mockQuerier{resp: []string{"b"}, warnings: nil, err: errStorage}, + }, + expectedLabels: []string{}, + expectedWarnings: annotations.New().Add(errStorage), + }, + { + name: "nil primary querier with two failed secondaries", + primaries: nil, + secondaries: []Querier{ + &mockQuerier{resp: []string{"b"}, warnings: nil, err: errStorage}, + &mockQuerier{resp: []string{"c"}, warnings: nil, err: errStorage}, + }, + expectedLabels: []string{}, + expectedWarnings: annotations.New().Add(errStorage), + }, { name: "one successful primary querier with failed secondaries", - queriers: []genericQuerier{ - &mockGenericQuerier{resp: []string{"a"}, warnings: nil, err: nil}, - &secondaryQuerier{genericQuerier: &mockGenericQuerier{resp: []string{"b"}, warnings: nil, err: errStorage}}, - &secondaryQuerier{genericQuerier: &mockGenericQuerier{resp: []string{"c"}, warnings: nil, err: errStorage}}, + primaries: []Querier{ + &mockQuerier{resp: []string{"a"}, warnings: nil, err: nil}, + }, + secondaries: []Querier{ + &mockQuerier{resp: []string{"b"}, warnings: nil, err: errStorage}, + &mockQuerier{resp: []string{"c"}, warnings: nil, err: errStorage}, }, expectedSelectsSeries: []labels.Labels{ labels.FromStrings("test", "a"), @@ -1522,9 +1540,11 @@ func TestMergeGenericQuerierWithSecondaries_ErrorHandling(t *testing.T) { }, { name: "successful queriers with warnings", - queriers: []genericQuerier{ - &mockGenericQuerier{resp: []string{"a"}, warnings: annotations.New().Add(warnStorage), err: nil}, - &secondaryQuerier{genericQuerier: &mockGenericQuerier{resp: []string{"b"}, warnings: annotations.New().Add(warnStorage), err: nil}}, + primaries: []Querier{ + &mockQuerier{resp: []string{"a"}, warnings: annotations.New().Add(warnStorage), err: nil}, + }, + secondaries: []Querier{ + &mockQuerier{resp: []string{"b"}, warnings: annotations.New().Add(warnStorage), err: nil}, }, expectedSelectsSeries: []labels.Labels{ labels.FromStrings("test", "a"), @@ -1535,10 +1555,7 @@ func TestMergeGenericQuerierWithSecondaries_ErrorHandling(t *testing.T) { }, } { t.Run(tcase.name, func(t *testing.T) { - q := &mergeGenericQuerier{ - queriers: tcase.queriers, - mergeFn: func(l ...Labels) Labels { return l[0] }, - } + q := NewMergeQuerier(tcase.primaries, tcase.secondaries, func(s ...Series) Series { return s[0] }) t.Run("Select", func(t *testing.T) { res := q.Select(context.Background(), false, nil) @@ -1551,65 +1568,70 @@ func TestMergeGenericQuerierWithSecondaries_ErrorHandling(t *testing.T) { require.ErrorIs(t, res.Err(), tcase.expectedErrs[0], "expected error doesn't match") require.Equal(t, tcase.expectedSelectsSeries, lbls) - for _, qr := range q.queriers { - m := unwrapMockGenericQuerier(t, qr) - // mergeGenericQuerier forces all Selects to be sorted. - require.Equal(t, []bool{true}, m.sortedSeriesRequested) - } + n := visitMockQueriers(t, q, func(t *testing.T, m *mockQuerier) { + // Single queries should be unsorted; merged queries sorted. + exp := len(tcase.primaries)+len(tcase.secondaries) > 1 + require.Equal(t, []bool{exp}, m.sortedSeriesRequested) + }) + // Check we visited all queriers. 
+ require.Equal(t, len(tcase.primaries)+len(tcase.secondaries), n) }) t.Run("LabelNames", func(t *testing.T) { res, w, err := q.LabelNames(ctx, nil) require.Subset(t, tcase.expectedWarnings, w) require.ErrorIs(t, err, tcase.expectedErrs[1], "expected error doesn't match") - require.Equal(t, tcase.expectedLabels, res) + requireEqualSlice(t, tcase.expectedLabels, res) if err != nil { return } - for _, qr := range q.queriers { - m := unwrapMockGenericQuerier(t, qr) - + visitMockQueriers(t, q, func(t *testing.T, m *mockQuerier) { require.Equal(t, 1, m.labelNamesCalls) - } + }) }) t.Run("LabelValues", func(t *testing.T) { res, w, err := q.LabelValues(ctx, "test", nil) require.Subset(t, tcase.expectedWarnings, w) require.ErrorIs(t, err, tcase.expectedErrs[2], "expected error doesn't match") - require.Equal(t, tcase.expectedLabels, res) + requireEqualSlice(t, tcase.expectedLabels, res) if err != nil { return } - for _, qr := range q.queriers { - m := unwrapMockGenericQuerier(t, qr) - + visitMockQueriers(t, q, func(t *testing.T, m *mockQuerier) { require.Equal(t, []labelNameRequest{{name: "test"}}, m.labelNamesRequested) - } + }) }) t.Run("LabelValuesWithMatchers", func(t *testing.T) { matcher := labels.MustNewMatcher(labels.MatchEqual, "otherLabel", "someValue") res, w, err := q.LabelValues(ctx, "test2", nil, matcher) require.Subset(t, tcase.expectedWarnings, w) require.ErrorIs(t, err, tcase.expectedErrs[3], "expected error doesn't match") - require.Equal(t, tcase.expectedLabels, res) + requireEqualSlice(t, tcase.expectedLabels, res) if err != nil { return } - for _, qr := range q.queriers { - m := unwrapMockGenericQuerier(t, qr) - + visitMockQueriers(t, q, func(t *testing.T, m *mockQuerier) { require.Equal(t, []labelNameRequest{ {name: "test"}, {name: "test2", matchers: []*labels.Matcher{matcher}}, }, m.labelNamesRequested) - } + }) }) }) } } +// Check slice but ignore difference between nil and empty. +func requireEqualSlice[T any](t require.TestingT, a, b []T, msgAndArgs ...interface{}) { + if len(a) == 0 { + require.Empty(t, b, msgAndArgs...) + } else { + require.Equal(t, a, b, msgAndArgs...) + } +} + type errIterator struct { err error } From b7f2f3c3ac90f2347de6112c185a4e470e7ae8a6 Mon Sep 17 00:00:00 2001 From: Oleg Zaytsev Date: Tue, 30 Jul 2024 10:19:56 +0200 Subject: [PATCH 022/137] Add BenchmarkLoadRealWLs This benchmark runs on real WLs rather than fake generated ones. Signed-off-by: Oleg Zaytsev --- tsdb/head_test.go | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tsdb/head_test.go b/tsdb/head_test.go index c192c8a07..09927c23c 100644 --- a/tsdb/head_test.go +++ b/tsdb/head_test.go @@ -23,6 +23,7 @@ import ( "path" "path/filepath" "reflect" + "runtime/pprof" "sort" "strconv" "strings" @@ -89,6 +90,43 @@ func newTestHeadWithOptions(t testing.TB, compressWAL wlog.CompressionType, opts return h, wal } +// BenchmarkLoadRealWLs will be skipped unless the BENCHMARK_LOAD_REAL_WLS_DIR environment variable is set. +// BENCHMARK_LOAD_REAL_WLS_DIR should be the folder where `wal` and `chunks_head` are located. +// Optionally, BENCHMARK_LOAD_REAL_WLS_PROFILE can be set to a file path to write a CPU profile. 
+func BenchmarkLoadRealWLs(b *testing.B) { + dir := os.Getenv("BENCHMARK_LOAD_REAL_WLS_DIR") + if dir == "" { + b.Skipped() + } + + profileFile := os.Getenv("BENCHMARK_LOAD_REAL_WLS_PROFILE") + if profileFile != "" { + b.Logf("Will profile in %s", profileFile) + f, err := os.Create(profileFile) + require.NoError(b, err) + b.Cleanup(func() { f.Close() }) + require.NoError(b, pprof.StartCPUProfile(f)) + b.Cleanup(pprof.StopCPUProfile) + } + + wal, err := wlog.New(nil, nil, filepath.Join(dir, "wal"), wlog.CompressionNone) + require.NoError(b, err) + b.Cleanup(func() { wal.Close() }) + + wbl, err := wlog.New(nil, nil, filepath.Join(dir, "wbl"), wlog.CompressionNone) + require.NoError(b, err) + b.Cleanup(func() { wbl.Close() }) + + // Load the WAL. + for i := 0; i < b.N; i++ { + opts := DefaultHeadOptions() + opts.ChunkDirRoot = dir + h, err := NewHead(nil, nil, wal, wbl, opts, nil) + require.NoError(b, err) + h.Init(0) + } +} + func BenchmarkCreateSeries(b *testing.B) { series := genSeries(b.N, 10, 0, 0) h, _ := newTestHead(b, 10000, wlog.CompressionNone, false) From d8e1b6bdfd3c8cd02a38b21386453dac9b14da1b Mon Sep 17 00:00:00 2001 From: Oleg Zaytsev Date: Tue, 30 Jul 2024 10:20:29 +0200 Subject: [PATCH 023/137] Store mmMaxTime in same field as seriesShard We don't use seriesShard during DB initialization, so we can use the same 8 bytes to store mmMaxTime, and save those during the rest of the lifetime of the database. This doesn't affect CPU performance. Signed-off-by: Oleg Zaytsev --- tsdb/head.go | 46 +++++++++++++++++++++++++++++++++++----------- tsdb/head_read.go | 2 +- tsdb/head_wal.go | 13 ++++++++----- 3 files changed, 44 insertions(+), 17 deletions(-) diff --git a/tsdb/head.go b/tsdb/head.go index b7bfaa0fd..1659e57a4 100644 --- a/tsdb/head.go +++ b/tsdb/head.go @@ -178,6 +178,7 @@ type HeadOptions struct { WALReplayConcurrency int // EnableSharding enables ShardedPostings() support in the Head. + // EnableSharding is temporarily disabled during Init(). EnableSharding bool } @@ -609,7 +610,7 @@ const cardinalityCacheExpirationTime = time.Duration(30) * time.Second // Init loads data from the write ahead log and prepares the head for writes. // It should be called before using an appender so that it // limits the ingested samples to the head min valid time. -func (h *Head) Init(minValidTime int64) error { +func (h *Head) Init(minValidTime int64) (err error) { h.minValidTime.Store(minValidTime) defer func() { h.postings.EnsureOrder(h.opts.WALReplayConcurrency) @@ -623,6 +624,24 @@ func (h *Head) Init(minValidTime int64) error { } }() + // If sharding is enabled, disable it while initializing, and calculate the shards later. + // We're going to use that field for other purposes during WAL replay, + // so we don't want to waste time on calculating the shard that we're going to lose anyway. + if h.opts.EnableSharding { + h.opts.EnableSharding = false + defer func() { + if err == nil { + h.opts.EnableSharding = true + // No locking is needed here as nobody should be writing while we're in Init. 
+ for _, stripe := range h.series.series { + for _, s := range stripe { + s.shardHashOrMemoryMappedMaxTime = labels.StableHash(s.lset) + } + } + } + }() + } + level.Info(h.logger).Log("msg", "Replaying on-disk memory mappable chunks if any") start := time.Now() @@ -683,7 +702,6 @@ func (h *Head) Init(minValidTime int64) error { mmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk oooMmappedChunks map[chunks.HeadSeriesRef][]*mmappedChunk lastMmapRef chunks.ChunkDiskMapperRef - err error mmapChunkReplayDuration time.Duration ) @@ -2068,9 +2086,11 @@ type memSeries struct { ref chunks.HeadSeriesRef meta *metadata.Metadata - // Series labels hash to use for sharding purposes. The value is always 0 when sharding has not - // been explicitly enabled in TSDB. - shardHash uint64 + // Series labels hash to use for sharding purposes. + // The value is always 0 when sharding has not been explicitly enabled in TSDB. + // While the WAL replay the value stored here is the max time of any mmapped chunk, + // and the shard hash is re-calculated after WAL replay is complete. + shardHashOrMemoryMappedMaxTime uint64 // Everything after here should only be accessed with the lock held. sync.Mutex @@ -2095,8 +2115,6 @@ type memSeries struct { ooo *memSeriesOOOFields - mmMaxTime int64 // Max time of any mmapped chunk, only used during WAL replay. - nextAt int64 // Timestamp at which to cut the next chunk. histogramChunkHasComputedEndTime bool // True if nextAt has been predicted for the current histograms chunk; false otherwise. pendingCommit bool // Whether there are samples waiting to be committed to this series. @@ -2127,10 +2145,10 @@ type memSeriesOOOFields struct { func newMemSeries(lset labels.Labels, id chunks.HeadSeriesRef, shardHash uint64, isolationDisabled bool) *memSeries { s := &memSeries{ - lset: lset, - ref: id, - nextAt: math.MinInt64, - shardHash: shardHash, + lset: lset, + ref: id, + nextAt: math.MinInt64, + shardHashOrMemoryMappedMaxTime: shardHash, } if !isolationDisabled { s.txs = newTxRing(0) @@ -2218,6 +2236,12 @@ func (s *memSeries) truncateChunksBefore(mint int64, minOOOMmapRef chunks.ChunkD return removedInOrder + removedOOO } +// shardHash returns the shard hash of the series, only available after WAL replay. +func (s *memSeries) shardHash() uint64 { return s.shardHashOrMemoryMappedMaxTime } + +// mmMaxTime returns the max time of any mmapped chunk in the series, only available during WAL replay. +func (s *memSeries) mmMaxTime() int64 { return int64(s.shardHashOrMemoryMappedMaxTime) } + // cleanupAppendIDsBelow cleans up older appendIDs. Has to be called after // acquiring lock. func (s *memSeries) cleanupAppendIDsBelow(bound uint64) { diff --git a/tsdb/head_read.go b/tsdb/head_read.go index 9ba8785ad..3a50f316b 100644 --- a/tsdb/head_read.go +++ b/tsdb/head_read.go @@ -170,7 +170,7 @@ func (h *headIndexReader) ShardedPostings(p index.Postings, shardIndex, shardCou } // Check if the series belong to the shard. - if s.shardHash%shardCount != shardIndex { + if s.shardHash()%shardCount != shardIndex { continue } diff --git a/tsdb/head_wal.go b/tsdb/head_wal.go index 787cb7c26..2852709a0 100644 --- a/tsdb/head_wal.go +++ b/tsdb/head_wal.go @@ -435,6 +435,8 @@ Outer: return nil } +func minInt64() int64 { return math.MinInt64 } + // resetSeriesWithMMappedChunks is only used during the WAL replay. 
func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc, oooMmc []*mmappedChunk, walSeriesRef chunks.HeadSeriesRef) (overlapped bool) { if mSeries.ref != walSeriesRef { @@ -481,10 +483,11 @@ func (h *Head) resetSeriesWithMMappedChunks(mSeries *memSeries, mmc, oooMmc []*m } // Cache the last mmapped chunk time, so we can skip calling append() for samples it will reject. if len(mmc) == 0 { - mSeries.mmMaxTime = math.MinInt64 + mSeries.shardHashOrMemoryMappedMaxTime = uint64(minInt64()) } else { - mSeries.mmMaxTime = mmc[len(mmc)-1].maxTime - h.updateMinMaxTime(mmc[0].minTime, mSeries.mmMaxTime) + mmMaxTime := mmc[len(mmc)-1].maxTime + mSeries.shardHashOrMemoryMappedMaxTime = uint64(mmMaxTime) + h.updateMinMaxTime(mmc[0].minTime, mmMaxTime) } if len(oooMmc) != 0 { // Mint and maxt can be in any chunk, they are not sorted. @@ -585,7 +588,7 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks, oooMmapp unknownRefs++ continue } - if s.T <= ms.mmMaxTime { + if s.T <= ms.mmMaxTime() { continue } if _, chunkCreated := ms.append(s.T, s.V, 0, appendChunkOpts); chunkCreated { @@ -614,7 +617,7 @@ func (wp *walSubsetProcessor) processWALSamples(h *Head, mmappedChunks, oooMmapp unknownHistogramRefs++ continue } - if s.t <= ms.mmMaxTime { + if s.t <= ms.mmMaxTime() { continue } var chunkCreated bool From 03963b9ba059af04a50f869578519ff159bf2b14 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Tue, 30 Jul 2024 10:11:16 +0100 Subject: [PATCH 024/137] Merge pull request #14515 from prometheus/revert-13777-remoteread2 (#14524) Revert "Chunked remote read: close the querier earlier" Signed-off-by: Bryan Boreham --- storage/remote/read_handler.go | 53 ++++++++++++++-------------------- 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/storage/remote/read_handler.go b/storage/remote/read_handler.go index 2a00ce897..ffc64c9c3 100644 --- a/storage/remote/read_handler.go +++ b/storage/remote/read_handler.go @@ -202,16 +202,34 @@ func (h *readHandler) remoteReadStreamedXORChunks(ctx context.Context, w http.Re return err } - chunks := h.getChunkSeriesSet(ctx, query, filteredMatchers) - if err := chunks.Err(); err != nil { + querier, err := h.queryable.ChunkQuerier(query.StartTimestampMs, query.EndTimestampMs) + if err != nil { return err } + defer func() { + if err := querier.Close(); err != nil { + level.Warn(h.logger).Log("msg", "Error on chunk querier close", "err", err.Error()) + } + }() + + var hints *storage.SelectHints + if query.Hints != nil { + hints = &storage.SelectHints{ + Start: query.Hints.StartMs, + End: query.Hints.EndMs, + Step: query.Hints.StepMs, + Func: query.Hints.Func, + Grouping: query.Hints.Grouping, + Range: query.Hints.RangeMs, + By: query.Hints.By, + } + } ws, err := StreamChunkedReadResponses( NewChunkedWriter(w, f), int64(i), // The streaming API has to provide the series sorted. - chunks, + querier.Select(ctx, true, hints, filteredMatchers...), sortedExternalLabels, h.remoteReadMaxBytesInFrame, h.marshalPool, @@ -236,35 +254,6 @@ func (h *readHandler) remoteReadStreamedXORChunks(ctx context.Context, w http.Re } } -// getChunkSeriesSet executes a query to retrieve a ChunkSeriesSet, -// encapsulating the operation in its own function to ensure timely release of -// the querier resources. 
-func (h *readHandler) getChunkSeriesSet(ctx context.Context, query *prompb.Query, filteredMatchers []*labels.Matcher) storage.ChunkSeriesSet { - querier, err := h.queryable.ChunkQuerier(query.StartTimestampMs, query.EndTimestampMs) - if err != nil { - return storage.ErrChunkSeriesSet(err) - } - defer func() { - if err := querier.Close(); err != nil { - level.Warn(h.logger).Log("msg", "Error on chunk querier close", "err", err.Error()) - } - }() - - var hints *storage.SelectHints - if query.Hints != nil { - hints = &storage.SelectHints{ - Start: query.Hints.StartMs, - End: query.Hints.EndMs, - Step: query.Hints.StepMs, - Func: query.Hints.Func, - Grouping: query.Hints.Grouping, - Range: query.Hints.RangeMs, - By: query.Hints.By, - } - } - return querier.Select(ctx, true, hints, filteredMatchers...) -} - // filterExtLabelsFromMatchers change equality matchers which match external labels // to a matcher that looks for an empty label, // as that label should not be present in the storage. From 2898d5d715738776bf8dd31d424a9b297380a2f3 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Tue, 30 Jul 2024 10:15:23 +0100 Subject: [PATCH 025/137] Add #14515 to CHANGELOG Signed-off-by: Bryan Boreham --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 02ffc5e4b..115055d12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,7 @@ Remote-write v2 is enabled by default, but can be disabled via feature-flag `web * [BUGFIX] Remote-Write: Fix data corruption in remote write if max_sample_age is applied. #14078 * [BUGFIX] Notifier: Fix Alertmanager discovery not updating under heavy load. #14174 * [BUGFIX] Regexes: some Unicode characters were not matched by case-insensitive comparison. #14170,#14299 +* [BUGFIX] Remote-Read: Resolve occasional segmentation fault on query. #14515 ## 2.53.1 / 2024-07-10 From 0300ad58a97098674ca4757c79a74a05e9c33322 Mon Sep 17 00:00:00 2001 From: Oleg Zaytsev Date: Tue, 30 Jul 2024 11:31:31 +0200 Subject: [PATCH 026/137] Revert the option regardless of error Signed-off-by: Oleg Zaytsev --- tsdb/head.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tsdb/head.go b/tsdb/head.go index 1659e57a4..9d81b24ae 100644 --- a/tsdb/head.go +++ b/tsdb/head.go @@ -630,8 +630,8 @@ func (h *Head) Init(minValidTime int64) (err error) { if h.opts.EnableSharding { h.opts.EnableSharding = false defer func() { + h.opts.EnableSharding = true if err == nil { - h.opts.EnableSharding = true // No locking is needed here as nobody should be writing while we're in Init. for _, stripe := range h.series.series { for _, s := range stripe { From 6cef8698c27b99263efcbe5025846187cf4358f7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 Jul 2024 13:30:49 +0200 Subject: [PATCH 027/137] build(deps-dev): bump @lezer/generator from 1.7.0 to 1.7.1 in /web/ui (#14382) Bumps [@lezer/generator](https://github.com/lezer-parser/generator) from 1.7.0 to 1.7.1. - [Changelog](https://github.com/lezer-parser/generator/blob/main/CHANGELOG.md) - [Commits](https://github.com/lezer-parser/generator/compare/1.7.0...1.7.1) --- updated-dependencies: - dependency-name: "@lezer/generator" dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- web/ui/module/lezer-promql/package.json | 2 +- web/ui/package-lock.json | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/web/ui/module/lezer-promql/package.json b/web/ui/module/lezer-promql/package.json index cbd03ae2b..43a5c44fa 100644 --- a/web/ui/module/lezer-promql/package.json +++ b/web/ui/module/lezer-promql/package.json @@ -30,7 +30,7 @@ "test": "NODE_OPTIONS=--experimental-vm-modules jest" }, "devDependencies": { - "@lezer/generator": "^1.7.0", + "@lezer/generator": "^1.7.1", "@lezer/highlight": "^1.2.0", "@lezer/lr": "^1.4.1" }, diff --git a/web/ui/package-lock.json b/web/ui/package-lock.json index 62ac34e43..2028c3402 100644 --- a/web/ui/package-lock.json +++ b/web/ui/package-lock.json @@ -72,7 +72,7 @@ "version": "0.53.1", "license": "Apache-2.0", "devDependencies": { - "@lezer/generator": "^1.7.0", + "@lezer/generator": "^1.7.1", "@lezer/highlight": "^1.2.0", "@lezer/lr": "^1.4.1" }, @@ -3371,9 +3371,9 @@ "integrity": "sha512-yemX0ZD2xS/73llMZIK6KplkjIjf2EvAHcinDi/TfJ9hS25G0388+ClHt6/3but0oOxinTcQHJLDXh6w1crzFQ==" }, "node_modules/@lezer/generator": { - "version": "1.7.0", - "resolved": "https://registry.npmjs.org/@lezer/generator/-/generator-1.7.0.tgz", - "integrity": "sha512-IJ16tx3biLKlCXUzcK4v8S10AVa2BSM2rB12rtAL6f1hL2TS/HQQlGCoWRvanlL2J4mCYEEIv9uG7n4kVMkVDA==", + "version": "1.7.1", + "resolved": "https://registry.npmjs.org/@lezer/generator/-/generator-1.7.1.tgz", + "integrity": "sha512-MgPJN9Si+ccxzXl3OAmCeZuUKw4XiPl4y664FX/hnnyG9CTqUPq65N3/VGPA2jD23D7QgMTtNqflta+cPN+5mQ==", "dev": true, "dependencies": { "@lezer/common": "^1.1.0", From 84b819a69f375dc66ea41302a56e44975c0317e3 Mon Sep 17 00:00:00 2001 From: Max Amin Date: Tue, 30 Jul 2024 11:25:19 -0400 Subject: [PATCH 028/137] feat: add Google cloud roundtripper for remote write (#14346) * feat: Google Auth for remote write Signed-off-by: Max Amin --------- Signed-off-by: Max Amin --- config/config.go | 36 +++++++++++++----- config/config_test.go | 2 +- docs/configuration/configuration.md | 16 ++++++-- promql/engine_test.go | 3 +- rules/manager_test.go | 3 +- storage/remote/client.go | 9 +++++ storage/remote/googleiam/googleiam.go | 54 +++++++++++++++++++++++++++ storage/remote/write.go | 1 + tsdb/db_test.go | 5 ++- 9 files changed, 110 insertions(+), 19 deletions(-) create mode 100644 storage/remote/googleiam/googleiam.go diff --git a/config/config.go b/config/config.go index 913983881..8a6216146 100644 --- a/config/config.go +++ b/config/config.go @@ -37,6 +37,7 @@ import ( "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/model/relabel" "github.com/prometheus/prometheus/storage/remote/azuread" + "github.com/prometheus/prometheus/storage/remote/googleiam" ) var ( @@ -1123,6 +1124,7 @@ type RemoteWriteConfig struct { MetadataConfig MetadataConfig `yaml:"metadata_config,omitempty"` SigV4Config *sigv4.SigV4Config `yaml:"sigv4,omitempty"` AzureADConfig *azuread.AzureADConfig `yaml:"azuread,omitempty"` + GoogleIAMConfig *googleiam.Config `yaml:"google_iam,omitempty"` } // SetDirectory joins any relative file paths with dir. 
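For context, the `GoogleIAMConfig` field added in the hunk above surfaces as a `google_iam` block under `remote_write`. A minimal configuration sketch follows — the endpoint URL and the key file path are placeholders, while the `credentials_file` key mirrors the documentation added later in this patch, which also states that an empty `google_iam: {}` falls back to the Google Cloud SDK's default credentials:

```yaml
remote_write:
  - url: "https://example.com/api/v1/write"   # placeholder endpoint
    google_iam:
      # Service account key with monitoring write permissions (assumed path).
      credentials_file: "/etc/prometheus/gcp-service-account.json"
```

As with `sigv4` and `azuread`, this block cannot be combined with `basic_auth`, `authorization`, or `oauth2`; the validation hunk below enforces that at most one authentication mechanism is configured.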
@@ -1160,17 +1162,33 @@ func (c *RemoteWriteConfig) UnmarshalYAML(unmarshal func(interface{}) error) err return err } - httpClientConfigAuthEnabled := c.HTTPClientConfig.BasicAuth != nil || - c.HTTPClientConfig.Authorization != nil || c.HTTPClientConfig.OAuth2 != nil + return validateAuthConfigs(c) +} - if httpClientConfigAuthEnabled && (c.SigV4Config != nil || c.AzureADConfig != nil) { - return fmt.Errorf("at most one of basic_auth, authorization, oauth2, sigv4, & azuread must be configured") +// validateAuthConfigs validates that at most one of basic_auth, authorization, oauth2, sigv4, azuread or google_iam must be configured. +func validateAuthConfigs(c *RemoteWriteConfig) error { + var authConfigured []string + if c.HTTPClientConfig.BasicAuth != nil { + authConfigured = append(authConfigured, "basic_auth") } - - if c.SigV4Config != nil && c.AzureADConfig != nil { - return fmt.Errorf("at most one of basic_auth, authorization, oauth2, sigv4, & azuread must be configured") + if c.HTTPClientConfig.Authorization != nil { + authConfigured = append(authConfigured, "authorization") + } + if c.HTTPClientConfig.OAuth2 != nil { + authConfigured = append(authConfigured, "oauth2") + } + if c.SigV4Config != nil { + authConfigured = append(authConfigured, "sigv4") + } + if c.AzureADConfig != nil { + authConfigured = append(authConfigured, "azuread") + } + if c.GoogleIAMConfig != nil { + authConfigured = append(authConfigured, "google_iam") + } + if len(authConfigured) > 1 { + return fmt.Errorf("at most one of basic_auth, authorization, oauth2, sigv4, azuread or google_iam must be configured. Currently configured: %v", authConfigured) } - return nil } @@ -1189,7 +1207,7 @@ func validateHeadersForTracing(headers map[string]string) error { func validateHeaders(headers map[string]string) error { for header := range headers { if strings.ToLower(header) == "authorization" { - return errors.New("authorization header must be changed via the basic_auth, authorization, oauth2, sigv4, or azuread parameter") + return errors.New("authorization header must be changed via the basic_auth, authorization, oauth2, sigv4, azuread or google_iam parameter") } if _, ok := reservedHeaders[strings.ToLower(header)]; ok { return fmt.Errorf("%s is a reserved header. It must not be changed", header) diff --git a/config/config_test.go b/config/config_test.go index b684fdb50..9b074bef1 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -1826,7 +1826,7 @@ var expectedErrors = []struct { }, { filename: "remote_write_authorization_header.bad.yml", - errMsg: `authorization header must be changed via the basic_auth, authorization, oauth2, sigv4, or azuread parameter`, + errMsg: `authorization header must be changed via the basic_auth, authorization, oauth2, sigv4, azuread or google_iam parameter`, }, { filename: "remote_write_wrong_msg.bad.yml", diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md index 5aa57b3ba..313a7f2f3 100644 --- a/docs/configuration/configuration.md +++ b/docs/configuration/configuration.md @@ -3401,8 +3401,8 @@ authorization: # It is mutually exclusive with `credentials`. [ credentials_file: ] -# Optionally configures AWS's Signature Verification 4 signing process to -# sign requests. Cannot be set at the same time as basic_auth, authorization, or oauth2. +# Optionally configures AWS's Signature Verification 4 signing process to sign requests. +# Cannot be set at the same time as basic_auth, authorization, oauth2, azuread or google_iam. 
# To use the default credentials from the AWS SDK, use `sigv4: {}`. sigv4: # The AWS region. If blank, the region from the default credentials chain @@ -3655,12 +3655,12 @@ sigv4: [ role_arn: ] # Optional OAuth 2.0 configuration. -# Cannot be used at the same time as basic_auth, authorization, sigv4, or azuread. +# Cannot be used at the same time as basic_auth, authorization, sigv4, azuread or google_iam. oauth2: [ ] # Optional AzureAD configuration. -# Cannot be used at the same time as basic_auth, authorization, oauth2, or sigv4. +# Cannot be used at the same time as basic_auth, authorization, oauth2, sigv4 or google_iam. azuread: # The Azure Cloud. Options are 'AzurePublic', 'AzureChina', or 'AzureGovernment'. [ cloud: | default = AzurePublic ] @@ -3680,6 +3680,14 @@ azuread: [ sdk: [ tenant_id: ] ] +# WARNING: Remote write is NOT SUPPORTED by Google Cloud. This configuration is reserved for future use. +# Optional Google Cloud Monitoring configuration. +# Cannot be used at the same time as basic_auth, authorization, oauth2, sigv4 or azuread. +# To use the default credentials from the Google Cloud SDK, use `google_iam: {}`. +google_iam: + # Service account key with monitoring write permessions. + credentials_file: + # Configures the remote write request's TLS settings. tls_config: [ ] diff --git a/promql/engine_test.go b/promql/engine_test.go index 523c0613d..8e618d435 100644 --- a/promql/engine_test.go +++ b/promql/engine_test.go @@ -26,7 +26,6 @@ import ( "time" "github.com/stretchr/testify/require" - "go.uber.org/goleak" "github.com/prometheus/prometheus/model/histogram" "github.com/prometheus/prometheus/model/labels" @@ -51,7 +50,7 @@ const ( func TestMain(m *testing.M) { // Enable experimental functions testing parser.EnableExperimentalFunctions = true - goleak.VerifyTestMain(m) + testutil.TolerantVerifyLeak(m) } func TestQueryConcurrency(t *testing.T) { diff --git a/rules/manager_test.go b/rules/manager_test.go index 51239e6c9..9865cbdfe 100644 --- a/rules/manager_test.go +++ b/rules/manager_test.go @@ -32,7 +32,6 @@ import ( "github.com/prometheus/common/model" "github.com/stretchr/testify/require" "go.uber.org/atomic" - "go.uber.org/goleak" "gopkg.in/yaml.v2" "github.com/prometheus/prometheus/model/labels" @@ -50,7 +49,7 @@ import ( ) func TestMain(m *testing.M) { - goleak.VerifyTestMain(m) + prom_testutil.TolerantVerifyLeak(m) } func TestAlertingRule(t *testing.T) { diff --git a/storage/remote/client.go b/storage/remote/client.go index 17caf7be9..11e423b6a 100644 --- a/storage/remote/client.go +++ b/storage/remote/client.go @@ -37,6 +37,7 @@ import ( "github.com/prometheus/prometheus/config" "github.com/prometheus/prometheus/prompb" "github.com/prometheus/prometheus/storage/remote/azuread" + "github.com/prometheus/prometheus/storage/remote/googleiam" ) const maxErrMsgLen = 1024 @@ -131,6 +132,7 @@ type ClientConfig struct { HTTPClientConfig config_util.HTTPClientConfig SigV4Config *sigv4.SigV4Config AzureADConfig *azuread.AzureADConfig + GoogleIAMConfig *googleiam.Config Headers map[string]string RetryOnRateLimit bool WriteProtoMsg config.RemoteWriteProtoMsg @@ -192,6 +194,13 @@ func NewWriteClient(name string, conf *ClientConfig) (WriteClient, error) { } } + if conf.GoogleIAMConfig != nil { + t, err = googleiam.NewRoundTripper(conf.GoogleIAMConfig, t) + if err != nil { + return nil, err + } + } + writeProtoMsg := config.RemoteWriteProtoMsgV1 if conf.WriteProtoMsg != "" { writeProtoMsg = conf.WriteProtoMsg diff --git a/storage/remote/googleiam/googleiam.go 
b/storage/remote/googleiam/googleiam.go new file mode 100644 index 000000000..acf3bd5a6 --- /dev/null +++ b/storage/remote/googleiam/googleiam.go @@ -0,0 +1,54 @@ +// Copyright 2024 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package googleiam provides an http.RoundTripper that attaches an Google Cloud accessToken +// to remote write requests. +package googleiam + +import ( + "context" + "fmt" + "net/http" + + "golang.org/x/oauth2/google" + "google.golang.org/api/option" + apihttp "google.golang.org/api/transport/http" +) + +type Config struct { + CredentialsFile string `yaml:"credentials_file,omitempty"` +} + +// NewRoundTripper creates a round tripper that adds Google Cloud Monitoring authorization to calls +// using either a credentials file or the default credentials. +func NewRoundTripper(cfg *Config, next http.RoundTripper) (http.RoundTripper, error) { + if next == nil { + next = http.DefaultTransport + } + const scopes = "https://www.googleapis.com/auth/monitoring.write" + ctx := context.Background() + opts := []option.ClientOption{ + option.WithScopes(scopes), + } + if cfg.CredentialsFile != "" { + opts = append(opts, option.WithCredentialsFile(cfg.CredentialsFile)) + } else { + creds, err := google.FindDefaultCredentials(ctx, scopes) + if err != nil { + return nil, fmt.Errorf("error finding default Google credentials: %w", err) + } + opts = append(opts, option.WithCredentials(creds)) + } + + return apihttp.NewTransport(ctx, next, opts...) 
+} diff --git a/storage/remote/write.go b/storage/remote/write.go index 81902a8f1..3d2f1fdfc 100644 --- a/storage/remote/write.go +++ b/storage/remote/write.go @@ -176,6 +176,7 @@ func (rws *WriteStorage) ApplyConfig(conf *config.Config) error { HTTPClientConfig: rwConf.HTTPClientConfig, SigV4Config: rwConf.SigV4Config, AzureADConfig: rwConf.AzureADConfig, + GoogleIAMConfig: rwConf.GoogleIAMConfig, Headers: rwConf.Headers, RetryOnRateLimit: rwConf.QueueConfig.RetryOnRateLimit, }) diff --git a/tsdb/db_test.go b/tsdb/db_test.go index c0edafe08..c8dad8699 100644 --- a/tsdb/db_test.go +++ b/tsdb/db_test.go @@ -63,7 +63,10 @@ func TestMain(m *testing.M) { flag.Parse() defaultIsolationDisabled = !isolationEnabled - goleak.VerifyTestMain(m, goleak.IgnoreTopFunction("github.com/prometheus/prometheus/tsdb.(*SegmentWAL).cut.func1"), goleak.IgnoreTopFunction("github.com/prometheus/prometheus/tsdb.(*SegmentWAL).cut.func2")) + goleak.VerifyTestMain(m, + goleak.IgnoreTopFunction("github.com/prometheus/prometheus/tsdb.(*SegmentWAL).cut.func1"), + goleak.IgnoreTopFunction("github.com/prometheus/prometheus/tsdb.(*SegmentWAL).cut.func2"), + goleak.IgnoreTopFunction("go.opencensus.io/stats/view.(*worker).start")) } func openTestDB(t testing.TB, opts *Options, rngs []int64) (db *DB) { From 15618157321f988e069cdaa955422b24632f5743 Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Tue, 30 Jul 2024 14:08:28 -0700 Subject: [PATCH 029/137] remote write: increase time threshold for resharding (#14450) Don't reshard if we haven't successfully sent a sample in the last shardUpdateDuration seconds. Signed-off-by: Callum Styan Co-authored-by: kushagra Shukla --- storage/remote/queue_manager.go | 6 +++--- storage/remote/queue_manager_test.go | 13 ++++++++----- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go index 5b59288e6..17ff1850f 100644 --- a/storage/remote/queue_manager.go +++ b/storage/remote/queue_manager.go @@ -1109,9 +1109,9 @@ func (t *QueueManager) shouldReshard(desiredShards int) bool { if desiredShards == t.numShards { return false } - // We shouldn't reshard if Prometheus hasn't been able to send to the - // remote endpoint successfully within some period of time. - minSendTimestamp := time.Now().Add(-2 * time.Duration(t.cfg.BatchSendDeadline)).Unix() + // We shouldn't reshard if Prometheus hasn't been able to send + // since the last time it checked if it should reshard. + minSendTimestamp := time.Now().Add(-1 * shardUpdateDuration).Unix() lsts := t.lastSendTimestamp.Load() if lsts < minSendTimestamp { level.Warn(t.logger).Log("msg", "Skipping resharding, last successful send was beyond threshold", "lastSendTimestamp", lsts, "minSendTimestamp", minSendTimestamp) diff --git a/storage/remote/queue_manager_test.go b/storage/remote/queue_manager_test.go index 7343184fc..1c06173a5 100644 --- a/storage/remote/queue_manager_test.go +++ b/storage/remote/queue_manager_test.go @@ -703,32 +703,35 @@ func TestShouldReshard(t *testing.T) { startingShards int samplesIn, samplesOut, lastSendTimestamp int64 expectedToReshard bool + sendDeadline model.Duration } cases := []testcase{ { - // Resharding shouldn't take place if the last successful send was > batch send deadline*2 seconds ago. 
+ // resharding shouldn't take place if we haven't successfully sent + // since the last shardUpdateDuration, even if the send deadline is very low startingShards: 10, samplesIn: 1000, samplesOut: 10, - lastSendTimestamp: time.Now().Unix() - int64(3*time.Duration(config.DefaultQueueConfig.BatchSendDeadline)/time.Second), + lastSendTimestamp: time.Now().Unix() - int64(shardUpdateDuration), expectedToReshard: false, + sendDeadline: model.Duration(100 * time.Millisecond), }, { - startingShards: 5, + startingShards: 10, samplesIn: 1000, samplesOut: 10, lastSendTimestamp: time.Now().Unix(), expectedToReshard: true, + sendDeadline: config.DefaultQueueConfig.BatchSendDeadline, }, } for _, c := range cases { - _, m := newTestClientAndQueueManager(t, defaultFlushDeadline, config.RemoteWriteProtoMsgV1) + _, m := newTestClientAndQueueManager(t, time.Duration(c.sendDeadline), config.RemoteWriteProtoMsgV1) m.numShards = c.startingShards m.dataIn.incr(c.samplesIn) m.dataOut.incr(c.samplesOut) m.lastSendTimestamp.Store(c.lastSendTimestamp) - m.Start() desiredShards := m.calculateDesiredShards() From 2880ee8e46e2c49e5155523b30b7878d7cc65ae8 Mon Sep 17 00:00:00 2001 From: Matthieu MOREL Date: Thu, 25 Jan 2024 07:29:48 +0100 Subject: [PATCH 030/137] chore: provide OSSF security insight Signed-off-by: Matthieu MOREL --- README.md | 3 ++- SECURITY-INSIGHTS.yml | 48 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 SECURITY-INSIGHTS.yml diff --git a/README.md b/README.md index cd14ed2ec..df974e109 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,10 @@ examples and guides.

[![Docker Pulls](https://img.shields.io/docker/pulls/prom/prometheus.svg?maxAge=604800)][hub] [![Go Report Card](https://goreportcard.com/badge/github.com/prometheus/prometheus)](https://goreportcard.com/report/github.com/prometheus/prometheus) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/486/badge)](https://bestpractices.coreinfrastructure.org/projects/486) +[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/prometheus/prometheus/badge)](https://securityscorecards.dev/viewer/?uri=github.com/prometheus/prometheus) +[![CLOMonitor](https://img.shields.io/endpoint?url=https://clomonitor.io/api/projects/cncf/prometheus/badge)](https://clomonitor.io/projects/cncf/prometheus) [![Gitpod ready-to-code](https://img.shields.io/badge/Gitpod-ready--to--code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/prometheus/prometheus) [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/prometheus.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:prometheus) -[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/prometheus/prometheus/badge)](https://securityscorecards.dev/viewer/?uri=github.com/prometheus/prometheus) diff --git a/SECURITY-INSIGHTS.yml b/SECURITY-INSIGHTS.yml new file mode 100644 index 000000000..009b35621 --- /dev/null +++ b/SECURITY-INSIGHTS.yml @@ -0,0 +1,48 @@ +header: + schema-version: '1.0.0' + expiration-date: '2025-07-30T01:00:00.000Z' + last-updated: '2024-07-30' + last-reviewed: '2024-07-30' + project-url: https://github.com/prometheus/prometheus + changelog: https://github.com/prometheus/prometheus/blob/main/CHANGELOG.md + license: https://github.com/prometheus/prometheus/blob/main/LICENSE +project-lifecycle: + status: active + bug-fixes-only: false + core-maintainers: + - https://github.com/prometheus/prometheus/blob/main/MAINTAINERS.md +contribution-policy: + accepts-pull-requests: true + accepts-automated-pull-requests: true +dependencies: + third-party-packages: true + dependencies-lists: + - https://github.com/prometheus/prometheus/blob/main/go.mod + - https://github.com/prometheus/prometheus/blob/main/web/ui/package.json + env-dependencies-policy: + policy-url: https://github.com/prometheus/prometheus/blob/main/CONTRIBUTING.md#dependency-management +distribution-points: + - https://github.com/prometheus/prometheus/releases +documentation: + - https://prometheus.io/docs/introduction/overview/ +security-contacts: + - type: email + value: prometheus-team@googlegroups.com +security-testing: + - tool-type: sca + tool-name: Dependabot + tool-version: latest + integration: + ad-hoc: false + ci: true + before-release: true + - tool-type: sast + tool-name: CodeQL + tool-version: latest + integration: + ad-hoc: false + ci: true + before-release: true +vulnerability-reporting: + accepts-vulnerability-reports: true + security-policy: https://github.com/prometheus/prometheus/security/policy From 7fab72a280f139170a14e6f6a21f6396fa02899e Mon Sep 17 00:00:00 2001 From: Charles Korn Date: Wed, 31 Jul 2024 17:53:05 +1000 Subject: [PATCH 031/137] promqltest: add support for setting counter reset hint on histogram samples (#14537) * promqltest: add support for setting counter reset hint on histogram samples Signed-off-by: Charles Korn --- docs/configuration/unit_testing_rules.md | 4 +- promql/parser/generated_parser.y | 17 +- promql/parser/generated_parser.y.go | 797 ++++++++++++----------- promql/parser/lex.go | 33 +- promql/parser/parse.go | 22 + 
promql/parser/parse_test.go | 75 ++- 6 files changed, 531 insertions(+), 417 deletions(-) diff --git a/docs/configuration/unit_testing_rules.md b/docs/configuration/unit_testing_rules.md index 163fcb91f..7fc676a25 100644 --- a/docs/configuration/unit_testing_rules.md +++ b/docs/configuration/unit_testing_rules.md @@ -92,7 +92,7 @@ series: # # Native histogram notation: # Native histograms can be used instead of floating point numbers using the following notation: -# {{schema:1 sum:-0.3 count:3.1 z_bucket:7.1 z_bucket_w:0.05 buckets:[5.1 10 7] offset:-3 n_buckets:[4.1 5] n_offset:-5}} +# {{schema:1 sum:-0.3 count:3.1 z_bucket:7.1 z_bucket_w:0.05 buckets:[5.1 10 7] offset:-3 n_buckets:[4.1 5] n_offset:-5 counter_reset_hint:gauge}} # Native histograms support the same expanding notation as floating point numbers, i.e. 'axn', 'a+bxn' and 'a-bxn'. # All properties are optional and default to 0. The order is not important. The following properties are supported: # - schema (int): @@ -119,6 +119,8 @@ series: # Observation counts in negative buckets. Each represents an absolute count. # - n_offset (int): # The starting index of the first entry in the negative buckets. +# - counter_reset_hint (one of 'unknown', 'reset', 'not_reset' or 'gauge') +# The counter reset hint associated with this histogram. Defaults to 'unknown' if not set. values: ``` diff --git a/promql/parser/generated_parser.y b/promql/parser/generated_parser.y index b99e67424..b8e6aa373 100644 --- a/promql/parser/generated_parser.y +++ b/promql/parser/generated_parser.y @@ -84,6 +84,7 @@ NEGATIVE_BUCKETS_DESC ZERO_BUCKET_DESC ZERO_BUCKET_WIDTH_DESC CUSTOM_VALUES_DESC +COUNTER_RESET_HINT_DESC %token histogramDescEnd // Operators. @@ -149,6 +150,14 @@ START END %token preprocessorEnd +// Counter reset hints. +%token counterResetHintsStart +%token +UNKNOWN_COUNTER_RESET +COUNTER_RESET +NOT_COUNTER_RESET +GAUGE_TYPE +%token counterResetHintsEnd // Start symbols for the generated parser. %token startSymbolsStart @@ -163,7 +172,7 @@ START_METRIC_SELECTOR // Type definitions for grammar rules. %type label_match_list %type label_matcher -%type aggregate_op grouping_label match_op maybe_label metric_identifier unary_op at_modifier_preprocessors string_identifier +%type aggregate_op grouping_label match_op maybe_label metric_identifier unary_op at_modifier_preprocessors string_identifier counter_reset_hint %type label_set metric %type label_set_list %type