prometheus/storage/remote/queue_manager_test.go
Nathan Baulch 50cd453c8f
Some checks failed
buf.build / lint and publish (push) Waiting to run
CI / Go tests (push) Waiting to run
CI / More Go tests (push) Waiting to run
CI / Go tests with previous Go version (push) Waiting to run
CI / UI tests (push) Waiting to run
CI / Go tests on Windows (push) Waiting to run
CI / Mixins tests (push) Waiting to run
CI / Build Prometheus for common architectures (0) (push) Waiting to run
CI / Build Prometheus for common architectures (1) (push) Waiting to run
CI / Build Prometheus for common architectures (2) (push) Waiting to run
CI / Build Prometheus for all architectures (0) (push) Waiting to run
CI / Build Prometheus for all architectures (1) (push) Waiting to run
CI / Build Prometheus for all architectures (10) (push) Waiting to run
CI / Build Prometheus for all architectures (11) (push) Waiting to run
CI / Build Prometheus for all architectures (2) (push) Waiting to run
CI / Build Prometheus for all architectures (3) (push) Waiting to run
CI / Build Prometheus for all architectures (4) (push) Waiting to run
CI / Build Prometheus for all architectures (5) (push) Waiting to run
CI / Build Prometheus for all architectures (6) (push) Waiting to run
CI / Build Prometheus for all architectures (7) (push) Waiting to run
CI / Build Prometheus for all architectures (8) (push) Waiting to run
CI / Build Prometheus for all architectures (9) (push) Waiting to run
CI / Report status of build Prometheus for all architectures (push) Blocked by required conditions
CI / Check generated parser (push) Waiting to run
CI / golangci-lint (push) Waiting to run
CI / fuzzing (push) Waiting to run
CI / codeql (push) Waiting to run
CI / Publish main branch artifacts (push) Blocked by required conditions
CI / Publish release artefacts (push) Blocked by required conditions
CI / Publish UI on npm Registry (push) Blocked by required conditions
Scorecards supply-chain security / Scorecards analysis (push) Waiting to run
Push README to Docker Hub / Push README to Docker Hub (push) Has been cancelled
Push README to Docker Hub / Push README to quay.io (push) Has been cancelled
chore: Fix typos (#14868)
* Fix typos

---------

Signed-off-by: Nathan Baulch <nathan.baulch@gmail.com>
2024-09-10 22:32:03 +02:00

2301 lines
71 KiB
Go

// Copyright 2013 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package remote
import (
"context"
"errors"
"fmt"
"math"
"math/rand"
"os"
"runtime/pprof"
"sort"
"strconv"
"strings"
"sync"
"testing"
"time"
"github.com/go-kit/log"
"github.com/gogo/protobuf/proto"
"github.com/golang/snappy"
"github.com/google/go-cmp/cmp"
"github.com/prometheus/client_golang/prometheus"
client_testutil "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
"go.uber.org/atomic"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/model/histogram"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/model/relabel"
"github.com/prometheus/prometheus/model/timestamp"
"github.com/prometheus/prometheus/prompb"
writev2 "github.com/prometheus/prometheus/prompb/io/prometheus/write/v2"
"github.com/prometheus/prometheus/scrape"
"github.com/prometheus/prometheus/tsdb/chunks"
"github.com/prometheus/prometheus/tsdb/record"
"github.com/prometheus/prometheus/util/runutil"
"github.com/prometheus/prometheus/util/testutil"
)
const defaultFlushDeadline = 1 * time.Minute
func newHighestTimestampMetric() *maxTimestamp {
return &maxTimestamp{
Gauge: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "highest_timestamp_in_seconds",
Help: "Highest timestamp that has come into the remote storage via the Appender interface, in seconds since epoch. Initialized to 0 when no data has been received yet",
}),
}
}
func TestBasicContentNegotiation(t *testing.T) {
queueConfig := config.DefaultQueueConfig
queueConfig.BatchSendDeadline = model.Duration(100 * time.Millisecond)
queueConfig.MaxShards = 1
// We need to set URL's so that metric creation doesn't panic.
writeConfig := baseRemoteWriteConfig("http://test-storage.com")
writeConfig.QueueConfig = queueConfig
conf := &config.Config{
GlobalConfig: config.DefaultGlobalConfig,
RemoteWriteConfigs: []*config.RemoteWriteConfig{
writeConfig,
},
}
for _, tc := range []struct {
name string
senderProtoMsg config.RemoteWriteProtoMsg
receiverProtoMsg config.RemoteWriteProtoMsg
injectErrs []error
expectFail bool
}{
{
name: "v2 happy path",
senderProtoMsg: config.RemoteWriteProtoMsgV2, receiverProtoMsg: config.RemoteWriteProtoMsgV2,
injectErrs: []error{nil},
},
{
name: "v1 happy path",
senderProtoMsg: config.RemoteWriteProtoMsgV1, receiverProtoMsg: config.RemoteWriteProtoMsgV1,
injectErrs: []error{nil},
},
// Test a case where the v1 request has a temporary delay but goes through on retry.
{
name: "v1 happy path with one 5xx retry",
senderProtoMsg: config.RemoteWriteProtoMsgV1, receiverProtoMsg: config.RemoteWriteProtoMsgV1,
injectErrs: []error{RecoverableError{errors.New("pretend 500"), 1}, nil},
},
// Repeat the above test but with v2. The request has a temporary delay but goes through on retry.
{
name: "v2 happy path with one 5xx retry",
senderProtoMsg: config.RemoteWriteProtoMsgV2, receiverProtoMsg: config.RemoteWriteProtoMsgV2,
injectErrs: []error{RecoverableError{errors.New("pretend 500"), 1}, nil},
},
// A few error cases of v2 talking to v1.
{
name: "v2 talks to v1 that gives 400 or 415",
senderProtoMsg: config.RemoteWriteProtoMsgV2, receiverProtoMsg: config.RemoteWriteProtoMsgV1,
injectErrs: []error{errors.New("pretend unrecoverable err")},
expectFail: true,
},
{
name: "v2 talks to (broken) v1 that tries to unmarshal v2 payload with v1 proto",
senderProtoMsg: config.RemoteWriteProtoMsgV2, receiverProtoMsg: config.RemoteWriteProtoMsgV1,
injectErrs: []error{nil},
expectFail: true, // We detect this thanks to https://github.com/prometheus/prometheus/issues/14359
},
// Opposite, v1 talking to v2 only server.
{
name: "v1 talks to v2 that gives 400 or 415",
senderProtoMsg: config.RemoteWriteProtoMsgV1, receiverProtoMsg: config.RemoteWriteProtoMsgV2,
injectErrs: []error{errors.New("pretend unrecoverable err")},
expectFail: true,
},
} {
t.Run(tc.name, func(t *testing.T) {
dir := t.TempDir()
s := NewStorage(nil, nil, nil, dir, defaultFlushDeadline, nil, true)
defer s.Close()
var (
series []record.RefSeries
metadata []record.RefMetadata
samples []record.RefSample
)
// Generates same series in both cases.
samples, series = createTimeseries(1, 1)
metadata = createSeriesMetadata(series)
// Apply new config.
queueConfig.Capacity = len(samples)
queueConfig.MaxSamplesPerSend = len(samples)
// For now we only ever have a single rw config in this test.
conf.RemoteWriteConfigs[0].ProtobufMessage = tc.senderProtoMsg
require.NoError(t, s.ApplyConfig(conf))
hash, err := toHash(writeConfig)
require.NoError(t, err)
qm := s.rws.queues[hash]
c := NewTestWriteClient(tc.receiverProtoMsg)
c.injectErrors(tc.injectErrs)
qm.SetClient(c)
qm.StoreSeries(series, 0)
qm.StoreMetadata(metadata)
// Do we expect some data back?
if !tc.expectFail {
c.expectSamples(samples, series)
} else {
c.expectSamples(nil, nil)
}
// Schedule send.
qm.Append(samples)
if !tc.expectFail {
// No error expected, so wait for data.
c.waitForExpectedData(t, 5*time.Second)
require.Equal(t, 0.0, client_testutil.ToFloat64(qm.metrics.failedSamplesTotal))
} else {
// Wait for failure to be recorded in metrics.
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
require.NoError(t, runutil.Retry(500*time.Millisecond, ctx.Done(), func() error {
if client_testutil.ToFloat64(qm.metrics.failedSamplesTotal) != 1.0 {
return fmt.Errorf("expected one sample failed in qm metrics; got %v", client_testutil.ToFloat64(qm.metrics.failedSamplesTotal))
}
return nil
}))
}
// samplesTotal means attempts.
require.Equal(t, float64(len(tc.injectErrs)), client_testutil.ToFloat64(qm.metrics.samplesTotal))
require.Equal(t, float64(len(tc.injectErrs)-1), client_testutil.ToFloat64(qm.metrics.retriedSamplesTotal))
})
}
}
func TestSampleDelivery(t *testing.T) {
// Let's create an even number of send batches, so we don't run into the
// batch timeout case.
n := 3
queueConfig := config.DefaultQueueConfig
queueConfig.BatchSendDeadline = model.Duration(100 * time.Millisecond)
queueConfig.MaxShards = 1
// We need to set URL's so that metric creation doesn't panic.
writeConfig := baseRemoteWriteConfig("http://test-storage.com")
writeConfig.QueueConfig = queueConfig
writeConfig.SendExemplars = true
writeConfig.SendNativeHistograms = true
conf := &config.Config{
GlobalConfig: config.DefaultGlobalConfig,
RemoteWriteConfigs: []*config.RemoteWriteConfig{
writeConfig,
},
}
for _, tc := range []struct {
protoMsg config.RemoteWriteProtoMsg
name string
samples bool
exemplars bool
histograms bool
floatHistograms bool
}{
{protoMsg: config.RemoteWriteProtoMsgV1, samples: true, exemplars: false, histograms: false, floatHistograms: false, name: "samples only"},
{protoMsg: config.RemoteWriteProtoMsgV1, samples: true, exemplars: true, histograms: true, floatHistograms: true, name: "samples, exemplars, and histograms"},
{protoMsg: config.RemoteWriteProtoMsgV1, samples: false, exemplars: true, histograms: false, floatHistograms: false, name: "exemplars only"},
{protoMsg: config.RemoteWriteProtoMsgV1, samples: false, exemplars: false, histograms: true, floatHistograms: false, name: "histograms only"},
{protoMsg: config.RemoteWriteProtoMsgV1, samples: false, exemplars: false, histograms: false, floatHistograms: true, name: "float histograms only"},
// TODO(alexg): update some portion of this test to check for the 2.0 metadata
{protoMsg: config.RemoteWriteProtoMsgV2, samples: true, exemplars: false, histograms: false, floatHistograms: false, name: "samples only"},
{protoMsg: config.RemoteWriteProtoMsgV2, samples: true, exemplars: true, histograms: true, floatHistograms: true, name: "samples, exemplars, and histograms"},
{protoMsg: config.RemoteWriteProtoMsgV2, samples: false, exemplars: true, histograms: false, floatHistograms: false, name: "exemplars only"},
{protoMsg: config.RemoteWriteProtoMsgV2, samples: false, exemplars: false, histograms: true, floatHistograms: false, name: "histograms only"},
{protoMsg: config.RemoteWriteProtoMsgV2, samples: false, exemplars: false, histograms: false, floatHistograms: true, name: "float histograms only"},
} {
t.Run(fmt.Sprintf("%s-%s", tc.protoMsg, tc.name), func(t *testing.T) {
dir := t.TempDir()
s := NewStorage(nil, nil, nil, dir, defaultFlushDeadline, nil, true)
defer s.Close()
var (
series []record.RefSeries
metadata []record.RefMetadata
samples []record.RefSample
exemplars []record.RefExemplar
histograms []record.RefHistogramSample
floatHistograms []record.RefFloatHistogramSample
)
// Generates same series in both cases.
if tc.samples {
samples, series = createTimeseries(n, n)
}
if tc.exemplars {
exemplars, series = createExemplars(n, n)
}
if tc.histograms {
histograms, _, series = createHistograms(n, n, false)
}
if tc.floatHistograms {
_, floatHistograms, series = createHistograms(n, n, true)
}
metadata = createSeriesMetadata(series)
// Apply new config.
queueConfig.Capacity = len(samples)
queueConfig.MaxSamplesPerSend = len(samples) / 2
// For now we only ever have a single rw config in this test.
conf.RemoteWriteConfigs[0].ProtobufMessage = tc.protoMsg
require.NoError(t, s.ApplyConfig(conf))
hash, err := toHash(writeConfig)
require.NoError(t, err)
qm := s.rws.queues[hash]
c := NewTestWriteClient(tc.protoMsg)
qm.SetClient(c)
qm.StoreSeries(series, 0)
qm.StoreMetadata(metadata)
// Send first half of data.
c.expectSamples(samples[:len(samples)/2], series)
c.expectExemplars(exemplars[:len(exemplars)/2], series)
c.expectHistograms(histograms[:len(histograms)/2], series)
c.expectFloatHistograms(floatHistograms[:len(floatHistograms)/2], series)
qm.Append(samples[:len(samples)/2])
qm.AppendExemplars(exemplars[:len(exemplars)/2])
qm.AppendHistograms(histograms[:len(histograms)/2])
qm.AppendFloatHistograms(floatHistograms[:len(floatHistograms)/2])
c.waitForExpectedData(t, 30*time.Second)
// Send second half of data.
c.expectSamples(samples[len(samples)/2:], series)
c.expectExemplars(exemplars[len(exemplars)/2:], series)
c.expectHistograms(histograms[len(histograms)/2:], series)
c.expectFloatHistograms(floatHistograms[len(floatHistograms)/2:], series)
qm.Append(samples[len(samples)/2:])
qm.AppendExemplars(exemplars[len(exemplars)/2:])
qm.AppendHistograms(histograms[len(histograms)/2:])
qm.AppendFloatHistograms(floatHistograms[len(floatHistograms)/2:])
c.waitForExpectedData(t, 30*time.Second)
})
}
}
func newTestClientAndQueueManager(t testing.TB, flushDeadline time.Duration, protoMsg config.RemoteWriteProtoMsg) (*TestWriteClient, *QueueManager) {
c := NewTestWriteClient(protoMsg)
cfg := config.DefaultQueueConfig
mcfg := config.DefaultMetadataConfig
return c, newTestQueueManager(t, cfg, mcfg, flushDeadline, c, protoMsg)
}
func newTestQueueManager(t testing.TB, cfg config.QueueConfig, mcfg config.MetadataConfig, deadline time.Duration, c WriteClient, protoMsg config.RemoteWriteProtoMsg) *QueueManager {
dir := t.TempDir()
metrics := newQueueManagerMetrics(nil, "", "")
m := NewQueueManager(metrics, nil, nil, nil, dir, newEWMARate(ewmaWeight, shardUpdateDuration), cfg, mcfg, labels.EmptyLabels(), nil, c, deadline, newPool(), newHighestTimestampMetric(), nil, false, false, protoMsg)
return m
}
func testDefaultQueueConfig() config.QueueConfig {
cfg := config.DefaultQueueConfig
// For faster unit tests we don't wait default 5 seconds.
cfg.BatchSendDeadline = model.Duration(100 * time.Millisecond)
return cfg
}
func TestMetadataDelivery(t *testing.T) {
c, m := newTestClientAndQueueManager(t, defaultFlushDeadline, config.RemoteWriteProtoMsgV1)
m.Start()
defer m.Stop()
metadata := []scrape.MetricMetadata{}
numMetadata := 1532
for i := 0; i < numMetadata; i++ {
metadata = append(metadata, scrape.MetricMetadata{
Metric: "prometheus_remote_storage_sent_metadata_bytes_total_" + strconv.Itoa(i),
Type: model.MetricTypeCounter,
Help: "a nice help text",
Unit: "",
})
}
m.AppendWatcherMetadata(context.Background(), metadata)
require.Equal(t, 0.0, client_testutil.ToFloat64(m.metrics.failedMetadataTotal))
require.Len(t, c.receivedMetadata, numMetadata)
// One more write than the rounded quotient should be performed in order to get samples that didn't
// fit into MaxSamplesPerSend.
require.Equal(t, numMetadata/config.DefaultMetadataConfig.MaxSamplesPerSend+1, c.writesReceived)
// Make sure the last samples were sent.
require.Equal(t, c.receivedMetadata[metadata[len(metadata)-1].Metric][0].MetricFamilyName, metadata[len(metadata)-1].Metric)
}
func TestWALMetadataDelivery(t *testing.T) {
dir := t.TempDir()
s := NewStorage(nil, nil, nil, dir, defaultFlushDeadline, nil, true)
defer s.Close()
cfg := config.DefaultQueueConfig
cfg.BatchSendDeadline = model.Duration(100 * time.Millisecond)
cfg.MaxShards = 1
writeConfig := baseRemoteWriteConfig("http://test-storage.com")
writeConfig.QueueConfig = cfg
writeConfig.ProtobufMessage = config.RemoteWriteProtoMsgV2
conf := &config.Config{
GlobalConfig: config.DefaultGlobalConfig,
RemoteWriteConfigs: []*config.RemoteWriteConfig{
writeConfig,
},
}
num := 3
_, series := createTimeseries(0, num)
metadata := createSeriesMetadata(series)
require.NoError(t, s.ApplyConfig(conf))
hash, err := toHash(writeConfig)
require.NoError(t, err)
qm := s.rws.queues[hash]
c := NewTestWriteClient(config.RemoteWriteProtoMsgV1)
qm.SetClient(c)
qm.StoreSeries(series, 0)
qm.StoreMetadata(metadata)
require.Len(t, qm.seriesLabels, num)
require.Len(t, qm.seriesMetadata, num)
c.waitForExpectedData(t, 30*time.Second)
}
func TestSampleDeliveryTimeout(t *testing.T) {
for _, protoMsg := range []config.RemoteWriteProtoMsg{config.RemoteWriteProtoMsgV1, config.RemoteWriteProtoMsgV2} {
t.Run(fmt.Sprint(protoMsg), func(t *testing.T) {
// Let's send one less sample than batch size, and wait the timeout duration
n := 9
samples, series := createTimeseries(n, n)
cfg := testDefaultQueueConfig()
mcfg := config.DefaultMetadataConfig
cfg.MaxShards = 1
c := NewTestWriteClient(protoMsg)
m := newTestQueueManager(t, cfg, mcfg, defaultFlushDeadline, c, protoMsg)
m.StoreSeries(series, 0)
m.Start()
defer m.Stop()
// Send the samples twice, waiting for the samples in the meantime.
c.expectSamples(samples, series)
m.Append(samples)
c.waitForExpectedData(t, 30*time.Second)
c.expectSamples(samples, series)
m.Append(samples)
c.waitForExpectedData(t, 30*time.Second)
})
}
}
func TestSampleDeliveryOrder(t *testing.T) {
for _, protoMsg := range []config.RemoteWriteProtoMsg{config.RemoteWriteProtoMsgV1, config.RemoteWriteProtoMsgV2} {
t.Run(fmt.Sprint(protoMsg), func(t *testing.T) {
ts := 10
n := config.DefaultQueueConfig.MaxSamplesPerSend * ts
samples := make([]record.RefSample, 0, n)
series := make([]record.RefSeries, 0, n)
for i := 0; i < n; i++ {
name := fmt.Sprintf("test_metric_%d", i%ts)
samples = append(samples, record.RefSample{
Ref: chunks.HeadSeriesRef(i),
T: int64(i),
V: float64(i),
})
series = append(series, record.RefSeries{
Ref: chunks.HeadSeriesRef(i),
Labels: labels.FromStrings("__name__", name),
})
}
c, m := newTestClientAndQueueManager(t, defaultFlushDeadline, protoMsg)
c.expectSamples(samples, series)
m.StoreSeries(series, 0)
m.Start()
defer m.Stop()
// These should be received by the client.
m.Append(samples)
c.waitForExpectedData(t, 30*time.Second)
})
}
}
func TestShutdown(t *testing.T) {
deadline := 1 * time.Second
c := NewTestBlockedWriteClient()
cfg := config.DefaultQueueConfig
mcfg := config.DefaultMetadataConfig
m := newTestQueueManager(t, cfg, mcfg, deadline, c, config.RemoteWriteProtoMsgV1)
n := 2 * config.DefaultQueueConfig.MaxSamplesPerSend
samples, series := createTimeseries(n, n)
m.StoreSeries(series, 0)
m.Start()
// Append blocks to guarantee delivery, so we do it in the background.
go func() {
m.Append(samples)
}()
time.Sleep(100 * time.Millisecond)
// Test to ensure that Stop doesn't block.
start := time.Now()
m.Stop()
// The samples will never be delivered, so duration should
// be at least equal to deadline, otherwise the flush deadline
// was not respected.
duration := time.Since(start)
if duration > deadline+(deadline/10) {
t.Errorf("Took too long to shutdown: %s > %s", duration, deadline)
}
if duration < deadline {
t.Errorf("Shutdown occurred before flush deadline: %s < %s", duration, deadline)
}
}
func TestSeriesReset(t *testing.T) {
c := NewTestBlockedWriteClient()
deadline := 5 * time.Second
numSegments := 4
numSeries := 25
cfg := config.DefaultQueueConfig
mcfg := config.DefaultMetadataConfig
m := newTestQueueManager(t, cfg, mcfg, deadline, c, config.RemoteWriteProtoMsgV1)
for i := 0; i < numSegments; i++ {
series := []record.RefSeries{}
for j := 0; j < numSeries; j++ {
series = append(series, record.RefSeries{Ref: chunks.HeadSeriesRef((i * 100) + j), Labels: labels.FromStrings("a", "a")})
}
m.StoreSeries(series, i)
}
require.Len(t, m.seriesLabels, numSegments*numSeries)
m.SeriesReset(2)
require.Len(t, m.seriesLabels, numSegments*numSeries/2)
}
func TestReshard(t *testing.T) {
for _, protoMsg := range []config.RemoteWriteProtoMsg{config.RemoteWriteProtoMsgV1, config.RemoteWriteProtoMsgV2} {
t.Run(fmt.Sprint(protoMsg), func(t *testing.T) {
size := 10 // Make bigger to find more races.
nSeries := 6
nSamples := config.DefaultQueueConfig.Capacity * size
samples, series := createTimeseries(nSamples, nSeries)
cfg := config.DefaultQueueConfig
cfg.MaxShards = 1
c := NewTestWriteClient(protoMsg)
m := newTestQueueManager(t, cfg, config.DefaultMetadataConfig, defaultFlushDeadline, c, protoMsg)
c.expectSamples(samples, series)
m.StoreSeries(series, 0)
m.Start()
defer m.Stop()
go func() {
for i := 0; i < len(samples); i += config.DefaultQueueConfig.Capacity {
sent := m.Append(samples[i : i+config.DefaultQueueConfig.Capacity])
require.True(t, sent, "samples not sent")
time.Sleep(100 * time.Millisecond)
}
}()
for i := 1; i < len(samples)/config.DefaultQueueConfig.Capacity; i++ {
m.shards.stop()
m.shards.start(i)
time.Sleep(100 * time.Millisecond)
}
c.waitForExpectedData(t, 30*time.Second)
})
}
}
func TestReshardRaceWithStop(t *testing.T) {
for _, protoMsg := range []config.RemoteWriteProtoMsg{config.RemoteWriteProtoMsgV1, config.RemoteWriteProtoMsgV2} {
t.Run(fmt.Sprint(protoMsg), func(t *testing.T) {
c := NewTestWriteClient(protoMsg)
var m *QueueManager
h := sync.Mutex{}
h.Lock()
cfg := testDefaultQueueConfig()
mcfg := config.DefaultMetadataConfig
exitCh := make(chan struct{})
go func() {
for {
m = newTestQueueManager(t, cfg, mcfg, defaultFlushDeadline, c, protoMsg)
m.Start()
h.Unlock()
h.Lock()
m.Stop()
select {
case exitCh <- struct{}{}:
return
default:
}
}
}()
for i := 1; i < 100; i++ {
h.Lock()
m.reshardChan <- i
h.Unlock()
}
<-exitCh
})
}
}
func TestReshardPartialBatch(t *testing.T) {
for _, protoMsg := range []config.RemoteWriteProtoMsg{config.RemoteWriteProtoMsgV1, config.RemoteWriteProtoMsgV2} {
t.Run(fmt.Sprint(protoMsg), func(t *testing.T) {
samples, series := createTimeseries(1, 10)
c := NewTestBlockedWriteClient()
cfg := testDefaultQueueConfig()
mcfg := config.DefaultMetadataConfig
cfg.MaxShards = 1
batchSendDeadline := time.Millisecond
flushDeadline := 10 * time.Millisecond
cfg.BatchSendDeadline = model.Duration(batchSendDeadline)
m := newTestQueueManager(t, cfg, mcfg, flushDeadline, c, protoMsg)
m.StoreSeries(series, 0)
m.Start()
for i := 0; i < 100; i++ {
done := make(chan struct{})
go func() {
m.Append(samples)
time.Sleep(batchSendDeadline)
m.shards.stop()
m.shards.start(1)
done <- struct{}{}
}()
select {
case <-done:
case <-time.After(2 * time.Second):
t.Error("Deadlock between sending and stopping detected")
pprof.Lookup("goroutine").WriteTo(os.Stdout, 1)
t.FailNow()
}
}
// We can only call stop if there was not a deadlock.
m.Stop()
})
}
}
// TestQueueFilledDeadlock makes sure the code does not deadlock in the case
// where a large scrape (> capacity + max samples per send) is appended at the
// same time as a batch times out according to the batch send deadline.
func TestQueueFilledDeadlock(t *testing.T) {
for _, protoMsg := range []config.RemoteWriteProtoMsg{config.RemoteWriteProtoMsgV1, config.RemoteWriteProtoMsgV2} {
t.Run(fmt.Sprint(protoMsg), func(t *testing.T) {
samples, series := createTimeseries(50, 1)
c := NewNopWriteClient()
cfg := testDefaultQueueConfig()
mcfg := config.DefaultMetadataConfig
cfg.MaxShards = 1
cfg.MaxSamplesPerSend = 10
cfg.Capacity = 20
flushDeadline := time.Second
batchSendDeadline := time.Millisecond
cfg.BatchSendDeadline = model.Duration(batchSendDeadline)
m := newTestQueueManager(t, cfg, mcfg, flushDeadline, c, protoMsg)
m.StoreSeries(series, 0)
m.Start()
defer m.Stop()
for i := 0; i < 100; i++ {
done := make(chan struct{})
go func() {
time.Sleep(batchSendDeadline)
m.Append(samples)
done <- struct{}{}
}()
select {
case <-done:
case <-time.After(2 * time.Second):
t.Error("Deadlock between sending and appending detected")
pprof.Lookup("goroutine").WriteTo(os.Stdout, 1)
t.FailNow()
}
}
})
}
}
func TestReleaseNoninternedString(t *testing.T) {
for _, protoMsg := range []config.RemoteWriteProtoMsg{config.RemoteWriteProtoMsgV1, config.RemoteWriteProtoMsgV2} {
t.Run(fmt.Sprint(protoMsg), func(t *testing.T) {
_, m := newTestClientAndQueueManager(t, defaultFlushDeadline, protoMsg)
m.Start()
defer m.Stop()
for i := 1; i < 1000; i++ {
m.StoreSeries([]record.RefSeries{
{
Ref: chunks.HeadSeriesRef(i),
Labels: labels.FromStrings("asdf", strconv.Itoa(i)),
},
}, 0)
m.SeriesReset(1)
}
metric := client_testutil.ToFloat64(noReferenceReleases)
require.Equal(t, 0.0, metric, "expected there to be no calls to release for strings that were not already interned: %d", int(metric))
})
}
}
func TestShouldReshard(t *testing.T) {
type testcase struct {
startingShards int
samplesIn, samplesOut, lastSendTimestamp int64
expectedToReshard bool
sendDeadline model.Duration
}
cases := []testcase{
{
// resharding shouldn't take place if we haven't successfully sent
// since the last shardUpdateDuration, even if the send deadline is very low
startingShards: 10,
samplesIn: 1000,
samplesOut: 10,
lastSendTimestamp: time.Now().Unix() - int64(shardUpdateDuration),
expectedToReshard: false,
sendDeadline: model.Duration(100 * time.Millisecond),
},
{
startingShards: 10,
samplesIn: 1000,
samplesOut: 10,
lastSendTimestamp: time.Now().Unix(),
expectedToReshard: true,
sendDeadline: config.DefaultQueueConfig.BatchSendDeadline,
},
}
for _, c := range cases {
_, m := newTestClientAndQueueManager(t, time.Duration(c.sendDeadline), config.RemoteWriteProtoMsgV1)
m.numShards = c.startingShards
m.dataIn.incr(c.samplesIn)
m.dataOut.incr(c.samplesOut)
m.lastSendTimestamp.Store(c.lastSendTimestamp)
m.Start()
desiredShards := m.calculateDesiredShards()
shouldReshard := m.shouldReshard(desiredShards)
m.Stop()
require.Equal(t, c.expectedToReshard, shouldReshard)
}
}
// TestDisableReshardOnRetry asserts that resharding should be disabled when a
// recoverable error is returned from remote_write.
func TestDisableReshardOnRetry(t *testing.T) {
onStoredContext, onStoreCalled := context.WithCancel(context.Background())
defer onStoreCalled()
var (
fakeSamples, fakeSeries = createTimeseries(100, 100)
cfg = config.DefaultQueueConfig
mcfg = config.DefaultMetadataConfig
retryAfter = time.Second
metrics = newQueueManagerMetrics(nil, "", "")
client = &MockWriteClient{
StoreFunc: func(ctx context.Context, b []byte, i int) (WriteResponseStats, error) {
onStoreCalled()
return WriteResponseStats{}, RecoverableError{
error: fmt.Errorf("fake error"),
retryAfter: model.Duration(retryAfter),
}
},
NameFunc: func() string { return "mock" },
EndpointFunc: func() string { return "http://fake:9090/api/v1/write" },
}
)
m := NewQueueManager(metrics, nil, nil, nil, "", newEWMARate(ewmaWeight, shardUpdateDuration), cfg, mcfg, labels.EmptyLabels(), nil, client, 0, newPool(), newHighestTimestampMetric(), nil, false, false, config.RemoteWriteProtoMsgV1)
m.StoreSeries(fakeSeries, 0)
// Attempt to samples while the manager is running. We immediately stop the
// manager after the recoverable error is generated to prevent the manager
// from resharding itself.
m.Start()
{
m.Append(fakeSamples)
select {
case <-onStoredContext.Done():
case <-time.After(time.Minute):
require.FailNow(t, "timed out waiting for client to be sent metrics")
}
}
m.Stop()
require.Eventually(t, func() bool {
// Force m.lastSendTimestamp to be current so the last send timestamp isn't
// the reason resharding is disabled.
m.lastSendTimestamp.Store(time.Now().Unix())
return m.shouldReshard(m.numShards+1) == false
}, time.Minute, 10*time.Millisecond, "shouldReshard was never disabled")
// After 2x retryAfter, resharding should be enabled again.
require.Eventually(t, func() bool {
// Force m.lastSendTimestamp to be current so the last send timestamp isn't
// the reason resharding is disabled.
m.lastSendTimestamp.Store(time.Now().Unix())
return m.shouldReshard(m.numShards+1) == true
}, time.Minute, retryAfter, "shouldReshard should have been re-enabled")
}
func createTimeseries(numSamples, numSeries int, extraLabels ...labels.Label) ([]record.RefSample, []record.RefSeries) {
samples := make([]record.RefSample, 0, numSamples)
series := make([]record.RefSeries, 0, numSeries)
lb := labels.NewScratchBuilder(1 + len(extraLabels))
for i := 0; i < numSeries; i++ {
name := fmt.Sprintf("test_metric_%d", i)
for j := 0; j < numSamples; j++ {
samples = append(samples, record.RefSample{
Ref: chunks.HeadSeriesRef(i),
T: int64(j),
V: float64(i),
})
}
// Create Labels that is name of series plus any extra labels supplied.
lb.Reset()
lb.Add(labels.MetricName, name)
rand.Shuffle(len(extraLabels), func(i, j int) {
extraLabels[i], extraLabels[j] = extraLabels[j], extraLabels[i]
})
for _, l := range extraLabels {
lb.Add(l.Name, l.Value)
}
lb.Sort()
series = append(series, record.RefSeries{
Ref: chunks.HeadSeriesRef(i),
Labels: lb.Labels(),
})
}
return samples, series
}
func createProtoTimeseriesWithOld(numSamples, baseTs int64, extraLabels ...labels.Label) []prompb.TimeSeries {
samples := make([]prompb.TimeSeries, numSamples)
// use a fixed rand source so tests are consistent
r := rand.New(rand.NewSource(99))
for j := int64(0); j < numSamples; j++ {
name := fmt.Sprintf("test_metric_%d", j)
samples[j] = prompb.TimeSeries{
Labels: []prompb.Label{{Name: "__name__", Value: name}},
Samples: []prompb.Sample{
{
Timestamp: baseTs + j,
Value: float64(j),
},
},
}
// 10% of the time use a ts that is too old
if r.Intn(10) == 0 {
samples[j].Samples[0].Timestamp = baseTs - 5
}
}
return samples
}
func createExemplars(numExemplars, numSeries int) ([]record.RefExemplar, []record.RefSeries) {
exemplars := make([]record.RefExemplar, 0, numExemplars)
series := make([]record.RefSeries, 0, numSeries)
for i := 0; i < numSeries; i++ {
name := fmt.Sprintf("test_metric_%d", i)
for j := 0; j < numExemplars; j++ {
e := record.RefExemplar{
Ref: chunks.HeadSeriesRef(i),
T: int64(j),
V: float64(i),
Labels: labels.FromStrings("trace_id", fmt.Sprintf("trace-%d", i)),
}
exemplars = append(exemplars, e)
}
series = append(series, record.RefSeries{
Ref: chunks.HeadSeriesRef(i),
Labels: labels.FromStrings("__name__", name),
})
}
return exemplars, series
}
func createHistograms(numSamples, numSeries int, floatHistogram bool) ([]record.RefHistogramSample, []record.RefFloatHistogramSample, []record.RefSeries) {
histograms := make([]record.RefHistogramSample, 0, numSamples)
floatHistograms := make([]record.RefFloatHistogramSample, 0, numSamples)
series := make([]record.RefSeries, 0, numSeries)
for i := 0; i < numSeries; i++ {
name := fmt.Sprintf("test_metric_%d", i)
for j := 0; j < numSamples; j++ {
hist := &histogram.Histogram{
Schema: 2,
ZeroThreshold: 1e-128,
ZeroCount: 0,
Count: 2,
Sum: 0,
PositiveSpans: []histogram.Span{{Offset: 0, Length: 1}},
PositiveBuckets: []int64{int64(i) + 1},
NegativeSpans: []histogram.Span{{Offset: 0, Length: 1}},
NegativeBuckets: []int64{int64(-i) - 1},
}
if floatHistogram {
fh := record.RefFloatHistogramSample{
Ref: chunks.HeadSeriesRef(i),
T: int64(j),
FH: hist.ToFloat(nil),
}
floatHistograms = append(floatHistograms, fh)
} else {
h := record.RefHistogramSample{
Ref: chunks.HeadSeriesRef(i),
T: int64(j),
H: hist,
}
histograms = append(histograms, h)
}
}
series = append(series, record.RefSeries{
Ref: chunks.HeadSeriesRef(i),
Labels: labels.FromStrings("__name__", name),
})
}
if floatHistogram {
return nil, floatHistograms, series
}
return histograms, nil, series
}
func createSeriesMetadata(series []record.RefSeries) []record.RefMetadata {
metas := make([]record.RefMetadata, 0, len(series))
for _, s := range series {
metas = append(metas, record.RefMetadata{
Ref: s.Ref,
Type: uint8(record.Counter),
Unit: "unit text",
Help: "help text",
})
}
return metas
}
func getSeriesIDFromRef(r record.RefSeries) string {
return r.Labels.String()
}
// TestWriteClient represents write client which does not call remote storage,
// but instead re-implements fake WriteHandler for test purposes.
type TestWriteClient struct {
receivedSamples map[string][]prompb.Sample
expectedSamples map[string][]prompb.Sample
receivedExemplars map[string][]prompb.Exemplar
expectedExemplars map[string][]prompb.Exemplar
receivedHistograms map[string][]prompb.Histogram
receivedFloatHistograms map[string][]prompb.Histogram
expectedHistograms map[string][]prompb.Histogram
expectedFloatHistograms map[string][]prompb.Histogram
receivedMetadata map[string][]prompb.MetricMetadata
writesReceived int
mtx sync.Mutex
buf []byte
protoMsg config.RemoteWriteProtoMsg
injectedErrs []error
currErr int
retry bool
storeWait time.Duration
// TODO(npazosmendez): maybe replaceable with injectedErrs?
returnError error
}
// NewTestWriteClient creates a new testing write client.
func NewTestWriteClient(protoMsg config.RemoteWriteProtoMsg) *TestWriteClient {
return &TestWriteClient{
receivedSamples: map[string][]prompb.Sample{},
expectedSamples: map[string][]prompb.Sample{},
receivedMetadata: map[string][]prompb.MetricMetadata{},
protoMsg: protoMsg,
storeWait: 0,
returnError: nil,
}
}
func (c *TestWriteClient) injectErrors(injectedErrs []error) {
c.injectedErrs = injectedErrs
c.currErr = -1
c.retry = false
}
func (c *TestWriteClient) expectSamples(ss []record.RefSample, series []record.RefSeries) {
c.mtx.Lock()
defer c.mtx.Unlock()
c.expectedSamples = map[string][]prompb.Sample{}
c.receivedSamples = map[string][]prompb.Sample{}
for _, s := range ss {
tsID := getSeriesIDFromRef(series[s.Ref])
c.expectedSamples[tsID] = append(c.expectedSamples[tsID], prompb.Sample{
Timestamp: s.T,
Value: s.V,
})
}
}
func (c *TestWriteClient) expectExemplars(ss []record.RefExemplar, series []record.RefSeries) {
c.mtx.Lock()
defer c.mtx.Unlock()
c.expectedExemplars = map[string][]prompb.Exemplar{}
c.receivedExemplars = map[string][]prompb.Exemplar{}
for _, s := range ss {
tsID := getSeriesIDFromRef(series[s.Ref])
e := prompb.Exemplar{
Labels: prompb.FromLabels(s.Labels, nil),
Timestamp: s.T,
Value: s.V,
}
c.expectedExemplars[tsID] = append(c.expectedExemplars[tsID], e)
}
}
func (c *TestWriteClient) expectHistograms(hh []record.RefHistogramSample, series []record.RefSeries) {
c.mtx.Lock()
defer c.mtx.Unlock()
c.expectedHistograms = map[string][]prompb.Histogram{}
c.receivedHistograms = map[string][]prompb.Histogram{}
for _, h := range hh {
tsID := getSeriesIDFromRef(series[h.Ref])
c.expectedHistograms[tsID] = append(c.expectedHistograms[tsID], prompb.FromIntHistogram(h.T, h.H))
}
}
func (c *TestWriteClient) expectFloatHistograms(fhs []record.RefFloatHistogramSample, series []record.RefSeries) {
c.mtx.Lock()
defer c.mtx.Unlock()
c.expectedFloatHistograms = map[string][]prompb.Histogram{}
c.receivedFloatHistograms = map[string][]prompb.Histogram{}
for _, fh := range fhs {
tsID := getSeriesIDFromRef(series[fh.Ref])
c.expectedFloatHistograms[tsID] = append(c.expectedFloatHistograms[tsID], prompb.FromFloatHistogram(fh.T, fh.FH))
}
}
func deepLen[M any](ms ...map[string][]M) int {
l := 0
for _, m := range ms {
for _, v := range m {
l += len(v)
}
}
return l
}
func (c *TestWriteClient) waitForExpectedData(tb testing.TB, timeout time.Duration) {
tb.Helper()
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
if err := runutil.Retry(500*time.Millisecond, ctx.Done(), func() error {
c.mtx.Lock()
exp := deepLen(c.expectedSamples) + deepLen(c.expectedExemplars) + deepLen(c.expectedHistograms, c.expectedFloatHistograms)
got := deepLen(c.receivedSamples) + deepLen(c.receivedExemplars) + deepLen(c.receivedHistograms, c.receivedFloatHistograms)
c.mtx.Unlock()
if got < exp {
return fmt.Errorf("expected %v samples/exemplars/histograms/floathistograms, got %v", exp, got)
}
return nil
}); err != nil {
tb.Error(err)
}
c.mtx.Lock()
defer c.mtx.Unlock()
for ts, expectedSamples := range c.expectedSamples {
require.Equal(tb, expectedSamples, c.receivedSamples[ts], ts)
}
for ts, expectedExemplar := range c.expectedExemplars {
require.Equal(tb, expectedExemplar, c.receivedExemplars[ts], ts)
}
for ts, expectedHistogram := range c.expectedHistograms {
require.Equal(tb, expectedHistogram, c.receivedHistograms[ts], ts)
}
for ts, expectedFloatHistogram := range c.expectedFloatHistograms {
require.Equal(tb, expectedFloatHistogram, c.receivedFloatHistograms[ts], ts)
}
}
func (c *TestWriteClient) SetStoreWait(w time.Duration) {
c.mtx.Lock()
defer c.mtx.Unlock()
c.storeWait = w
}
func (c *TestWriteClient) SetReturnError(err error) {
c.mtx.Lock()
defer c.mtx.Unlock()
c.returnError = err
}
func (c *TestWriteClient) Store(_ context.Context, req []byte, _ int) (WriteResponseStats, error) {
c.mtx.Lock()
defer c.mtx.Unlock()
if c.storeWait > 0 {
time.Sleep(c.storeWait)
}
if c.returnError != nil {
return WriteResponseStats{}, c.returnError
}
// nil buffers are ok for snappy, ignore cast error.
if c.buf != nil {
c.buf = c.buf[:cap(c.buf)]
}
reqBuf, err := snappy.Decode(c.buf, req)
c.buf = reqBuf
if err != nil {
return WriteResponseStats{}, err
}
// Check if we've been told to inject err for this call.
if len(c.injectedErrs) > 0 {
c.currErr++
if err = c.injectedErrs[c.currErr]; err != nil {
return WriteResponseStats{}, err
}
}
var reqProto *prompb.WriteRequest
switch c.protoMsg {
case config.RemoteWriteProtoMsgV1:
reqProto = &prompb.WriteRequest{}
err = proto.Unmarshal(reqBuf, reqProto)
case config.RemoteWriteProtoMsgV2:
// NOTE(bwplotka): v1 msg can be unmarshaled to v2 sometimes, without
// errors.
var reqProtoV2 writev2.Request
err = proto.Unmarshal(reqBuf, &reqProtoV2)
if err == nil {
reqProto, err = v2RequestToWriteRequest(&reqProtoV2)
}
}
if err != nil {
return WriteResponseStats{}, err
}
rs := WriteResponseStats{}
b := labels.NewScratchBuilder(0)
for _, ts := range reqProto.Timeseries {
labels := ts.ToLabels(&b, nil)
tsID := labels.String()
if len(ts.Samples) > 0 {
c.receivedSamples[tsID] = append(c.receivedSamples[tsID], ts.Samples...)
}
rs.Samples += len(ts.Samples)
if len(ts.Exemplars) > 0 {
c.receivedExemplars[tsID] = append(c.receivedExemplars[tsID], ts.Exemplars...)
}
rs.Exemplars += len(ts.Exemplars)
for _, h := range ts.Histograms {
if h.IsFloatHistogram() {
c.receivedFloatHistograms[tsID] = append(c.receivedFloatHistograms[tsID], h)
} else {
c.receivedHistograms[tsID] = append(c.receivedHistograms[tsID], h)
}
}
rs.Histograms += len(ts.Histograms)
}
for _, m := range reqProto.Metadata {
c.receivedMetadata[m.MetricFamilyName] = append(c.receivedMetadata[m.MetricFamilyName], m)
}
c.writesReceived++
return rs, nil
}
func (c *TestWriteClient) Name() string {
return "testwriteclient"
}
func (c *TestWriteClient) Endpoint() string {
return "http://test-remote.com/1234"
}
func v2RequestToWriteRequest(v2Req *writev2.Request) (*prompb.WriteRequest, error) {
req := &prompb.WriteRequest{
Timeseries: make([]prompb.TimeSeries, len(v2Req.Timeseries)),
// TODO handle metadata?
}
b := labels.NewScratchBuilder(0)
for i, rts := range v2Req.Timeseries {
rts.ToLabels(&b, v2Req.Symbols).Range(func(l labels.Label) {
req.Timeseries[i].Labels = append(req.Timeseries[i].Labels, prompb.Label{
Name: l.Name,
Value: l.Value,
})
})
exemplars := make([]prompb.Exemplar, len(rts.Exemplars))
for j, e := range rts.Exemplars {
exemplars[j].Value = e.Value
exemplars[j].Timestamp = e.Timestamp
e.ToExemplar(&b, v2Req.Symbols).Labels.Range(func(l labels.Label) {
exemplars[j].Labels = append(exemplars[j].Labels, prompb.Label{
Name: l.Name,
Value: l.Value,
})
})
}
req.Timeseries[i].Exemplars = exemplars
req.Timeseries[i].Samples = make([]prompb.Sample, len(rts.Samples))
for j, s := range rts.Samples {
req.Timeseries[i].Samples[j].Timestamp = s.Timestamp
req.Timeseries[i].Samples[j].Value = s.Value
}
req.Timeseries[i].Histograms = make([]prompb.Histogram, len(rts.Histograms))
for j, h := range rts.Histograms {
if h.IsFloatHistogram() {
req.Timeseries[i].Histograms[j] = prompb.FromFloatHistogram(h.Timestamp, h.ToFloatHistogram())
continue
}
req.Timeseries[i].Histograms[j] = prompb.FromIntHistogram(h.Timestamp, h.ToIntHistogram())
}
}
return req, nil
}
// TestBlockingWriteClient is a queue_manager WriteClient which will block
// on any calls to Store(), until the request's Context is cancelled, at which
// point the `numCalls` property will contain a count of how many times Store()
// was called.
type TestBlockingWriteClient struct {
numCalls atomic.Uint64
}
func NewTestBlockedWriteClient() *TestBlockingWriteClient {
return &TestBlockingWriteClient{}
}
func (c *TestBlockingWriteClient) Store(ctx context.Context, _ []byte, _ int) (WriteResponseStats, error) {
c.numCalls.Inc()
<-ctx.Done()
return WriteResponseStats{}, nil
}
func (c *TestBlockingWriteClient) NumCalls() uint64 {
return c.numCalls.Load()
}
func (c *TestBlockingWriteClient) Name() string {
return "testblockingwriteclient"
}
func (c *TestBlockingWriteClient) Endpoint() string {
return "http://test-remote-blocking.com/1234"
}
// For benchmarking the send and not the receive side.
type NopWriteClient struct{}
func NewNopWriteClient() *NopWriteClient { return &NopWriteClient{} }
func (c *NopWriteClient) Store(context.Context, []byte, int) (WriteResponseStats, error) {
return WriteResponseStats{}, nil
}
func (c *NopWriteClient) Name() string { return "nopwriteclient" }
func (c *NopWriteClient) Endpoint() string { return "http://test-remote.com/1234" }
type MockWriteClient struct {
StoreFunc func(context.Context, []byte, int) (WriteResponseStats, error)
NameFunc func() string
EndpointFunc func() string
}
func (c *MockWriteClient) Store(ctx context.Context, bb []byte, n int) (WriteResponseStats, error) {
return c.StoreFunc(ctx, bb, n)
}
func (c *MockWriteClient) Name() string { return c.NameFunc() }
func (c *MockWriteClient) Endpoint() string { return c.EndpointFunc() }
// Extra labels to make a more realistic workload - taken from Kubernetes' embedded cAdvisor metrics.
var extraLabels []labels.Label = []labels.Label{
{Name: "kubernetes_io_arch", Value: "amd64"},
{Name: "kubernetes_io_instance_type", Value: "c3.somesize"},
{Name: "kubernetes_io_os", Value: "linux"},
{Name: "container_name", Value: "some-name"},
{Name: "failure_domain_kubernetes_io_region", Value: "somewhere-1"},
{Name: "failure_domain_kubernetes_io_zone", Value: "somewhere-1b"},
{Name: "id", Value: "/kubepods/burstable/pod6e91c467-e4c5-11e7-ace3-0a97ed59c75e/a3c8498918bd6866349fed5a6f8c643b77c91836427fb6327913276ebc6bde28"},
{Name: "image", Value: "registry/organisation/name@sha256:dca3d877a80008b45d71d7edc4fd2e44c0c8c8e7102ba5cbabec63a374d1d506"},
{Name: "instance", Value: "ip-111-11-1-11.ec2.internal"},
{Name: "job", Value: "kubernetes-cadvisor"},
{Name: "kubernetes_io_hostname", Value: "ip-111-11-1-11"},
{Name: "monitor", Value: "prod"},
{Name: "name", Value: "k8s_some-name_some-other-name-5j8s8_kube-system_6e91c467-e4c5-11e7-ace3-0a97ed59c75e_0"},
{Name: "namespace", Value: "kube-system"},
{Name: "pod_name", Value: "some-other-name-5j8s8"},
}
func BenchmarkSampleSend(b *testing.B) {
// Send one sample per series, which is the typical remote_write case
const numSamples = 1
const numSeries = 10000
samples, series := createTimeseries(numSamples, numSeries, extraLabels...)
c := NewNopWriteClient()
cfg := testDefaultQueueConfig()
mcfg := config.DefaultMetadataConfig
cfg.BatchSendDeadline = model.Duration(100 * time.Millisecond)
cfg.MinShards = 20
cfg.MaxShards = 20
// todo: test with new proto type(s)
m := newTestQueueManager(b, cfg, mcfg, defaultFlushDeadline, c, config.RemoteWriteProtoMsgV1)
m.StoreSeries(series, 0)
// These should be received by the client.
m.Start()
defer m.Stop()
b.ResetTimer()
for i := 0; i < b.N; i++ {
m.Append(samples)
m.UpdateSeriesSegment(series, i+1) // simulate what wlog.Watcher.garbageCollectSeries does
m.SeriesReset(i + 1)
}
// Do not include shutdown
b.StopTimer()
}
// Check how long it takes to add N series, including external labels processing.
func BenchmarkStoreSeries(b *testing.B) {
externalLabels := []labels.Label{
{Name: "cluster", Value: "mycluster"},
{Name: "replica", Value: "1"},
}
relabelConfigs := []*relabel.Config{{
SourceLabels: model.LabelNames{"namespace"},
Separator: ";",
Regex: relabel.MustNewRegexp("kube.*"),
TargetLabel: "job",
Replacement: "$1",
Action: relabel.Replace,
}}
testCases := []struct {
name string
externalLabels []labels.Label
ts []prompb.TimeSeries
relabelConfigs []*relabel.Config
}{
{name: "plain"},
{name: "externalLabels", externalLabels: externalLabels},
{name: "relabel", relabelConfigs: relabelConfigs},
{
name: "externalLabels+relabel",
externalLabels: externalLabels,
relabelConfigs: relabelConfigs,
},
}
// numSeries chosen to be big enough that StoreSeries dominates creating a new queue manager.
const numSeries = 1000
_, series := createTimeseries(0, numSeries, extraLabels...)
for _, tc := range testCases {
b.Run(tc.name, func(b *testing.B) {
for i := 0; i < b.N; i++ {
c := NewTestWriteClient(config.RemoteWriteProtoMsgV1)
dir := b.TempDir()
cfg := config.DefaultQueueConfig
mcfg := config.DefaultMetadataConfig
metrics := newQueueManagerMetrics(nil, "", "")
m := NewQueueManager(metrics, nil, nil, nil, dir, newEWMARate(ewmaWeight, shardUpdateDuration), cfg, mcfg, labels.EmptyLabels(), nil, c, defaultFlushDeadline, newPool(), newHighestTimestampMetric(), nil, false, false, config.RemoteWriteProtoMsgV1)
m.externalLabels = tc.externalLabels
m.relabelConfigs = tc.relabelConfigs
m.StoreSeries(series, 0)
}
})
}
}
func BenchmarkStartup(b *testing.B) {
dir := os.Getenv("WALDIR")
if dir == "" {
b.Skip("WALDIR env var not set")
}
// Find the second largest segment; we will replay up to this.
// (Second largest as WALWatcher will start tailing the largest).
dirents, err := os.ReadDir(dir)
require.NoError(b, err)
var segments []int
for _, dirent := range dirents {
if i, err := strconv.Atoi(dirent.Name()); err != nil {
segments = append(segments, i)
}
}
sort.Ints(segments)
logger := log.NewLogfmtLogger(log.NewSyncWriter(os.Stdout))
logger = log.With(logger, "caller", log.DefaultCaller)
cfg := testDefaultQueueConfig()
mcfg := config.DefaultMetadataConfig
for n := 0; n < b.N; n++ {
metrics := newQueueManagerMetrics(nil, "", "")
c := NewTestBlockedWriteClient()
// todo: test with new proto type(s)
m := NewQueueManager(metrics, nil, nil, logger, dir,
newEWMARate(ewmaWeight, shardUpdateDuration),
cfg, mcfg, labels.EmptyLabels(), nil, c, 1*time.Minute, newPool(), newHighestTimestampMetric(), nil, false, false, config.RemoteWriteProtoMsgV1)
m.watcher.SetStartTime(timestamp.Time(math.MaxInt64))
m.watcher.MaxSegment = segments[len(segments)-2]
err := m.watcher.Run()
require.NoError(b, err)
}
}
func TestProcessExternalLabels(t *testing.T) {
b := labels.NewBuilder(labels.EmptyLabels())
for i, tc := range []struct {
labels labels.Labels
externalLabels []labels.Label
expected labels.Labels
}{
// Test adding labels at the end.
{
labels: labels.FromStrings("a", "b"),
externalLabels: []labels.Label{{Name: "c", Value: "d"}},
expected: labels.FromStrings("a", "b", "c", "d"),
},
// Test adding labels at the beginning.
{
labels: labels.FromStrings("c", "d"),
externalLabels: []labels.Label{{Name: "a", Value: "b"}},
expected: labels.FromStrings("a", "b", "c", "d"),
},
// Test we don't override existing labels.
{
labels: labels.FromStrings("a", "b"),
externalLabels: []labels.Label{{Name: "a", Value: "c"}},
expected: labels.FromStrings("a", "b"),
},
// Test empty externalLabels.
{
labels: labels.FromStrings("a", "b"),
externalLabels: []labels.Label{},
expected: labels.FromStrings("a", "b"),
},
// Test empty labels.
{
labels: labels.EmptyLabels(),
externalLabels: []labels.Label{{Name: "a", Value: "b"}},
expected: labels.FromStrings("a", "b"),
},
// Test labels is longer than externalLabels.
{
labels: labels.FromStrings("a", "b", "c", "d"),
externalLabels: []labels.Label{{Name: "e", Value: "f"}},
expected: labels.FromStrings("a", "b", "c", "d", "e", "f"),
},
// Test externalLabels is longer than labels.
{
labels: labels.FromStrings("c", "d"),
externalLabels: []labels.Label{{Name: "a", Value: "b"}, {Name: "e", Value: "f"}},
expected: labels.FromStrings("a", "b", "c", "d", "e", "f"),
},
// Adding with and without clashing labels.
{
labels: labels.FromStrings("a", "b", "c", "d"),
externalLabels: []labels.Label{{Name: "a", Value: "xxx"}, {Name: "c", Value: "yyy"}, {Name: "e", Value: "f"}},
expected: labels.FromStrings("a", "b", "c", "d", "e", "f"),
},
} {
b.Reset(tc.labels)
processExternalLabels(b, tc.externalLabels)
testutil.RequireEqual(t, tc.expected, b.Labels(), "test %d", i)
}
}
func TestCalculateDesiredShards(t *testing.T) {
cfg := config.DefaultQueueConfig
_, m := newTestClientAndQueueManager(t, defaultFlushDeadline, config.RemoteWriteProtoMsgV1)
samplesIn := m.dataIn
// Need to start the queue manager so the proper metrics are initialized.
// However we can stop it right away since we don't need to do any actual
// processing.
m.Start()
m.Stop()
inputRate := int64(50000)
var pendingSamples int64
// Two minute startup, no samples are sent.
startedAt := time.Now().Add(-2 * time.Minute)
// helper function for adding samples.
addSamples := func(s int64, ts time.Duration) {
pendingSamples += s
samplesIn.incr(s)
samplesIn.tick()
m.highestRecvTimestamp.Set(float64(startedAt.Add(ts).Unix()))
}
// helper function for sending samples.
sendSamples := func(s int64, ts time.Duration) {
pendingSamples -= s
m.dataOut.incr(s)
m.dataOutDuration.incr(int64(m.numShards) * int64(shardUpdateDuration))
// highest sent is how far back pending samples would be at our input rate.
highestSent := startedAt.Add(ts - time.Duration(pendingSamples/inputRate)*time.Second)
m.metrics.highestSentTimestamp.Set(float64(highestSent.Unix()))
m.lastSendTimestamp.Store(time.Now().Unix())
}
ts := time.Duration(0)
for ; ts < 120*time.Second; ts += shardUpdateDuration {
addSamples(inputRate*int64(shardUpdateDuration/time.Second), ts)
m.numShards = m.calculateDesiredShards()
require.Equal(t, 1, m.numShards)
}
// Assume 100ms per request, or 10 requests per second per shard.
// Shard calculation should never drop below barely keeping up.
minShards := int(inputRate) / cfg.MaxSamplesPerSend / 10
// This test should never go above 200 shards, that would be more resources than needed.
maxShards := 200
for ; ts < 15*time.Minute; ts += shardUpdateDuration {
sin := inputRate * int64(shardUpdateDuration/time.Second)
addSamples(sin, ts)
sout := int64(m.numShards*cfg.MaxSamplesPerSend) * int64(shardUpdateDuration/(100*time.Millisecond))
// You can't send samples that don't exist so cap at the number of pending samples.
if sout > pendingSamples {
sout = pendingSamples
}
sendSamples(sout, ts)
t.Log("desiredShards", m.numShards, "pendingSamples", pendingSamples)
m.numShards = m.calculateDesiredShards()
require.GreaterOrEqual(t, m.numShards, minShards, "Shards are too low. desiredShards=%d, minShards=%d, t_seconds=%d", m.numShards, minShards, ts/time.Second)
require.LessOrEqual(t, m.numShards, maxShards, "Shards are too high. desiredShards=%d, maxShards=%d, t_seconds=%d", m.numShards, maxShards, ts/time.Second)
}
require.Equal(t, int64(0), pendingSamples, "Remote write never caught up, there are still %d pending samples.", pendingSamples)
}
func TestCalculateDesiredShardsDetail(t *testing.T) {
_, m := newTestClientAndQueueManager(t, defaultFlushDeadline, config.RemoteWriteProtoMsgV1)
samplesIn := m.dataIn
for _, tc := range []struct {
name string
prevShards int
dataIn int64 // Quantities normalised to seconds.
dataOut int64
dataDropped int64
dataOutDuration float64
backlog float64
expectedShards int
}{
{
name: "nothing in or out 1",
prevShards: 1,
expectedShards: 1, // Shards stays the same.
},
{
name: "nothing in or out 10",
prevShards: 10,
expectedShards: 10, // Shards stays the same.
},
{
name: "steady throughput",
prevShards: 1,
dataIn: 10,
dataOut: 10,
dataOutDuration: 1,
expectedShards: 1,
},
{
name: "scale down",
prevShards: 10,
dataIn: 10,
dataOut: 10,
dataOutDuration: 5,
expectedShards: 5,
},
{
name: "scale down constrained",
prevShards: 7,
dataIn: 10,
dataOut: 10,
dataOutDuration: 5,
expectedShards: 7,
},
{
name: "scale up",
prevShards: 1,
dataIn: 10,
dataOut: 10,
dataOutDuration: 10,
expectedShards: 10,
},
{
name: "scale up constrained",
prevShards: 8,
dataIn: 10,
dataOut: 10,
dataOutDuration: 10,
expectedShards: 8,
},
{
name: "backlogged 20s",
prevShards: 2,
dataIn: 10,
dataOut: 10,
dataOutDuration: 2,
backlog: 20,
expectedShards: 4,
},
{
name: "backlogged 90s",
prevShards: 4,
dataIn: 10,
dataOut: 10,
dataOutDuration: 4,
backlog: 90,
expectedShards: 22,
},
{
name: "backlog reduced",
prevShards: 22,
dataIn: 10,
dataOut: 20,
dataOutDuration: 4,
backlog: 10,
expectedShards: 3,
},
{
name: "backlog eliminated",
prevShards: 3,
dataIn: 10,
dataOut: 10,
dataOutDuration: 2,
backlog: 0,
expectedShards: 2, // Shard back down.
},
{
name: "slight slowdown",
prevShards: 1,
dataIn: 10,
dataOut: 10,
dataOutDuration: 1.2,
expectedShards: 2, // 1.2 is rounded up to 2.
},
{
name: "bigger slowdown",
prevShards: 1,
dataIn: 10,
dataOut: 10,
dataOutDuration: 1.4,
expectedShards: 2,
},
{
name: "speed up",
prevShards: 2,
dataIn: 10,
dataOut: 10,
dataOutDuration: 1.2,
backlog: 0,
expectedShards: 2, // No reaction - 1.2 is rounded up to 2.
},
{
name: "speed up more",
prevShards: 2,
dataIn: 10,
dataOut: 10,
dataOutDuration: 0.9,
backlog: 0,
expectedShards: 1,
},
{
name: "marginal decision A",
prevShards: 3,
dataIn: 10,
dataOut: 10,
dataOutDuration: 2.01,
backlog: 0,
expectedShards: 3, // 2.01 rounds up to 3.
},
{
name: "marginal decision B",
prevShards: 3,
dataIn: 10,
dataOut: 10,
dataOutDuration: 1.99,
backlog: 0,
expectedShards: 2, // 1.99 rounds up to 2.
},
} {
t.Run(tc.name, func(t *testing.T) {
m.numShards = tc.prevShards
forceEMWA(samplesIn, tc.dataIn*int64(shardUpdateDuration/time.Second))
samplesIn.tick()
forceEMWA(m.dataOut, tc.dataOut*int64(shardUpdateDuration/time.Second))
forceEMWA(m.dataDropped, tc.dataDropped*int64(shardUpdateDuration/time.Second))
forceEMWA(m.dataOutDuration, int64(tc.dataOutDuration*float64(shardUpdateDuration)))
m.highestRecvTimestamp.value = tc.backlog // Not Set() because it can only increase value.
require.Equal(t, tc.expectedShards, m.calculateDesiredShards())
})
}
}
func forceEMWA(r *ewmaRate, rate int64) {
r.init = false
r.newEvents.Store(rate)
}
func TestQueueManagerMetrics(t *testing.T) {
reg := prometheus.NewPedanticRegistry()
metrics := newQueueManagerMetrics(reg, "name", "http://localhost:1234")
// Make sure metrics pass linting.
problems, err := client_testutil.GatherAndLint(reg)
require.NoError(t, err)
require.Empty(t, problems, "Metric linting problems detected: %v", problems)
// Make sure all metrics were unregistered. A failure here means you need
// unregister a metric in `queueManagerMetrics.unregister()`.
metrics.unregister()
err = client_testutil.GatherAndCompare(reg, strings.NewReader(""))
require.NoError(t, err)
}
func TestQueue_FlushAndShutdownDoesNotDeadlock(t *testing.T) {
capacity := 100
batchSize := 10
queue := newQueue(batchSize, capacity)
for i := 0; i < capacity+batchSize; i++ {
queue.Append(timeSeries{})
}
done := make(chan struct{})
go queue.FlushAndShutdown(done)
go func() {
// Give enough time for FlushAndShutdown to acquire the lock. queue.Batch()
// should not block forever even if the lock is acquired.
time.Sleep(10 * time.Millisecond)
queue.Batch()
close(done)
}()
select {
case <-done:
case <-time.After(2 * time.Second):
t.Error("Deadlock in FlushAndShutdown detected")
pprof.Lookup("goroutine").WriteTo(os.Stdout, 1)
t.FailNow()
}
}
func createDummyTimeSeries(instances int) []timeSeries {
metrics := []labels.Labels{
labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "0"),
labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "0.25"),
labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "0.5"),
labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "0.75"),
labels.FromStrings("__name__", "go_gc_duration_seconds", "quantile", "1"),
labels.FromStrings("__name__", "go_gc_duration_seconds_sum"),
labels.FromStrings("__name__", "go_gc_duration_seconds_count"),
labels.FromStrings("__name__", "go_memstats_alloc_bytes_total"),
labels.FromStrings("__name__", "go_memstats_frees_total"),
labels.FromStrings("__name__", "go_memstats_lookups_total"),
labels.FromStrings("__name__", "go_memstats_mallocs_total"),
labels.FromStrings("__name__", "go_goroutines"),
labels.FromStrings("__name__", "go_info", "version", "go1.19.3"),
labels.FromStrings("__name__", "go_memstats_alloc_bytes"),
labels.FromStrings("__name__", "go_memstats_buck_hash_sys_bytes"),
labels.FromStrings("__name__", "go_memstats_gc_sys_bytes"),
labels.FromStrings("__name__", "go_memstats_heap_alloc_bytes"),
labels.FromStrings("__name__", "go_memstats_heap_idle_bytes"),
labels.FromStrings("__name__", "go_memstats_heap_inuse_bytes"),
labels.FromStrings("__name__", "go_memstats_heap_objects"),
labels.FromStrings("__name__", "go_memstats_heap_released_bytes"),
labels.FromStrings("__name__", "go_memstats_heap_sys_bytes"),
labels.FromStrings("__name__", "go_memstats_last_gc_time_seconds"),
labels.FromStrings("__name__", "go_memstats_mcache_inuse_bytes"),
labels.FromStrings("__name__", "go_memstats_mcache_sys_bytes"),
labels.FromStrings("__name__", "go_memstats_mspan_inuse_bytes"),
labels.FromStrings("__name__", "go_memstats_mspan_sys_bytes"),
labels.FromStrings("__name__", "go_memstats_next_gc_bytes"),
labels.FromStrings("__name__", "go_memstats_other_sys_bytes"),
labels.FromStrings("__name__", "go_memstats_stack_inuse_bytes"),
labels.FromStrings("__name__", "go_memstats_stack_sys_bytes"),
labels.FromStrings("__name__", "go_memstats_sys_bytes"),
labels.FromStrings("__name__", "go_threads"),
}
commonLabels := labels.FromStrings(
"cluster", "some-cluster-0",
"container", "prometheus",
"job", "some-namespace/prometheus",
"namespace", "some-namespace")
var result []timeSeries
r := rand.New(rand.NewSource(0))
for i := 0; i < instances; i++ {
b := labels.NewBuilder(commonLabels)
b.Set("pod", "prometheus-"+strconv.Itoa(i))
for _, lbls := range metrics {
lbls.Range(func(l labels.Label) {
b.Set(l.Name, l.Value)
})
result = append(result, timeSeries{
seriesLabels: b.Labels(),
value: r.Float64(),
})
}
}
return result
}
func BenchmarkBuildWriteRequest(b *testing.B) {
noopLogger := log.NewNopLogger()
bench := func(b *testing.B, batch []timeSeries) {
buff := make([]byte, 0)
seriesBuff := make([]prompb.TimeSeries, len(batch))
for i := range seriesBuff {
seriesBuff[i].Samples = []prompb.Sample{{}}
seriesBuff[i].Exemplars = []prompb.Exemplar{{}}
}
pBuf := proto.NewBuffer(nil)
// Warmup buffers
for i := 0; i < 10; i++ {
populateTimeSeries(batch, seriesBuff, true, true)
buildWriteRequest(noopLogger, seriesBuff, nil, pBuf, &buff, nil, "snappy")
}
b.ResetTimer()
totalSize := 0
for i := 0; i < b.N; i++ {
populateTimeSeries(batch, seriesBuff, true, true)
req, _, _, err := buildWriteRequest(noopLogger, seriesBuff, nil, pBuf, &buff, nil, "snappy")
if err != nil {
b.Fatal(err)
}
totalSize += len(req)
b.ReportMetric(float64(totalSize)/float64(b.N), "compressedSize/op")
}
}
twoBatch := createDummyTimeSeries(2)
tenBatch := createDummyTimeSeries(10)
hundredBatch := createDummyTimeSeries(100)
b.Run("2 instances", func(b *testing.B) {
bench(b, twoBatch)
})
b.Run("10 instances", func(b *testing.B) {
bench(b, tenBatch)
})
b.Run("1k instances", func(b *testing.B) {
bench(b, hundredBatch)
})
}
func BenchmarkBuildV2WriteRequest(b *testing.B) {
noopLogger := log.NewNopLogger()
type testcase struct {
batch []timeSeries
}
testCases := []testcase{
{createDummyTimeSeries(2)},
{createDummyTimeSeries(10)},
{createDummyTimeSeries(100)},
}
for _, tc := range testCases {
symbolTable := writev2.NewSymbolTable()
buff := make([]byte, 0)
seriesBuff := make([]writev2.TimeSeries, len(tc.batch))
for i := range seriesBuff {
seriesBuff[i].Samples = []writev2.Sample{{}}
seriesBuff[i].Exemplars = []writev2.Exemplar{{}}
}
pBuf := []byte{}
// Warmup buffers
for i := 0; i < 10; i++ {
populateV2TimeSeries(&symbolTable, tc.batch, seriesBuff, true, true)
buildV2WriteRequest(noopLogger, seriesBuff, symbolTable.Symbols(), &pBuf, &buff, nil, "snappy")
}
b.Run(fmt.Sprintf("%d-instances", len(tc.batch)), func(b *testing.B) {
totalSize := 0
for j := 0; j < b.N; j++ {
populateV2TimeSeries(&symbolTable, tc.batch, seriesBuff, true, true)
b.ResetTimer()
req, _, _, err := buildV2WriteRequest(noopLogger, seriesBuff, symbolTable.Symbols(), &pBuf, &buff, nil, "snappy")
if err != nil {
b.Fatal(err)
}
symbolTable.Reset()
totalSize += len(req)
b.ReportMetric(float64(totalSize)/float64(b.N), "compressedSize/op")
}
})
}
}
func TestDropOldTimeSeries(t *testing.T) {
size := 10
nSeries := 6
nSamples := config.DefaultQueueConfig.Capacity * size
samples, newSamples, series := createTimeseriesWithOldSamples(nSamples, nSeries)
// TODO(alexg): test with new version
c := NewTestWriteClient(config.RemoteWriteProtoMsgV1)
c.expectSamples(newSamples, series)
cfg := config.DefaultQueueConfig
mcfg := config.DefaultMetadataConfig
cfg.MaxShards = 1
cfg.SampleAgeLimit = model.Duration(60 * time.Second)
m := newTestQueueManager(t, cfg, mcfg, defaultFlushDeadline, c, config.RemoteWriteProtoMsgV1)
m.StoreSeries(series, 0)
m.Start()
defer m.Stop()
m.Append(samples)
c.waitForExpectedData(t, 30*time.Second)
}
func TestIsSampleOld(t *testing.T) {
currentTime := time.Now()
require.True(t, isSampleOld(currentTime, 60*time.Second, timestamp.FromTime(currentTime.Add(-61*time.Second))))
require.False(t, isSampleOld(currentTime, 60*time.Second, timestamp.FromTime(currentTime.Add(-59*time.Second))))
}
// Simulates scenario in which remote write endpoint is down and a subset of samples is dropped due to age limit while backoffing.
func TestSendSamplesWithBackoffWithSampleAgeLimit(t *testing.T) {
maxSamplesPerSend := 10
sampleAgeLimit := time.Second
cfg := config.DefaultQueueConfig
cfg.MaxShards = 1
cfg.SampleAgeLimit = model.Duration(sampleAgeLimit)
// Set the batch send deadline to 5 minutes to effectively disable it.
cfg.BatchSendDeadline = model.Duration(time.Minute * 5)
cfg.Capacity = 10 * maxSamplesPerSend // more than the amount of data we append in the test
cfg.MaxBackoff = model.Duration(time.Millisecond * 100)
cfg.MinBackoff = model.Duration(time.Millisecond * 100)
cfg.MaxSamplesPerSend = maxSamplesPerSend
metadataCfg := config.DefaultMetadataConfig
metadataCfg.Send = true
metadataCfg.SendInterval = model.Duration(time.Second * 60)
metadataCfg.MaxSamplesPerSend = maxSamplesPerSend
c := NewTestWriteClient(config.RemoteWriteProtoMsgV1)
m := newTestQueueManager(t, cfg, metadataCfg, time.Second, c, config.RemoteWriteProtoMsgV1)
m.Start()
batchID := 0
expectedSamples := map[string][]prompb.Sample{}
appendData := func(numberOfSeries int, timeAdd time.Duration, shouldBeDropped bool) {
t.Log(">>>> Appending series ", numberOfSeries, " as batch ID ", batchID, " with timeAdd ", timeAdd, " and should be dropped ", shouldBeDropped)
samples, series := createTimeseriesWithRandomLabelCount(strconv.Itoa(batchID), numberOfSeries, timeAdd, 9)
m.StoreSeries(series, batchID)
sent := m.Append(samples)
require.True(t, sent, "samples not sent")
if !shouldBeDropped {
for _, s := range samples {
tsID := getSeriesIDFromRef(series[s.Ref])
expectedSamples[tsID] = append(c.expectedSamples[tsID], prompb.Sample{
Timestamp: s.T,
Value: s.V,
})
}
}
batchID++
}
timeShift := -time.Millisecond * 5
c.SetReturnError(RecoverableError{context.DeadlineExceeded, defaultBackoff})
appendData(maxSamplesPerSend/2, timeShift, true)
time.Sleep(sampleAgeLimit)
appendData(maxSamplesPerSend/2, timeShift, true)
time.Sleep(sampleAgeLimit / 10)
appendData(maxSamplesPerSend/2, timeShift, true)
time.Sleep(2 * sampleAgeLimit)
appendData(2*maxSamplesPerSend, timeShift, false)
time.Sleep(sampleAgeLimit / 2)
c.SetReturnError(nil)
appendData(5, timeShift, false)
m.Stop()
if diff := cmp.Diff(expectedSamples, c.receivedSamples); diff != "" {
t.Errorf("mismatch (-want +got):\n%s", diff)
}
}
func createTimeseriesWithRandomLabelCount(id string, seriesCount int, timeAdd time.Duration, maxLabels int) ([]record.RefSample, []record.RefSeries) {
samples := []record.RefSample{}
series := []record.RefSeries{}
// use a fixed rand source so tests are consistent
r := rand.New(rand.NewSource(99))
for i := 0; i < seriesCount; i++ {
s := record.RefSample{
Ref: chunks.HeadSeriesRef(i),
T: time.Now().Add(timeAdd).UnixMilli(),
V: r.Float64(),
}
samples = append(samples, s)
labelsCount := r.Intn(maxLabels)
lb := labels.NewScratchBuilder(1 + labelsCount)
lb.Add("__name__", "batch_"+id+"_id_"+strconv.Itoa(i))
for j := 1; j < labelsCount+1; j++ {
// same for both name and value
label := "batch_" + id + "_label_" + strconv.Itoa(j)
lb.Add(label, label)
}
series = append(series, record.RefSeries{
Ref: chunks.HeadSeriesRef(i),
Labels: lb.Labels(),
})
}
return samples, series
}
func createTimeseriesWithOldSamples(numSamples, numSeries int, extraLabels ...labels.Label) ([]record.RefSample, []record.RefSample, []record.RefSeries) {
newSamples := make([]record.RefSample, 0, numSamples)
samples := make([]record.RefSample, 0, numSamples)
series := make([]record.RefSeries, 0, numSeries)
lb := labels.NewScratchBuilder(1 + len(extraLabels))
for i := 0; i < numSeries; i++ {
name := fmt.Sprintf("test_metric_%d", i)
// We create half of the samples in the past.
past := timestamp.FromTime(time.Now().Add(-5 * time.Minute))
for j := 0; j < numSamples/2; j++ {
samples = append(samples, record.RefSample{
Ref: chunks.HeadSeriesRef(i),
T: past + int64(j),
V: float64(i),
})
}
for j := 0; j < numSamples/2; j++ {
sample := record.RefSample{
Ref: chunks.HeadSeriesRef(i),
T: int64(int(time.Now().UnixMilli()) + j),
V: float64(i),
}
samples = append(samples, sample)
newSamples = append(newSamples, sample)
}
// Create Labels that is name of series plus any extra labels supplied.
lb.Reset()
lb.Add(labels.MetricName, name)
for _, l := range extraLabels {
lb.Add(l.Name, l.Value)
}
lb.Sort()
series = append(series, record.RefSeries{
Ref: chunks.HeadSeriesRef(i),
Labels: lb.Labels(),
})
}
return samples, newSamples, series
}
func filterTsLimit(limit int64, ts prompb.TimeSeries) bool {
return limit > ts.Samples[0].Timestamp
}
func TestBuildTimeSeries(t *testing.T) {
testCases := []struct {
name string
ts []prompb.TimeSeries
filter func(ts prompb.TimeSeries) bool
lowestTs int64
highestTs int64
droppedSamples int
responseLen int
}{
{
name: "No filter applied",
ts: []prompb.TimeSeries{
{
Samples: []prompb.Sample{
{
Timestamp: 1234567890,
Value: 1.23,
},
},
},
{
Samples: []prompb.Sample{
{
Timestamp: 1234567891,
Value: 2.34,
},
},
},
{
Samples: []prompb.Sample{
{
Timestamp: 1234567892,
Value: 3.34,
},
},
},
},
filter: nil,
responseLen: 3,
lowestTs: 1234567890,
highestTs: 1234567892,
},
{
name: "Filter applied, samples in order",
ts: []prompb.TimeSeries{
{
Samples: []prompb.Sample{
{
Timestamp: 1234567890,
Value: 1.23,
},
},
},
{
Samples: []prompb.Sample{
{
Timestamp: 1234567891,
Value: 2.34,
},
},
},
{
Samples: []prompb.Sample{
{
Timestamp: 1234567892,
Value: 3.45,
},
},
},
{
Samples: []prompb.Sample{
{
Timestamp: 1234567893,
Value: 3.45,
},
},
},
},
filter: func(ts prompb.TimeSeries) bool { return filterTsLimit(1234567892, ts) },
responseLen: 2,
lowestTs: 1234567892,
highestTs: 1234567893,
droppedSamples: 2,
},
{
name: "Filter applied, samples out of order",
ts: []prompb.TimeSeries{
{
Samples: []prompb.Sample{
{
Timestamp: 1234567892,
Value: 3.45,
},
},
},
{
Samples: []prompb.Sample{
{
Timestamp: 1234567890,
Value: 1.23,
},
},
},
{
Samples: []prompb.Sample{
{
Timestamp: 1234567893,
Value: 3.45,
},
},
},
{
Samples: []prompb.Sample{
{
Timestamp: 1234567891,
Value: 2.34,
},
},
},
},
filter: func(ts prompb.TimeSeries) bool { return filterTsLimit(1234567892, ts) },
responseLen: 2,
lowestTs: 1234567892,
highestTs: 1234567893,
droppedSamples: 2,
},
{
name: "Filter applied, samples not consecutive",
ts: []prompb.TimeSeries{
{
Samples: []prompb.Sample{
{
Timestamp: 1234567890,
Value: 1.23,
},
},
},
{
Samples: []prompb.Sample{
{
Timestamp: 1234567892,
Value: 3.45,
},
},
},
{
Samples: []prompb.Sample{
{
Timestamp: 1234567895,
Value: 6.78,
},
},
},
{
Samples: []prompb.Sample{
{
Timestamp: 1234567897,
Value: 6.78,
},
},
},
},
filter: func(ts prompb.TimeSeries) bool { return filterTsLimit(1234567895, ts) },
responseLen: 2,
lowestTs: 1234567895,
highestTs: 1234567897,
droppedSamples: 2,
},
}
// Run the test cases
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
highest, lowest, result, droppedSamples, _, _ := buildTimeSeries(tc.ts, tc.filter)
require.NotNil(t, result)
require.Len(t, result, tc.responseLen)
require.Equal(t, tc.highestTs, highest)
require.Equal(t, tc.lowestTs, lowest)
require.Equal(t, tc.droppedSamples, droppedSamples)
})
}
}
func BenchmarkBuildTimeSeries(b *testing.B) {
// Send one sample per series, which is the typical remote_write case
const numSamples = 10000
filter := func(ts prompb.TimeSeries) bool { return filterTsLimit(99, ts) }
for i := 0; i < b.N; i++ {
samples := createProtoTimeseriesWithOld(numSamples, 100, extraLabels...)
_, _, result, _, _, _ := buildTimeSeries(samples, filter)
require.NotNil(b, result)
}
}