[ENHANCEMENT] Remote-Write: optionally use a DNS resolver that picks a random IP (#15329)
Some checks failed
buf.build / lint and publish (push) Has been cancelled
CI / Go tests (push) Has been cancelled
CI / More Go tests (push) Has been cancelled
CI / Go tests with previous Go version (push) Has been cancelled
CI / UI tests (push) Has been cancelled
CI / Go tests on Windows (push) Has been cancelled
CI / Mixins tests (push) Has been cancelled
CI / Build Prometheus for common architectures (0) (push) Has been cancelled
CI / Build Prometheus for common architectures (1) (push) Has been cancelled
CI / Build Prometheus for common architectures (2) (push) Has been cancelled
CI / Build Prometheus for all architectures (0) (push) Has been cancelled
CI / Build Prometheus for all architectures (1) (push) Has been cancelled
CI / Build Prometheus for all architectures (10) (push) Has been cancelled
CI / Build Prometheus for all architectures (11) (push) Has been cancelled
CI / Build Prometheus for all architectures (2) (push) Has been cancelled
CI / Build Prometheus for all architectures (3) (push) Has been cancelled
CI / Build Prometheus for all architectures (4) (push) Has been cancelled
CI / Build Prometheus for all architectures (5) (push) Has been cancelled
CI / Build Prometheus for all architectures (6) (push) Has been cancelled
CI / Build Prometheus for all architectures (7) (push) Has been cancelled
CI / Build Prometheus for all architectures (8) (push) Has been cancelled
CI / Build Prometheus for all architectures (9) (push) Has been cancelled
CI / Check generated parser (push) Has been cancelled
CI / golangci-lint (push) Has been cancelled
CI / fuzzing (push) Has been cancelled
CI / codeql (push) Has been cancelled
Scorecards supply-chain security / Scorecards analysis (push) Has been cancelled
CI / Report status of build Prometheus for all architectures (push) Has been cancelled
CI / Publish main branch artifacts (push) Has been cancelled
CI / Publish release artefacts (push) Has been cancelled
CI / Publish UI on npm Registry (push) Has been cancelled

When remote-write targets a host name that resolves to multiple IP addresses, this PR makes it possible to force each new connection used for remote-write requests to go to an IP address chosen at random from those the host name resolves to. The default behavior remains unchanged, i.e., the IP address used to create the connection remains the one chosen by Go.

This is an experimental feature, it is disabled by default.

Signed-off-by: Yuri Nikolic <durica.nikolic@grafana.com>
This commit is contained in:
Đurica Yuri Nikolić 2024-11-15 16:41:49 +01:00 committed by GitHub
parent e13c28bd4a
commit 101b1c307f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 237 additions and 1 deletions

View file

@ -1195,6 +1195,7 @@ type RemoteWriteConfig struct {
Name string `yaml:"name,omitempty"`
SendExemplars bool `yaml:"send_exemplars,omitempty"`
SendNativeHistograms bool `yaml:"send_native_histograms,omitempty"`
RoundRobinDNS bool `yaml:"round_robin_dns,omitempty"`
// ProtobufMessage specifies the protobuf message to use against the remote
// receiver as specified in https://prometheus.io/docs/specs/remote_write_spec_2_0/
ProtobufMessage RemoteWriteProtoMsg `yaml:"protobuf_message,omitempty"`

View file

@ -2797,6 +2797,12 @@ write_relabel_configs:
# For the `io.prometheus.write.v2.Request` message, this option is noop (always true).
[ send_native_histograms: <boolean> | default = false ]
# When enabled, remote-write will resolve the URL host name via DNS, choose one of the IP addresses at random, and connect to it.
# When disabled, remote-write relies on Go's standard behavior, which is to try to connect to each address in turn.
# The connection timeout applies to the whole operation, i.e. in the latter case it is spread over all attempts.
# This is an experimental feature, and its behavior might still change, or even get removed.
[ round_robin_dns: <boolean> | default = false ]
# Optionally configures AWS's Signature Verification 4 signing process to
# sign requests. Cannot be set at the same time as basic_auth, authorization, oauth2, or azuread.
# To use the default credentials from the AWS SDK, use `sigv4: {}`.

View file

@ -145,6 +145,7 @@ type ClientConfig struct {
RetryOnRateLimit bool
WriteProtoMsg config.RemoteWriteProtoMsg
ChunkedReadLimit uint64
RoundRobinDNS bool
}
// ReadClient will request the STREAMED_XOR_CHUNKS method of remote read but can
@ -180,7 +181,11 @@ func NewReadClient(name string, conf *ClientConfig) (ReadClient, error) {
// NewWriteClient creates a new client for remote write.
func NewWriteClient(name string, conf *ClientConfig) (WriteClient, error) {
httpClient, err := config_util.NewClientFromConfig(conf.HTTPClientConfig, "remote_storage_write_client")
var httpOpts []config_util.HTTPClientOption
if conf.RoundRobinDNS {
httpOpts = []config_util.HTTPClientOption{config_util.WithDialContextFunc(newDialContextWithRoundRobinDNS().dialContextFn())}
}
httpClient, err := config_util.NewClientFromConfig(conf.HTTPClientConfig, "remote_storage_write_client", httpOpts...)
if err != nil {
return nil, err
}

View file

@ -0,0 +1,62 @@
// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package remote
import (
	"context"
	"math/rand"
	"net"
	"net/http"
	"sync"
	"time"

	"github.com/prometheus/common/config"
)
// hostResolver is the single piece of DNS functionality the round-robin dialer
// depends on. net.DefaultResolver satisfies it; tests can plug in a fake.
type hostResolver interface {
	// LookupHost resolves host and returns the addresses found for it.
	LookupHost(ctx context.Context, host string) (addrs []string, err error)
}
// dialContextWithRoundRobinDNS holds the collaborators used by dialContextFn:
// a dial function, a host name resolver and a random number generator used to
// pick one of the resolved addresses.
type dialContextWithRoundRobinDNS struct {
	// dialContext opens the connection, either to the randomly picked address
	// or to the original address when no pick could be made.
	dialContext config.DialContextFunc
	// resolver turns the host part of the dialed address into a list of addresses.
	resolver hostResolver
	// rand selects the index of the address to dial.
	// NOTE(review): no locking is visible around its use — confirm that the
	// generator stored here tolerates concurrent dials.
	rand *rand.Rand
}
// lockedRandSource is a rand.Source whose every call is serialized by a mutex.
// Sources returned by rand.NewSource are documented as not safe for concurrent
// use, whereas the dial function built on top of the generator is handed to an
// HTTP client and may therefore be invoked from several goroutines at once.
type lockedRandSource struct {
	mu  sync.Mutex
	src rand.Source
}

// Int63 implements rand.Source.
func (s *lockedRandSource) Int63() int64 {
	s.mu.Lock()
	defer s.mu.Unlock()
	return s.src.Int63()
}

// Seed implements rand.Source.
func (s *lockedRandSource) Seed(seed int64) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.src.Seed(seed)
}

// newDialContextWithRoundRobinDNS creates a new dialContextWithRoundRobinDNS
// that dials through the default HTTP transport's DialContext, resolves host
// names with net.DefaultResolver and picks addresses with a time-seeded
// random generator.
//
// We discourage creating new instances of struct dialContextWithRoundRobinDNS by explicitly setting its members,
// except for testing purposes, and recommend using newDialContextWithRoundRobinDNS.
func newDialContextWithRoundRobinDNS() *dialContextWithRoundRobinDNS {
	// Seed with nanosecond resolution: with a one-second seed, dialers created
	// within the same second would all walk through the same sequence of picks.
	seed := time.Now().UnixNano()
	return &dialContextWithRoundRobinDNS{
		dialContext: http.DefaultTransport.(*http.Transport).DialContext,
		resolver:    net.DefaultResolver,
		// The source is wrapped in a lock so that concurrent dials do not race
		// on the generator's state (Intn draws its randomness from the source).
		rand: rand.New(&lockedRandSource{src: rand.NewSource(seed)}),
	}
}
// dialContextFn returns a dial function that, when possible, replaces the host
// of the requested address with one of its resolved addresses chosen at random.
//
// The requested address is dialed unchanged when it cannot be split into host
// and port, when the lookup fails, or when the lookup yields no addresses.
func (dc *dialContextWithRoundRobinDNS) dialContextFn() config.DialContextFunc {
	return func(ctx context.Context, network, addr string) (net.Conn, error) {
		// Start from the caller's address and only override it on a successful pick.
		target := addr
		if host, port, splitErr := net.SplitHostPort(addr); splitErr == nil {
			if candidates, lookupErr := dc.resolver.LookupHost(ctx, host); lookupErr == nil && len(candidates) > 0 {
				picked := candidates[dc.rand.Intn(len(candidates))]
				target = net.JoinHostPort(picked, port)
			}
		}
		return dc.dialContext(ctx, network, target)
	}
}

View file

@ -0,0 +1,161 @@
// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package remote
import (
"context"
"errors"
"math/rand"
"net"
"sync"
"testing"
"time"
"github.com/prometheus/common/config"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
)
// Fixtures shared by the tests in this file.
const (
	testNetwork = "tcp"
	// testAddrWithoutPort is a host name with no ":port" suffix.
	testAddrWithoutPort = "this-is-my-addr.without-port"
	// testAddrWithPort is the same host name followed by ":123", matching testPort.
	testAddrWithPort = "this-is-my-addr.without-port:123"
	testPort = "123"
	ip1 = "1.2.3.4"
	ip2 = "5.6.7.8"
	ip3 = "9.0.1.2"
	// randSeed is a fixed seed for tests that need a reproducible random sequence.
	randSeed int64 = 123456789
)
var (
	// errMockLookupHost is the error returned by mockedLookupHost when it is configured to fail.
	errMockLookupHost = errors.New("this is a mocked error")
	// testLookupResult is the list of addresses a successful mocked lookup returns.
	testLookupResult = []string{ip1, ip2, ip3}
	// testLookupResultWithPort holds the entries of testLookupResult joined with testPort.
	testLookupResultWithPort = []string{net.JoinHostPort(ip1, testPort), net.JoinHostPort(ip2, testPort), net.JoinHostPort(ip3, testPort)}
)
// mockDialContext is a testify-based stand-in for a dial function that also
// counts how many times each address has been dialed.
type mockDialContext struct {
	mock.Mock
	// addrFrequencyMu guards addrFrequency.
	addrFrequencyMu sync.Mutex
	// addrFrequency maps a dialed address to the number of dials it received.
	addrFrequency map[string]int
}
// newMockDialContext builds a mockDialContext with an empty dial counter and
// registers a "dialContext" expectation, returning (nil, nil), for every
// address in acceptableAddresses.
func newMockDialContext(acceptableAddresses []string) *mockDialContext {
	// The mutex needs no explicit initialisation: its zero value is ready to use.
	mocked := new(mockDialContext)
	mocked.addrFrequency = map[string]int{}
	for i := range acceptableAddresses {
		mocked.On("dialContext", mock.Anything, mock.Anything, acceptableAddresses[i]).Return(nil, nil)
	}
	return mocked
}
// dialContext records the call with the mock, bumps the dial counter of addr
// and returns a nil connection together with the error configured on the mock.
func (dc *mockDialContext) dialContext(ctx context.Context, network, addr string) (net.Conn, error) {
	dc.addrFrequencyMu.Lock()
	defer dc.addrFrequencyMu.Unlock()

	// The mock never hands out a real connection.
	var noConn net.Conn
	recorded := dc.MethodCalled("dialContext", ctx, network, addr)
	dc.addrFrequency[addr]++
	return noConn, recorded.Error(1)
}
// getCount reports how many times addr has been dialed so far.
func (dc *mockDialContext) getCount(addr string) int {
	dc.addrFrequencyMu.Lock()
	seen := dc.addrFrequency[addr]
	dc.addrFrequencyMu.Unlock()
	return seen
}
// mockedLookupHost is a canned hostResolver for tests.
type mockedLookupHost struct {
	// withErr makes LookupHost fail with errMockLookupHost.
	withErr bool
	// result is what LookupHost returns when withErr is false.
	result []string
}
// LookupHost ignores its arguments and returns the configured result, or
// errMockLookupHost when the mock was set up to fail.
func (lh *mockedLookupHost) LookupHost(context.Context, string) ([]string, error) {
	if !lh.withErr {
		return lh.result, nil
	}
	return nil, errMockLookupHost
}
// createDialContextWithRoundRobinDNS assembles a dialContextWithRoundRobinDNS
// from explicitly supplied collaborators, so tests can inject fakes.
func createDialContextWithRoundRobinDNS(dialContext config.DialContextFunc, resolver hostResolver, r *rand.Rand) dialContextWithRoundRobinDNS {
	var dc dialContextWithRoundRobinDNS
	dc.dialContext = dialContext
	dc.resolver = resolver
	dc.rand = r
	return dc
}
// TestDialContextWithRandomConnections exercises the dial function produced by
// dialContextFn: the fallback paths must dial the original address on every
// run, and a successful lookup must not always dial the first resolved address.
func TestDialContextWithRandomConnections(t *testing.T) {
	numberOfRuns := 2 * len(testLookupResult)
	// mdc is (re)created by each case's setup and inspected by its check.
	var mdc *mockDialContext

	testCases := map[string]struct {
		addr  string
		setup func() dialContextWithRoundRobinDNS
		// check receives the subtest's t, so that a failed requirement stops
		// the subtest it belongs to rather than calling FailNow on the parent
		// test from the subtest's goroutine.
		check func(t *testing.T)
	}{
		"if address contains no port call default DialContext": {
			addr: testAddrWithoutPort,
			setup: func() dialContextWithRoundRobinDNS {
				mdc = newMockDialContext([]string{testAddrWithoutPort})
				return createDialContextWithRoundRobinDNS(mdc.dialContext, &mockedLookupHost{withErr: false}, rand.New(rand.NewSource(time.Now().Unix())))
			},
			check: func(t *testing.T) {
				require.Equal(t, numberOfRuns, mdc.getCount(testAddrWithoutPort))
			},
		},
		"if lookup host returns error call default DialContext": {
			addr: testAddrWithPort,
			setup: func() dialContextWithRoundRobinDNS {
				mdc = newMockDialContext([]string{testAddrWithPort})
				return createDialContextWithRoundRobinDNS(mdc.dialContext, &mockedLookupHost{withErr: true}, rand.New(rand.NewSource(time.Now().Unix())))
			},
			check: func(t *testing.T) {
				require.Equal(t, numberOfRuns, mdc.getCount(testAddrWithPort))
			},
		},
		"if lookup returns no addresses call default DialContext": {
			addr: testAddrWithPort,
			setup: func() dialContextWithRoundRobinDNS {
				mdc = newMockDialContext([]string{testAddrWithPort})
				return createDialContextWithRoundRobinDNS(mdc.dialContext, &mockedLookupHost{}, rand.New(rand.NewSource(time.Now().Unix())))
			},
			check: func(t *testing.T) {
				require.Equal(t, numberOfRuns, mdc.getCount(testAddrWithPort))
			},
		},
		"if lookup host is successful, shuffle results": {
			addr: testAddrWithPort,
			setup: func() dialContextWithRoundRobinDNS {
				mdc = newMockDialContext(testLookupResultWithPort)
				// A fixed seed keeps the sequence of picks reproducible.
				return createDialContextWithRoundRobinDNS(mdc.dialContext, &mockedLookupHost{result: testLookupResult}, rand.New(rand.NewSource(randSeed)))
			},
			check: func(t *testing.T) {
				// we ensure that not all runs will choose the first element of the lookup
				require.NotEqual(t, numberOfRuns, mdc.getCount(testLookupResultWithPort[0]))
			},
		},
	}

	for name, tc := range testCases {
		t.Run(name, func(t *testing.T) {
			dc := tc.setup()
			require.NotNil(t, dc)
			for i := 0; i < numberOfRuns; i++ {
				_, err := dc.dialContextFn()(context.Background(), testNetwork, tc.addr)
				require.NoError(t, err)
			}
			tc.check(t)
		})
	}
}

View file

@ -180,6 +180,7 @@ func (rws *WriteStorage) ApplyConfig(conf *config.Config) error {
GoogleIAMConfig: rwConf.GoogleIAMConfig,
Headers: rwConf.Headers,
RetryOnRateLimit: rwConf.QueueConfig.RetryOnRateLimit,
RoundRobinDNS: rwConf.RoundRobinDNS,
})
if err != nil {
return err