prometheus/discovery/consul/consul_test.go

491 lines
13 KiB
Go
Raw Normal View History

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package consul
import (
"context"
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
"net/http"
"net/http/httptest"
"net/url"
"testing"
"time"
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
"github.com/go-kit/log"
"github.com/prometheus/common/config"
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
"go.uber.org/goleak"
"gopkg.in/yaml.v2"
"github.com/prometheus/prometheus/discovery/targetgroup"
)
func TestMain(m *testing.M) {
goleak.VerifyTestMain(m)
}
func TestConfiguredService(t *testing.T) {
Refactor SD configuration to remove `config` dependency (#3629) * refactor: move targetGroup struct and CheckOverflow() to their own package * refactor: move auth and security related structs to a utility package, fix import error in utility package * refactor: Azure SD, remove SD struct from config * refactor: DNS SD, remove SD struct from config into dns package * refactor: ec2 SD, move SD struct from config into the ec2 package * refactor: file SD, move SD struct from config to file discovery package * refactor: gce, move SD struct from config to gce discovery package * refactor: move HTTPClientConfig and URL into util/config, fix import error in httputil * refactor: consul, move SD struct from config into consul discovery package * refactor: marathon, move SD struct from config into marathon discovery package * refactor: triton, move SD struct from config to triton discovery package, fix test * refactor: zookeeper, move SD structs from config to zookeeper discovery package * refactor: openstack, remove SD struct from config, move into openstack discovery package * refactor: kubernetes, move SD struct from config into kubernetes discovery package * refactor: notifier, use targetgroup package instead of config * refactor: tests for file, marathon, triton SD - use targetgroup package instead of config.TargetGroup * refactor: retrieval, use targetgroup package instead of config.TargetGroup * refactor: storage, use config util package * refactor: discovery manager, use targetgroup package instead of config.TargetGroup * refactor: use HTTPClient and TLS config from configUtil instead of config * refactor: tests, use targetgroup package instead of config.TargetGroup * refactor: fix tagetgroup.Group pointers that were removed by mistake * refactor: openstack, kubernetes: drop prefixes * refactor: remove import aliases forced due to vscode bug * refactor: move main SD struct out of config into discovery/config * refactor: rename configUtil to config_util * refactor: rename yamlUtil to yaml_config * refactor: kubernetes, remove prefixes * refactor: move the TargetGroup package to discovery/ * refactor: fix order of imports
2017-12-29 12:01:34 -08:00
conf := &SDConfig{
Services: []string{"configuredServiceName"},
}
2017-08-11 11:45:52 -07:00
consulDiscovery, err := NewDiscovery(conf, nil)
if err != nil {
t.Errorf("Unexpected error when initializing discovery %v", err)
}
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
if !consulDiscovery.shouldWatch("configuredServiceName", []string{""}) {
t.Errorf("Expected service %s to be watched", "configuredServiceName")
}
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
if consulDiscovery.shouldWatch("nonConfiguredServiceName", []string{""}) {
t.Errorf("Expected service %s to not be watched", "nonConfiguredServiceName")
}
}
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
func TestConfiguredServiceWithTag(t *testing.T) {
conf := &SDConfig{
Services: []string{"configuredServiceName"},
ServiceTags: []string{"http"},
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
}
consulDiscovery, err := NewDiscovery(conf, nil)
if err != nil {
t.Errorf("Unexpected error when initializing discovery %v", err)
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
}
if consulDiscovery.shouldWatch("configuredServiceName", []string{""}) {
t.Errorf("Expected service %s to not be watched without tag", "configuredServiceName")
}
if !consulDiscovery.shouldWatch("configuredServiceName", []string{"http"}) {
t.Errorf("Expected service %s to be watched with tag %s", "configuredServiceName", "http")
}
if consulDiscovery.shouldWatch("nonConfiguredServiceName", []string{""}) {
t.Errorf("Expected service %s to not be watched without tag", "nonConfiguredServiceName")
}
if consulDiscovery.shouldWatch("nonConfiguredServiceName", []string{"http"}) {
t.Errorf("Expected service %s to not be watched with tag %s", "nonConfiguredServiceName", "http")
}
}
func TestConfiguredServiceWithTags(t *testing.T) {
type testcase struct {
// What we've configured to watch.
conf *SDConfig
// The service we're checking if we should watch or not.
serviceName string
serviceTags []string
shouldWatch bool
}
cases := []testcase{
{
conf: &SDConfig{
Services: []string{"configuredServiceName"},
ServiceTags: []string{"http", "v1"},
},
serviceName: "configuredServiceName",
serviceTags: []string{""},
shouldWatch: false,
},
{
conf: &SDConfig{
Services: []string{"configuredServiceName"},
ServiceTags: []string{"http", "v1"},
},
serviceName: "configuredServiceName",
serviceTags: []string{"http", "v1"},
shouldWatch: true,
},
{
conf: &SDConfig{
Services: []string{"configuredServiceName"},
ServiceTags: []string{"http", "v1"},
},
serviceName: "nonConfiguredServiceName",
serviceTags: []string{""},
shouldWatch: false,
},
{
conf: &SDConfig{
Services: []string{"configuredServiceName"},
ServiceTags: []string{"http", "v1"},
},
serviceName: "nonConfiguredServiceName",
serviceTags: []string{"http, v1"},
shouldWatch: false,
},
{
conf: &SDConfig{
Services: []string{"configuredServiceName"},
ServiceTags: []string{"http", "v1"},
},
serviceName: "configuredServiceName",
serviceTags: []string{"http", "v1", "foo"},
shouldWatch: true,
},
{
conf: &SDConfig{
Services: []string{"configuredServiceName"},
ServiceTags: []string{"http", "v1", "foo"},
},
serviceName: "configuredServiceName",
serviceTags: []string{"http", "v1", "foo"},
shouldWatch: true,
},
{
conf: &SDConfig{
Services: []string{"configuredServiceName"},
ServiceTags: []string{"http", "v1"},
},
serviceName: "configuredServiceName",
serviceTags: []string{"http", "v1", "v1"},
shouldWatch: true,
},
}
for _, tc := range cases {
consulDiscovery, err := NewDiscovery(tc.conf, nil)
if err != nil {
t.Errorf("Unexpected error when initializing discovery %v", err)
}
ret := consulDiscovery.shouldWatch(tc.serviceName, tc.serviceTags)
if ret != tc.shouldWatch {
t.Errorf("Expected should watch? %t, got %t. Watched service and tags: %s %+v, input was %s %+v", tc.shouldWatch, ret, tc.conf.Services, tc.conf.ServiceTags, tc.serviceName, tc.serviceTags)
}
}
}
func TestNonConfiguredService(t *testing.T) {
Refactor SD configuration to remove `config` dependency (#3629) * refactor: move targetGroup struct and CheckOverflow() to their own package * refactor: move auth and security related structs to a utility package, fix import error in utility package * refactor: Azure SD, remove SD struct from config * refactor: DNS SD, remove SD struct from config into dns package * refactor: ec2 SD, move SD struct from config into the ec2 package * refactor: file SD, move SD struct from config to file discovery package * refactor: gce, move SD struct from config to gce discovery package * refactor: move HTTPClientConfig and URL into util/config, fix import error in httputil * refactor: consul, move SD struct from config into consul discovery package * refactor: marathon, move SD struct from config into marathon discovery package * refactor: triton, move SD struct from config to triton discovery package, fix test * refactor: zookeeper, move SD structs from config to zookeeper discovery package * refactor: openstack, remove SD struct from config, move into openstack discovery package * refactor: kubernetes, move SD struct from config into kubernetes discovery package * refactor: notifier, use targetgroup package instead of config * refactor: tests for file, marathon, triton SD - use targetgroup package instead of config.TargetGroup * refactor: retrieval, use targetgroup package instead of config.TargetGroup * refactor: storage, use config util package * refactor: discovery manager, use targetgroup package instead of config.TargetGroup * refactor: use HTTPClient and TLS config from configUtil instead of config * refactor: tests, use targetgroup package instead of config.TargetGroup * refactor: fix tagetgroup.Group pointers that were removed by mistake * refactor: openstack, kubernetes: drop prefixes * refactor: remove import aliases forced due to vscode bug * refactor: move main SD struct out of config into discovery/config * refactor: rename configUtil to config_util * refactor: rename yamlUtil to yaml_config * refactor: kubernetes, remove prefixes * refactor: move the TargetGroup package to discovery/ * refactor: fix order of imports
2017-12-29 12:01:34 -08:00
conf := &SDConfig{}
2017-08-11 11:45:52 -07:00
consulDiscovery, err := NewDiscovery(conf, nil)
if err != nil {
t.Errorf("Unexpected error when initializing discovery %v", err)
}
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
if !consulDiscovery.shouldWatch("nonConfiguredServiceName", []string{""}) {
t.Errorf("Expected service %s to be watched", "nonConfiguredServiceName")
}
}
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
const (
AgentAnswer = `{"Config": {"Datacenter": "test-dc"}}`
ServiceTestAnswer = `
[{
"Node": {
"ID": "b78c2e48-5ef3-1814-31b8-0d880f50471e",
"Node": "node1",
"Address": "1.1.1.1",
"Datacenter": "test-dc",
"TaggedAddresses": {
"lan": "192.168.10.10",
"wan": "10.0.10.10"
},
"Meta": {"rack_name": "2304"},
"CreateIndex": 1,
"ModifyIndex": 1
},
"Service": {
"ID": "test",
"Service": "test",
"Tags": ["tag1"],
"Address": "",
"Meta": {"version":"1.0.0","environment":"staging"},
"Port": 3341,
"Weights": {
"Passing": 1,
"Warning": 1
},
"EnableTagOverride": false,
"ProxyDestination": "",
"Proxy": {},
"Connect": {},
"CreateIndex": 1,
"ModifyIndex": 1
},
"Checks": [{
"Node": "node1",
"CheckID": "serfHealth",
"Name": "Serf Health Status",
"Status": "passing"
}]
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
}]`
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
ServicesTestAnswer = `{"test": ["tag1"], "other": ["tag2"]}`
)
func newServer(t *testing.T) (*httptest.Server, *SDConfig) {
// github.com/hashicorp/consul/testutil/ would be nice but it needs a local consul binary.
stub := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
response := ""
switch r.URL.String() {
case "/v1/agent/self":
response = AgentAnswer
case "/v1/health/service/test?node-meta=rack_name%3A2304&stale=&tag=tag1&wait=120000ms":
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
response = ServiceTestAnswer
case "/v1/health/service/test?wait=120000ms":
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
response = ServiceTestAnswer
case "/v1/health/service/other?wait=120000ms":
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
response = `[]`
case "/v1/catalog/services?node-meta=rack_name%3A2304&stale=&wait=120000ms":
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
response = ServicesTestAnswer
case "/v1/catalog/services?wait=120000ms":
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
response = ServicesTestAnswer
case "/v1/catalog/services?index=1&node-meta=rack_name%3A2304&stale=&wait=120000ms":
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
time.Sleep(5 * time.Second)
response = ServicesTestAnswer
case "/v1/catalog/services?index=1&wait=120000ms":
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
time.Sleep(5 * time.Second)
response = ServicesTestAnswer
default:
t.Errorf("Unhandled consul call: %s", r.URL)
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
}
w.Header().Add("X-Consul-Index", "1")
w.Write([]byte(response))
}))
stuburl, err := url.Parse(stub.URL)
require.NoError(t, err)
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
config := &SDConfig{
Server: stuburl.Host,
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
Token: "fake-token",
RefreshInterval: model.Duration(1 * time.Second),
}
return stub, config
}
func newDiscovery(t *testing.T, config *SDConfig) *Discovery {
logger := log.NewNopLogger()
d, err := NewDiscovery(config, logger)
require.NoError(t, err)
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
return d
}
func checkOneTarget(t *testing.T, tg []*targetgroup.Group) {
require.Equal(t, 1, len(tg))
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
target := tg[0]
require.Equal(t, "test-dc", string(target.Labels["__meta_consul_dc"]))
require.Equal(t, target.Source, string(target.Labels["__meta_consul_service"]))
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
if target.Source == "test" {
// test service should have one node.
require.Greater(t, len(target.Targets), 0, "Test service should have one node")
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
}
}
// Watch all the services in the catalog.
func TestAllServices(t *testing.T) {
stub, config := newServer(t)
defer stub.Close()
d := newDiscovery(t, config)
ctx, cancel := context.WithCancel(context.Background())
ch := make(chan []*targetgroup.Group)
go func() {
d.Run(ctx, ch)
close(ch)
}()
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
checkOneTarget(t, <-ch)
checkOneTarget(t, <-ch)
cancel()
<-ch
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
}
// targetgroup with no targets is emitted if no services were discovered.
func TestNoTargets(t *testing.T) {
stub, config := newServer(t)
defer stub.Close()
config.ServiceTags = []string{"missing"}
d := newDiscovery(t, config)
ctx, cancel := context.WithCancel(context.Background())
ch := make(chan []*targetgroup.Group)
go func() {
d.Run(ctx, ch)
close(ch)
}()
targets := (<-ch)[0].Targets
require.Equal(t, 0, len(targets))
cancel()
<-ch
}
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
// Watch only the test service.
func TestOneService(t *testing.T) {
stub, config := newServer(t)
defer stub.Close()
config.Services = []string{"test"}
d := newDiscovery(t, config)
ctx, cancel := context.WithCancel(context.Background())
ch := make(chan []*targetgroup.Group)
go d.Run(ctx, ch)
checkOneTarget(t, <-ch)
cancel()
}
// Watch the test service with a specific tag and node-meta.
func TestAllOptions(t *testing.T) {
stub, config := newServer(t)
defer stub.Close()
config.Services = []string{"test"}
config.NodeMeta = map[string]string{"rack_name": "2304"}
config.ServiceTags = []string{"tag1"}
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
config.AllowStale = true
config.Token = "fake-token"
d := newDiscovery(t, config)
ctx, cancel := context.WithCancel(context.Background())
ch := make(chan []*targetgroup.Group)
go func() {
d.Run(ctx, ch)
close(ch)
}()
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
checkOneTarget(t, <-ch)
cancel()
<-ch
consul: improve consul service discovery (#3814) * consul: improve consul service discovery Related to #3711 - Add the ability to filter by tag and node-meta in an efficient way (`/catalog/services` allow filtering by node-meta, and returns a `map[string]string` or `service`->`tags`). Tags and nore-meta are also used in `/catalog/service` requests. - Do not require a call to the catalog if services are specified by name. This is important because on large cluster `/catalog/services` changes all the time. - Add `allow_stale` configuration option to do stale reads. Non-stale reads can be costly, even more when you are doing them to a remote datacenter with 10k+ targets over WAN (which is common for federation). - Add `refresh_interval` to minimize the strain on the catalog and on the service endpoint. This is needed because of that kind of behavior from consul: https://github.com/hashicorp/consul/issues/3712 and because a catalog on a large cluster would basically change *all* the time. No need to discover targets in 1sec if we scrape them every minute. - Added plenty of unit tests. Benchmarks ---------- ```yaml scrape_configs: - job_name: prometheus scrape_interval: 60s static_configs: - targets: ["127.0.0.1:9090"] - job_name: "observability-by-tag" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 tag: marathon-user-observability # Used in After refresh_interval: 30s # Used in After+delay relabel_configs: - source_labels: [__meta_consul_tags] regex: ^(.*,)?marathon-user-observability(,.*)?$ action: keep - job_name: "observability-by-name" scrape_interval: "60s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - observability-cerebro - observability-portal-web - job_name: "fake-fake-fake" scrape_interval: "15s" metrics_path: "/metrics" consul_sd_configs: - server: consul.service.par.consul.prod.crto.in:8500 services: - fake-fake-fake ``` Note: tested with ~1200 services, ~5000 nodes. | Resource | Empty | Before | After | After + delay | | -------- |:-----:|:------:|:-----:|:-------------:| |/service-discovery size|5K|85MiB|27k|27k|27k| |`go_memstats_heap_objects`|100k|1M|120k|110k| |`go_memstats_heap_alloc_bytes`|24MB|150MB|28MB|27MB| |`rate(go_memstats_alloc_bytes_total[5m])`|0.2MB/s|28MB/s|2MB/s|0.3MB/s| |`rate(process_cpu_seconds_total[5m])`|0.1%|15%|2%|0.01%| |`process_open_fds`|16|*1236*|22|22| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="services"}[5m])`|~0|1|1|*0.03*| |`rate(prometheus_sd_consul_rpc_duration_seconds_count{call="service"}[5m])`|0.1|*80*|0.5|0.5| |`prometheus_target_sync_length_seconds{quantile="0.9",scrape_job="observability-by-tag"}`|N/A|200ms|0.2ms|0.2ms| |Network bandwidth|~10kbps|~2.8Mbps|~1.6Mbps|~10kbps| Filtering by tag using relabel_configs uses **100kiB and 23kiB/s per service per job** and quite a lot of CPU. Also sends and additional *1Mbps* of traffic to consul. Being a little bit smarter about this reduces the overhead quite a lot. Limiting the number of `/catalog/services` queries per second almost removes the overhead of service discovery. * consul: tweak `refresh_interval` behavior `refresh_interval` now does what is advertised in the documentation, there won't be more that one update per `refresh_interval`. It now defaults to 30s (which was also the current waitTime in the consul query). This also make sure we don't wait another 30s if we already waited 29s in the blocking call by substracting the number of elapsed seconds. Hopefully this will do what people expect it does and will be safer for existing consul infrastructures.
2018-03-23 07:48:43 -07:00
}
func TestGetDatacenterShouldReturnError(t *testing.T) {
for _, tc := range []struct {
handler func(http.ResponseWriter, *http.Request)
errMessage string
}{
{
// Define a handler that will return status 500.
handler: func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusInternalServerError)
},
errMessage: "Unexpected response code: 500 ()",
},
{
// Define a handler that will return incorrect response.
handler: func(w http.ResponseWriter, r *http.Request) {
w.Write([]byte(`{"Config": {"Not-Datacenter": "test-dc"}}`))
},
errMessage: "invalid value '<nil>' for Config.Datacenter",
},
} {
stub := httptest.NewServer(http.HandlerFunc(tc.handler))
stuburl, err := url.Parse(stub.URL)
require.NoError(t, err)
config := &SDConfig{
Server: stuburl.Host,
Token: "fake-token",
RefreshInterval: model.Duration(1 * time.Second),
}
defer stub.Close()
d := newDiscovery(t, config)
// Should be empty if not initialized.
require.Equal(t, "", d.clientDatacenter)
err = d.getDatacenter()
// An error should be returned.
require.Equal(t, tc.errMessage, err.Error())
// Should still be empty.
require.Equal(t, "", d.clientDatacenter)
}
}
func TestUnmarshalConfig(t *testing.T) {
unmarshal := func(d []byte) func(interface{}) error {
return func(o interface{}) error {
return yaml.Unmarshal(d, o)
}
}
goodConfig := DefaultSDConfig
goodConfig.Username = "123"
goodConfig.Password = "1234"
goodConfig.HTTPClientConfig = config.HTTPClientConfig{
BasicAuth: &config.BasicAuth{
Username: "123",
Password: "1234",
},
FollowRedirects: true,
EnableHTTP2: true,
}
cases := []struct {
name string
config string
expected SDConfig
errMessage string
}{
{
name: "good",
config: `
server: localhost:8500
username: 123
password: 1234
`,
expected: goodConfig,
},
{
name: "username and password and basic auth configured",
config: `
server: localhost:8500
username: 123
password: 1234
basic_auth:
username: 12345
password: 123456
`,
errMessage: "at most one of consul SD configuration username and password and basic auth can be configured",
},
{
name: "token and authorization configured",
config: `
server: localhost:8500
token: 1234567
authorization:
credentials: 12345678
`,
errMessage: "at most one of consul SD token, authorization, or oauth2 can be configured",
},
{
name: "token and oauth2 configured",
config: `
server: localhost:8500
token: 1234567
oauth2:
client_id: 10
client_secret: 11
token_url: http://example.com
`,
errMessage: "at most one of consul SD token, authorization, or oauth2 can be configured",
},
}
for _, test := range cases {
t.Run(test.name, func(t *testing.T) {
var config SDConfig
err := config.UnmarshalYAML(unmarshal([]byte(test.config)))
if err != nil {
require.Equalf(t, err.Error(), test.errMessage, "Expected error '%s', got '%v'", test.errMessage, err)
return
}
if test.errMessage != "" {
t.Errorf("Expected error %s, got none", test.errMessage)
return
}
require.Equal(t, config, test.expected)
})
}
}