prometheus/retrieval/discovery/consul/consul.go
Fabian Reinartz b4d7ce1370 discovery: respect context cancellation everywhere
This also removes closing of the target group channel everywhere
as the contexts cancels across all stages and we don't care about
draining all events once that happened.
2016-11-18 10:55:29 +01:00

307 lines
8.8 KiB
Go

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package consul
import (
"fmt"
"net"
"strconv"
"strings"
"time"
consul "github.com/hashicorp/consul/api"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log"
"github.com/prometheus/common/model"
"golang.org/x/net/context"
"github.com/prometheus/prometheus/config"
)
const (
watchTimeout = 30 * time.Second
retryInterval = 15 * time.Second
// addressLabel is the name for the label containing a target's address.
addressLabel = model.MetaLabelPrefix + "consul_address"
// nodeLabel is the name for the label containing a target's node name.
nodeLabel = model.MetaLabelPrefix + "consul_node"
// tagsLabel is the name of the label containing the tags assigned to the target.
tagsLabel = model.MetaLabelPrefix + "consul_tags"
// serviceLabel is the name of the label containing the service name.
serviceLabel = model.MetaLabelPrefix + "consul_service"
// serviceAddressLabel is the name of the label containing the (optional) service address.
serviceAddressLabel = model.MetaLabelPrefix + "consul_service_address"
//servicePortLabel is the name of the label containing the service port.
servicePortLabel = model.MetaLabelPrefix + "consul_service_port"
// datacenterLabel is the name of the label containing the datacenter ID.
datacenterLabel = model.MetaLabelPrefix + "consul_dc"
// serviceIDLabel is the name of the label containing the service ID.
serviceIDLabel = model.MetaLabelPrefix + "consul_service_id"
// Constants for instrumentation.
namespace = "prometheus"
)
var (
rpcFailuresCount = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: namespace,
Name: "sd_consul_rpc_failures_total",
Help: "The number of Consul RPC call failures.",
})
rpcDuration = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Namespace: namespace,
Name: "sd_consul_rpc_duration_seconds",
Help: "The duration of a Consul RPC call in seconds.",
},
[]string{"endpoint", "call"},
)
)
func init() {
prometheus.MustRegister(rpcFailuresCount)
prometheus.MustRegister(rpcDuration)
// Initialize metric vectors.
rpcDuration.WithLabelValues("catalog", "service")
rpcDuration.WithLabelValues("catalog", "services")
}
// Discovery retrieves target information from a Consul server
// and updates them via watches.
type Discovery struct {
client *consul.Client
clientConf *consul.Config
clientDatacenter string
tagSeparator string
watchedServices []string // Set of services which will be discovered.
}
// NewDiscovery returns a new Discovery for the given config.
func NewDiscovery(conf *config.ConsulSDConfig) (*Discovery, error) {
clientConf := &consul.Config{
Address: conf.Server,
Scheme: conf.Scheme,
Datacenter: conf.Datacenter,
Token: conf.Token,
HttpAuth: &consul.HttpBasicAuth{
Username: conf.Username,
Password: conf.Password,
},
}
client, err := consul.NewClient(clientConf)
if err != nil {
return nil, err
}
cd := &Discovery{
client: client,
clientConf: clientConf,
tagSeparator: conf.TagSeparator,
watchedServices: conf.Services,
clientDatacenter: clientConf.Datacenter,
}
return cd, nil
}
// shouldWatch returns whether the service of the given name should be watched.
func (cd *Discovery) shouldWatch(name string) bool {
// If there's no fixed set of watched services, we watch everything.
if len(cd.watchedServices) == 0 {
return true
}
for _, sn := range cd.watchedServices {
if sn == name {
return true
}
}
return false
}
// Run implements the TargetProvider interface.
func (cd *Discovery) Run(ctx context.Context, ch chan<- []*config.TargetGroup) {
// Watched services and their cancelation functions.
services := map[string]func(){}
var lastIndex uint64
for {
catalog := cd.client.Catalog()
t0 := time.Now()
srvs, meta, err := catalog.Services(&consul.QueryOptions{
WaitIndex: lastIndex,
WaitTime: watchTimeout,
})
rpcDuration.WithLabelValues("catalog", "services").Observe(time.Since(t0).Seconds())
// We have to check the context at least once. The checks during channel sends
// do not guarantee that.
select {
case <-ctx.Done():
return
default:
}
if err != nil {
log.Errorf("Error refreshing service list: %s", err)
rpcFailuresCount.Inc()
time.Sleep(retryInterval)
continue
}
// If the index equals the previous one, the watch timed out with no update.
if meta.LastIndex == lastIndex {
continue
}
lastIndex = meta.LastIndex
// If the datacenter was not set from clientConf, let's get it from the local Consul agent
// (Consul default is to use local node's datacenter if one isn't given for a query).
if cd.clientDatacenter == "" {
info, err := cd.client.Agent().Self()
if err != nil {
log.Errorf("Error retrieving datacenter name: %s", err)
time.Sleep(retryInterval)
continue
}
cd.clientDatacenter = info["Config"]["Datacenter"].(string)
}
// Check for new services.
for name := range srvs {
if !cd.shouldWatch(name) {
continue
}
if _, ok := services[name]; ok {
continue // We are already watching the service.
}
srv := &consulService{
client: cd.client,
name: name,
labels: model.LabelSet{
serviceLabel: model.LabelValue(name),
datacenterLabel: model.LabelValue(cd.clientDatacenter),
},
tagSeparator: cd.tagSeparator,
}
wctx, cancel := context.WithCancel(ctx)
srv.watch(wctx, ch)
services[name] = cancel
}
// Check for removed services.
for name, cancel := range services {
if _, ok := srvs[name]; !ok {
// Call the watch cancelation function.
cancel()
delete(services, name)
// Send clearing target group.
select {
case <-ctx.Done():
return
case ch <- []*config.TargetGroup{{Source: name}}:
}
}
}
}
}
// consulService contains data belonging to the same service.
type consulService struct {
name string
labels model.LabelSet
client *consul.Client
tagSeparator string
}
func (srv *consulService) watch(ctx context.Context, ch chan<- []*config.TargetGroup) {
catalog := srv.client.Catalog()
lastIndex := uint64(0)
for {
t0 := time.Now()
nodes, meta, err := catalog.Service(srv.name, "", &consul.QueryOptions{
WaitIndex: lastIndex,
WaitTime: watchTimeout,
})
rpcDuration.WithLabelValues("catalog", "service").Observe(time.Since(t0).Seconds())
// Check the context before potentially falling in a continue-loop.
select {
case <-ctx.Done():
return
default:
// Continue.
}
if err != nil {
log.Errorf("Error refreshing service %s: %s", srv.name, err)
rpcFailuresCount.Inc()
time.Sleep(retryInterval)
continue
}
// If the index equals the previous one, the watch timed out with no update.
if meta.LastIndex == lastIndex {
continue
}
lastIndex = meta.LastIndex
tgroup := config.TargetGroup{
Source: srv.name,
Labels: srv.labels,
Targets: make([]model.LabelSet, 0, len(nodes)),
}
for _, node := range nodes {
// We surround the separated list with the separator as well. This way regular expressions
// in relabeling rules don't have to consider tag positions.
var tags = srv.tagSeparator + strings.Join(node.ServiceTags, srv.tagSeparator) + srv.tagSeparator
// If the service address is not empty it should be used instead of the node address
// since the service may be registered remotely through a different node
var addr string
if node.ServiceAddress != "" {
addr = net.JoinHostPort(node.ServiceAddress, fmt.Sprintf("%d", node.ServicePort))
} else {
addr = net.JoinHostPort(node.Address, fmt.Sprintf("%d", node.ServicePort))
}
tgroup.Targets = append(tgroup.Targets, model.LabelSet{
model.AddressLabel: model.LabelValue(addr),
addressLabel: model.LabelValue(node.Address),
nodeLabel: model.LabelValue(node.Node),
tagsLabel: model.LabelValue(tags),
serviceAddressLabel: model.LabelValue(node.ServiceAddress),
servicePortLabel: model.LabelValue(strconv.Itoa(node.ServicePort)),
serviceIDLabel: model.LabelValue(node.ServiceID),
})
}
// Check context twice to ensure we always catch cancelation.
select {
case <-ctx.Done():
return
default:
}
select {
case <-ctx.Done():
return
case ch <- []*config.TargetGroup{&tgroup}:
}
}
}