prometheus/retrieval/discovery/azure.go
2016-10-20 08:23:50 +01:00

278 lines
8.5 KiB
Go

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package discovery
import (
"fmt"
"net"
"strings"
"time"
"github.com/Azure/azure-sdk-for-go/arm/compute"
"github.com/Azure/azure-sdk-for-go/arm/network"
"github.com/Azure/go-autorest/autorest/azure"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log"
"github.com/prometheus/common/model"
"golang.org/x/net/context"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/util/strutil"
)
const (
azureLabel = model.MetaLabelPrefix + "azure_"
azureLabelMachineID = azureLabel + "machine_id"
azureLabelMachineResourceGroup = azureLabel + "machine_resource_group"
azureLabelMachineName = azureLabel + "machine_name"
azureLabelMachineLocation = azureLabel + "machine_location"
azureLabelMachinePrivateIP = azureLabel + "machine_private_ip"
azureLabelMachineTag = azureLabel + "machine_tag_"
)
var (
azureSDScrapeFailuresCount = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: namespace,
Name: "azure_sd_scrape_failures_total",
Help: "Number of Azure-SD scrape failures.",
})
azureSDScrapeDuration = prometheus.NewSummary(
prometheus.SummaryOpts{
Namespace: namespace,
Name: "azure_sd_scrape_duration_seconds",
Help: "The duration of a Azure-SD scrape in seconds.",
})
)
func init() {
prometheus.MustRegister(azureSDScrapeDuration)
prometheus.MustRegister(azureSDScrapeFailuresCount)
}
// AzureDiscovery periodically performs Azure-SD requests. It implements
// the TargetProvider interface.
type AzureDiscovery struct {
cfg *config.AzureSDConfig
interval time.Duration
port int
}
// NewAzureDiscovery returns a new AzureDiscovery which periodically refreshes its targets.
func NewAzureDiscovery(cfg *config.AzureSDConfig) *AzureDiscovery {
return &AzureDiscovery{
cfg: cfg,
interval: time.Duration(cfg.RefreshInterval),
port: cfg.Port,
}
}
// Run implements the TargetProvider interface.
func (ad *AzureDiscovery) Run(ctx context.Context, ch chan<- []*config.TargetGroup) {
defer close(ch)
ticker := time.NewTicker(ad.interval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
default:
}
tg, err := ad.refresh()
if err != nil {
log.Errorf("unable to refresh during Azure discovery: %s", err)
} else {
ch <- []*config.TargetGroup{tg}
}
select {
case <-ticker.C:
case <-ctx.Done():
return
}
}
}
// azureClient represents multiple Azure Resource Manager providers.
type azureClient struct {
nic network.InterfacesClient
vm compute.VirtualMachinesClient
}
// createAzureClient is a helper function for creating an Azure compute client to ARM.
func createAzureClient(cfg config.AzureSDConfig) (azureClient, error) {
var c azureClient
oauthConfig, err := azure.PublicCloud.OAuthConfigForTenant(cfg.TenantID)
if err != nil {
return azureClient{}, err
}
spt, err := azure.NewServicePrincipalToken(*oauthConfig, cfg.ClientID, cfg.ClientSecret, azure.PublicCloud.ResourceManagerEndpoint)
if err != nil {
return azureClient{}, err
}
c.vm = compute.NewVirtualMachinesClient(cfg.SubscriptionID)
c.vm.Authorizer = spt
c.nic = network.NewInterfacesClient(cfg.SubscriptionID)
c.nic.Authorizer = spt
return c, nil
}
// azureResource represents a resource identifier in Azure.
type azureResource struct {
Name string
ResourceGroup string
}
// Create a new azureResource object from an ID string.
func newAzureResourceFromID(id string) (azureResource, error) {
// Resource IDs have the following format.
// /subscriptions/SUBSCRIPTION_ID/resourceGroups/RESOURCE_GROUP/providers/PROVIDER/TYPE/NAME
s := strings.Split(id, "/")
if len(s) != 9 {
err := fmt.Errorf("invalid ID '%s'. Refusing to create azureResource", id)
log.Error(err)
return azureResource{}, err
}
return azureResource{
Name: strings.ToLower(s[8]),
ResourceGroup: strings.ToLower(s[4]),
}, nil
}
func (ad *AzureDiscovery) refresh() (tg *config.TargetGroup, err error) {
t0 := time.Now()
defer func() {
azureSDScrapeDuration.Observe(time.Since(t0).Seconds())
if err != nil {
azureSDScrapeFailuresCount.Inc()
}
}()
tg = &config.TargetGroup{}
client, err := createAzureClient(*ad.cfg)
if err != nil {
return tg, fmt.Errorf("could not create Azure client: %s", err)
}
var machines []compute.VirtualMachine
result, err := client.vm.ListAll()
if err != nil {
return tg, fmt.Errorf("could not list virtual machines: %s", err)
}
machines = append(machines, *result.Value...)
// If we still have results, keep going until we have no more.
for result.NextLink != nil {
result, err = client.vm.ListAllNextResults(result)
if err != nil {
return tg, fmt.Errorf("could not list virtual machines: %s", err)
}
machines = append(machines, *result.Value...)
}
log.Debugf("Found %d virtual machines during Azure discovery.", len(machines))
// We have the slice of machines. Now turn them into targets.
// Doing them in go routines because the network interface calls are slow.
type target struct {
labelSet model.LabelSet
err error
}
ch := make(chan target, len(machines))
for i, vm := range machines {
go func(i int, vm compute.VirtualMachine) {
r, err := newAzureResourceFromID(*vm.ID)
if err != nil {
ch <- target{labelSet: nil, err: err}
return
}
labels := model.LabelSet{
azureLabelMachineID: model.LabelValue(*vm.ID),
azureLabelMachineName: model.LabelValue(*vm.Name),
azureLabelMachineLocation: model.LabelValue(*vm.Location),
azureLabelMachineResourceGroup: model.LabelValue(r.ResourceGroup),
}
if vm.Tags != nil {
for k, v := range *vm.Tags {
name := strutil.SanitizeLabelName(k)
labels[azureLabelMachineTag+model.LabelName(name)] = model.LabelValue(*v)
}
}
// Get the IP address information via separate call to the network provider.
for _, nic := range *vm.Properties.NetworkProfile.NetworkInterfaces {
r, err := newAzureResourceFromID(*nic.ID)
if err != nil {
ch <- target{labelSet: nil, err: err}
return
}
networkInterface, err := client.nic.Get(r.ResourceGroup, r.Name, "")
if err != nil {
log.Errorf("Unable to get network interface %s: %s", r.Name, err)
ch <- target{labelSet: nil, err: err}
// Get out of this routine because we cannot continue without a network interface.
return
}
// Unfortunately Azure does not return information on whether a VM is deallocated.
// This information is available via another API call however the Go SDK does not
// yet support this. On deallocated machines, this value happens to be nil so it
// is a cheap and easy way to determine if a machine is allocated or not.
if networkInterface.Properties.Primary == nil {
log.Debugf("Virtual machine %s is deallocated. Skipping during Azure SD.", *vm.Name)
ch <- target{}
return
}
if *networkInterface.Properties.Primary {
for _, ip := range *networkInterface.Properties.IPConfigurations {
if ip.Properties.PrivateIPAddress != nil {
labels[azureLabelMachinePrivateIP] = model.LabelValue(*ip.Properties.PrivateIPAddress)
address := net.JoinHostPort(*ip.Properties.PrivateIPAddress, fmt.Sprintf("%d", ad.port))
labels[model.AddressLabel] = model.LabelValue(address)
ch <- target{labelSet: labels, err: nil}
return
}
// If we made it here, we don't have a private IP which should be impossible.
// Return an empty target and error to ensure an all or nothing situation.
err = fmt.Errorf("unable to find a private IP for VM %s", *vm.Name)
ch <- target{labelSet: nil, err: err}
return
}
}
}
}(i, vm)
}
for range machines {
tgt := <-ch
if tgt.err != nil {
return nil, fmt.Errorf("unable to complete Azure service discovery: %s", err)
}
if tgt.labelSet != nil {
tg.Targets = append(tg.Targets, tgt.labelSet)
}
}
log.Debugf("Azure discovery completed.")
return tg, nil
}