Azure SD Failure metric and 404 Handling (#10476)

* For Azure sd, added failure counter and skipping of 404's from Nic lookup

Signed-off-by: David N Perkins <David.N.Perkins@ibm.com>
This commit is contained in:
David N Perkins 2022-03-31 08:50:17 -04:00 committed by GitHub
parent c31a3e4c92
commit ed0c68224e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -30,6 +30,7 @@ import (
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
config_util "github.com/prometheus/common/config"
"github.com/prometheus/common/model"
"github.com/prometheus/common/version"
@ -70,10 +71,17 @@ var (
AuthenticationMethod: authMethodOAuth,
HTTPClientConfig: config_util.DefaultHTTPClientConfig,
}
failuresCount = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "prometheus_sd_azure_failures_total",
Help: "Number of Azure service discovery refresh failures.",
})
)
func init() {
discovery.RegisterConfig(&SDConfig{})
prometheus.MustRegister(failuresCount)
}
// SDConfig is the configuration for Azure based service discovery.
@ -279,11 +287,13 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
client, err := createAzureClient(*d.cfg)
if err != nil {
failuresCount.Inc()
return nil, errors.Wrap(err, "could not create Azure client")
}
machines, err := client.getVMs(ctx, d.cfg.ResourceGroup)
if err != nil {
failuresCount.Inc()
return nil, errors.Wrap(err, "could not get virtual machines")
}
@ -292,12 +302,14 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
// Load the vms managed by scale sets.
scaleSets, err := client.getScaleSets(ctx, d.cfg.ResourceGroup)
if err != nil {
failuresCount.Inc()
return nil, errors.Wrap(err, "could not get virtual machine scale sets")
}
for _, scaleSet := range scaleSets {
scaleSetVms, err := client.getScaleSetVMs(ctx, scaleSet)
if err != nil {
failuresCount.Inc()
return nil, errors.Wrap(err, "could not get virtual machine scale set vms")
}
machines = append(machines, scaleSetVms...)
@ -313,8 +325,8 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
var wg sync.WaitGroup
wg.Add(len(machines))
ch := make(chan target, len(machines))
for i, vm := range machines {
go func(i int, vm virtualMachine) {
for _, vm := range machines {
go func(vm virtualMachine) {
defer wg.Done()
r, err := newAzureResourceFromID(vm.ID, d.logger)
if err != nil {
@ -346,8 +358,11 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
for _, nicID := range vm.NetworkInterfaces {
networkInterface, err := client.getNetworkInterfaceByID(ctx, nicID)
if err != nil {
level.Error(d.logger).Log("msg", "Unable to get network interface", "name", nicID, "err", err)
if errors.Is(err, errorNotFound) {
level.Warn(d.logger).Log("msg", "Network interface does not exist", "name", nicID, "err", err)
} else {
ch <- target{labelSet: nil, err: err}
}
// Get out of this routine because we cannot continue without a network interface.
return
}
@ -387,7 +402,7 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
}
}
}
}(i, vm)
}(vm)
}
wg.Wait()
@ -396,6 +411,7 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
var tg targetgroup.Group
for tgt := range ch {
if tgt.err != nil {
failuresCount.Inc()
return nil, errors.Wrap(tgt.err, "unable to complete Azure service discovery")
}
if tgt.labelSet != nil {
@ -561,6 +577,11 @@ func mapFromVMScaleSetVM(vm compute.VirtualMachineScaleSetVM, scaleSetName strin
}
}
var errorNotFound = errors.New("network interface does not exist")
// getNetworkInterfaceByID gets the network interface.
// If a 404 is returned from the Azure API, `errorNotFound` is returned.
// On all other errors, an autorest.DetailedError is returned.
func (client *azureClient) getNetworkInterfaceByID(ctx context.Context, networkInterfaceID string) (*network.Interface, error) {
result := network.Interface{}
queryParameters := map[string]interface{}{
@ -585,6 +606,9 @@ func (client *azureClient) getNetworkInterfaceByID(ctx context.Context, networkI
result, err = client.nic.GetResponder(resp)
if err != nil {
if resp.StatusCode == http.StatusNotFound {
return nil, errorNotFound
}
return nil, autorest.NewErrorWithError(err, "network.InterfacesClient", "Get", resp, "Failure responding to request")
}