mirror of
https://github.com/prometheus/prometheus.git
synced 2025-01-11 13:57:36 -08:00
Azure SD Failure metric and 404 Handling (#10476)
* For Azure sd, added failure counter and skipping of 404's from Nic lookup Signed-off-by: David N Perkins <David.N.Perkins@ibm.com>
This commit is contained in:
parent
c31a3e4c92
commit
ed0c68224e
|
@ -30,6 +30,7 @@ import (
|
|||
"github.com/go-kit/log"
|
||||
"github.com/go-kit/log/level"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
config_util "github.com/prometheus/common/config"
|
||||
"github.com/prometheus/common/model"
|
||||
"github.com/prometheus/common/version"
|
||||
|
@ -70,10 +71,17 @@ var (
|
|||
AuthenticationMethod: authMethodOAuth,
|
||||
HTTPClientConfig: config_util.DefaultHTTPClientConfig,
|
||||
}
|
||||
|
||||
failuresCount = prometheus.NewCounter(
|
||||
prometheus.CounterOpts{
|
||||
Name: "prometheus_sd_azure_failures_total",
|
||||
Help: "Number of Azure service discovery refresh failures.",
|
||||
})
|
||||
)
|
||||
|
||||
func init() {
|
||||
discovery.RegisterConfig(&SDConfig{})
|
||||
prometheus.MustRegister(failuresCount)
|
||||
}
|
||||
|
||||
// SDConfig is the configuration for Azure based service discovery.
|
||||
|
@ -279,11 +287,13 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
|
|||
|
||||
client, err := createAzureClient(*d.cfg)
|
||||
if err != nil {
|
||||
failuresCount.Inc()
|
||||
return nil, errors.Wrap(err, "could not create Azure client")
|
||||
}
|
||||
|
||||
machines, err := client.getVMs(ctx, d.cfg.ResourceGroup)
|
||||
if err != nil {
|
||||
failuresCount.Inc()
|
||||
return nil, errors.Wrap(err, "could not get virtual machines")
|
||||
}
|
||||
|
||||
|
@ -292,12 +302,14 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
|
|||
// Load the vms managed by scale sets.
|
||||
scaleSets, err := client.getScaleSets(ctx, d.cfg.ResourceGroup)
|
||||
if err != nil {
|
||||
failuresCount.Inc()
|
||||
return nil, errors.Wrap(err, "could not get virtual machine scale sets")
|
||||
}
|
||||
|
||||
for _, scaleSet := range scaleSets {
|
||||
scaleSetVms, err := client.getScaleSetVMs(ctx, scaleSet)
|
||||
if err != nil {
|
||||
failuresCount.Inc()
|
||||
return nil, errors.Wrap(err, "could not get virtual machine scale set vms")
|
||||
}
|
||||
machines = append(machines, scaleSetVms...)
|
||||
|
@ -313,8 +325,8 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
|
|||
var wg sync.WaitGroup
|
||||
wg.Add(len(machines))
|
||||
ch := make(chan target, len(machines))
|
||||
for i, vm := range machines {
|
||||
go func(i int, vm virtualMachine) {
|
||||
for _, vm := range machines {
|
||||
go func(vm virtualMachine) {
|
||||
defer wg.Done()
|
||||
r, err := newAzureResourceFromID(vm.ID, d.logger)
|
||||
if err != nil {
|
||||
|
@ -346,8 +358,11 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
|
|||
for _, nicID := range vm.NetworkInterfaces {
|
||||
networkInterface, err := client.getNetworkInterfaceByID(ctx, nicID)
|
||||
if err != nil {
|
||||
level.Error(d.logger).Log("msg", "Unable to get network interface", "name", nicID, "err", err)
|
||||
if errors.Is(err, errorNotFound) {
|
||||
level.Warn(d.logger).Log("msg", "Network interface does not exist", "name", nicID, "err", err)
|
||||
} else {
|
||||
ch <- target{labelSet: nil, err: err}
|
||||
}
|
||||
// Get out of this routine because we cannot continue without a network interface.
|
||||
return
|
||||
}
|
||||
|
@ -387,7 +402,7 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}(i, vm)
|
||||
}(vm)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
@ -396,6 +411,7 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
|
|||
var tg targetgroup.Group
|
||||
for tgt := range ch {
|
||||
if tgt.err != nil {
|
||||
failuresCount.Inc()
|
||||
return nil, errors.Wrap(tgt.err, "unable to complete Azure service discovery")
|
||||
}
|
||||
if tgt.labelSet != nil {
|
||||
|
@ -561,6 +577,11 @@ func mapFromVMScaleSetVM(vm compute.VirtualMachineScaleSetVM, scaleSetName strin
|
|||
}
|
||||
}
|
||||
|
||||
var errorNotFound = errors.New("network interface does not exist")
|
||||
|
||||
// getNetworkInterfaceByID gets the network interface.
|
||||
// If a 404 is returned from the Azure API, `errorNotFound` is returned.
|
||||
// On all other errors, an autorest.DetailedError is returned.
|
||||
func (client *azureClient) getNetworkInterfaceByID(ctx context.Context, networkInterfaceID string) (*network.Interface, error) {
|
||||
result := network.Interface{}
|
||||
queryParameters := map[string]interface{}{
|
||||
|
@ -585,6 +606,9 @@ func (client *azureClient) getNetworkInterfaceByID(ctx context.Context, networkI
|
|||
|
||||
result, err = client.nic.GetResponder(resp)
|
||||
if err != nil {
|
||||
if resp.StatusCode == http.StatusNotFound {
|
||||
return nil, errorNotFound
|
||||
}
|
||||
return nil, autorest.NewErrorWithError(err, "network.InterfacesClient", "Get", resp, "Failure responding to request")
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue