diff --git a/discovery/http/http.go b/discovery/http/http.go index 6dd949e47..217abb192 100644 --- a/discovery/http/http.go +++ b/discovery/http/http.go @@ -28,6 +28,7 @@ import ( "github.com/go-kit/log" "github.com/grafana/regexp" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/config" "github.com/prometheus/common/model" "github.com/prometheus/common/version" @@ -45,10 +46,17 @@ var ( } userAgent = fmt.Sprintf("Prometheus/%s", version.Version) matchContentType = regexp.MustCompile(`^(?i:application\/json(;\s*charset=("utf-8"|utf-8))?)$`) + + failuresCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_sd_http_failures_total", + Help: "Number of HTTP service discovery refresh failures.", + }) ) func init() { discovery.RegisterConfig(&SDConfig{}) + prometheus.MustRegister(failuresCount) } // SDConfig is the configuration for HTTP based discovery. @@ -145,6 +153,7 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) { resp, err := d.client.Do(req.WithContext(ctx)) if err != nil { + failuresCount.Inc() return nil, err } defer func() { @@ -153,26 +162,31 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) { }() if resp.StatusCode != http.StatusOK { + failuresCount.Inc() return nil, errors.Errorf("server returned HTTP status %s", resp.Status) } if !matchContentType.MatchString(strings.TrimSpace(resp.Header.Get("Content-Type"))) { + failuresCount.Inc() return nil, errors.Errorf("unsupported content type %q", resp.Header.Get("Content-Type")) } b, err := ioutil.ReadAll(resp.Body) if err != nil { + failuresCount.Inc() return nil, err } var targetGroups []*targetgroup.Group if err := json.Unmarshal(b, &targetGroups); err != nil { + failuresCount.Inc() return nil, err } for i, tg := range targetGroups { if tg == nil { + failuresCount.Inc() err = errors.New("nil target group item found") return nil, err } diff --git 
a/discovery/http/http_test.go b/discovery/http/http_test.go
index 896eec1be..1ab034dfc 100644
--- a/discovery/http/http_test.go
+++ b/discovery/http/http_test.go
@@ -22,6 +22,8 @@ import (
 	"time"
 
 	"github.com/go-kit/log"
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
 	"github.com/prometheus/common/config"
 	"github.com/prometheus/common/model"
 	"github.com/stretchr/testify/require"
@@ -61,6 +63,7 @@ func TestHTTPValidRefresh(t *testing.T) {
 		},
 	}
 	require.Equal(t, tgs, expectedTargets)
+	require.Equal(t, 0.0, getFailureCount())
 }
 
 func TestHTTPInvalidCode(t *testing.T) {
@@ -82,6 +85,7 @@ func TestHTTPInvalidCode(t *testing.T) {
 	ctx := context.Background()
 	_, err = d.refresh(ctx)
 	require.EqualError(t, err, "server returned HTTP status 400 Bad Request")
+	require.Equal(t, 1.0, getFailureCount())
 }
 
 func TestHTTPInvalidFormat(t *testing.T) {
@@ -103,6 +107,37 @@ func TestHTTPInvalidFormat(t *testing.T) {
 	ctx := context.Background()
 	_, err = d.refresh(ctx)
 	require.EqualError(t, err, `unsupported content type "text/plain; charset=utf-8"`)
+	require.Equal(t, 1.0, getFailureCount())
+}
+
+// lastFailureCount holds the failuresCount total observed by the previous
+// getFailureCount call.
+var lastFailureCount float64
+
+// getFailureCount reports how much failuresCount has grown since the
+// previous call. failuresCount is a package-level counter, so its value
+// accumulates across tests; returning the delta lets each test assert only
+// on the failures it caused itself.
+func getFailureCount() float64 {
+	failureChan := make(chan prometheus.Metric)
+
+	go func() {
+		failuresCount.Collect(failureChan)
+		close(failureChan)
+	}()
+
+	var counter dto.Metric
+	for metric := range failureChan {
+		if err := metric.Write(&counter); err != nil {
+			// A failed write would leave counter unpopulated; fail loudly
+			// instead of reporting a misleading count.
+			panic(err)
+		}
+	}
+
+	// account for failures in prior tests
+	total := counter.GetCounter().GetValue()
+	count := total - lastFailureCount
+	lastFailureCount = total
+	return count
 }
 
 func TestContentTypeRegex(t *testing.T) {
diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md
index 4d730a6b3..2f8ec2751 100644
--- a/docs/configuration/configuration.md
+++ b/docs/configuration/configuration.md
@@ -1448,8 +1448,9 @@ Example response body:
 ]
 ```
 
-The endpoint is queried periodically at the specified
-refresh interval.
+The endpoint is queried periodically at the specified refresh interval. +The `prometheus_sd_http_failures_total` counter metric tracks the number of +refresh failures. Each target has a meta label `__meta_url` during the [relabeling phase](#relabel_config). Its value is set to the diff --git a/docs/http_sd.md b/docs/http_sd.md index ab88886cc..884deb9f3 100644 --- a/docs/http_sd.md +++ b/docs/http_sd.md @@ -40,7 +40,8 @@ an empty list `[]`. Target lists are unordered. Prometheus caches target lists. If an error occurs while fetching an updated targets list, Prometheus keeps using the current targets list. The targets list -is not saved across restart. +is not saved across restart. The `prometheus_sd_http_failures_total` counter +metric tracks the number of refresh failures. The whole list of targets must be returned on every scrape. There is no support for incremental updates. A Prometheus instance does not send its hostname and it