Added a failure counter to the HTTP service discovery (#10372)

* Added a failure counter to the HTTP service discovery

Signed-off-by: David N Perkins <David.N.Perkins@ibm.com>
This commit is contained in:
David N Perkins 2022-03-08 08:10:45 -05:00 committed by GitHub
parent 025528a5d6
commit 097b359b41
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 49 additions and 3 deletions

View file

@ -28,6 +28,7 @@ import (
"github.com/go-kit/log" "github.com/go-kit/log"
"github.com/grafana/regexp" "github.com/grafana/regexp"
"github.com/pkg/errors" "github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/config" "github.com/prometheus/common/config"
"github.com/prometheus/common/model" "github.com/prometheus/common/model"
"github.com/prometheus/common/version" "github.com/prometheus/common/version"
@ -45,10 +46,17 @@ var (
} }
userAgent = fmt.Sprintf("Prometheus/%s", version.Version) userAgent = fmt.Sprintf("Prometheus/%s", version.Version)
matchContentType = regexp.MustCompile(`^(?i:application\/json(;\s*charset=("utf-8"|utf-8))?)$`) matchContentType = regexp.MustCompile(`^(?i:application\/json(;\s*charset=("utf-8"|utf-8))?)$`)
failuresCount = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "prometheus_sd_http_failures_total",
Help: "Number of HTTP service discovery refresh failures.",
})
) )
func init() { func init() {
discovery.RegisterConfig(&SDConfig{}) discovery.RegisterConfig(&SDConfig{})
prometheus.MustRegister(failuresCount)
} }
// SDConfig is the configuration for HTTP based discovery. // SDConfig is the configuration for HTTP based discovery.
@ -145,6 +153,7 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
resp, err := d.client.Do(req.WithContext(ctx)) resp, err := d.client.Do(req.WithContext(ctx))
if err != nil { if err != nil {
failuresCount.Inc()
return nil, err return nil, err
} }
defer func() { defer func() {
@ -153,26 +162,31 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
}() }()
if resp.StatusCode != http.StatusOK { if resp.StatusCode != http.StatusOK {
failuresCount.Inc()
return nil, errors.Errorf("server returned HTTP status %s", resp.Status) return nil, errors.Errorf("server returned HTTP status %s", resp.Status)
} }
if !matchContentType.MatchString(strings.TrimSpace(resp.Header.Get("Content-Type"))) { if !matchContentType.MatchString(strings.TrimSpace(resp.Header.Get("Content-Type"))) {
failuresCount.Inc()
return nil, errors.Errorf("unsupported content type %q", resp.Header.Get("Content-Type")) return nil, errors.Errorf("unsupported content type %q", resp.Header.Get("Content-Type"))
} }
b, err := ioutil.ReadAll(resp.Body) b, err := ioutil.ReadAll(resp.Body)
if err != nil { if err != nil {
failuresCount.Inc()
return nil, err return nil, err
} }
var targetGroups []*targetgroup.Group var targetGroups []*targetgroup.Group
if err := json.Unmarshal(b, &targetGroups); err != nil { if err := json.Unmarshal(b, &targetGroups); err != nil {
failuresCount.Inc()
return nil, err return nil, err
} }
for i, tg := range targetGroups { for i, tg := range targetGroups {
if tg == nil { if tg == nil {
failuresCount.Inc()
err = errors.New("nil target group item found") err = errors.New("nil target group item found")
return nil, err return nil, err
} }

View file

@ -22,6 +22,8 @@ import (
"time" "time"
"github.com/go-kit/log" "github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/config" "github.com/prometheus/common/config"
"github.com/prometheus/common/model" "github.com/prometheus/common/model"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
@ -61,6 +63,7 @@ func TestHTTPValidRefresh(t *testing.T) {
}, },
} }
require.Equal(t, tgs, expectedTargets) require.Equal(t, tgs, expectedTargets)
require.Equal(t, 0.0, getFailureCount())
} }
func TestHTTPInvalidCode(t *testing.T) { func TestHTTPInvalidCode(t *testing.T) {
@ -82,6 +85,7 @@ func TestHTTPInvalidCode(t *testing.T) {
ctx := context.Background() ctx := context.Background()
_, err = d.refresh(ctx) _, err = d.refresh(ctx)
require.EqualError(t, err, "server returned HTTP status 400 Bad Request") require.EqualError(t, err, "server returned HTTP status 400 Bad Request")
require.Equal(t, 1.0, getFailureCount())
} }
func TestHTTPInvalidFormat(t *testing.T) { func TestHTTPInvalidFormat(t *testing.T) {
@ -103,6 +107,32 @@ func TestHTTPInvalidFormat(t *testing.T) {
ctx := context.Background() ctx := context.Background()
_, err = d.refresh(ctx) _, err = d.refresh(ctx)
require.EqualError(t, err, `unsupported content type "text/plain; charset=utf-8"`) require.EqualError(t, err, `unsupported content type "text/plain; charset=utf-8"`)
require.Equal(t, 1.0, getFailureCount())
}
var lastFailureCount float64
func getFailureCount() float64 {
failureChan := make(chan prometheus.Metric)
go func() {
failuresCount.Collect(failureChan)
close(failureChan)
}()
var counter dto.Metric
for {
metric, ok := <-failureChan
if ok == false {
break
}
metric.Write(&counter)
}
// account for failures in prior tests
count := *counter.Counter.Value - lastFailureCount
lastFailureCount = *counter.Counter.Value
return count
} }
func TestContentTypeRegex(t *testing.T) { func TestContentTypeRegex(t *testing.T) {

View file

@ -1448,8 +1448,9 @@ Example response body:
] ]
``` ```
The endpoint is queried periodically at the specified The endpoint is queried periodically at the specified refresh interval.
refresh interval. The `prometheus_sd_http_failures_total` counter metric tracks the number of
refresh failures.
Each target has a meta label `__meta_url` during the Each target has a meta label `__meta_url` during the
[relabeling phase](#relabel_config). Its value is set to the [relabeling phase](#relabel_config). Its value is set to the

View file

@ -40,7 +40,8 @@ an empty list `[]`. Target lists are unordered.
Prometheus caches target lists. If an error occurs while fetching an updated Prometheus caches target lists. If an error occurs while fetching an updated
targets list, Prometheus keeps using the current targets list. The targets list targets list, Prometheus keeps using the current targets list. The targets list
is not saved across restart. is not saved across restart. The `prometheus_sd_http_failures_total` counter
metric tracks the number of refresh failures.
The whole list of targets must be returned on every scrape. There is no support The whole list of targets must be returned on every scrape. There is no support
for incremental updates. A Prometheus instance does not send its hostname and it for incremental updates. A Prometheus instance does not send its hostname and it