mirror of
https://github.com/prometheus/prometheus.git
synced 2025-01-12 14:27:27 -08:00
Added a failure counter to the HTTP service discovery (#10372)
* Added a failure counter to the http service discovery Signed-off-by: David N Perkins <David.N.Perkins@ibm.com>
This commit is contained in:
parent
025528a5d6
commit
097b359b41
|
@ -28,6 +28,7 @@ import (
|
|||
"github.com/go-kit/log"
|
||||
"github.com/grafana/regexp"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/common/config"
|
||||
"github.com/prometheus/common/model"
|
||||
"github.com/prometheus/common/version"
|
||||
|
@ -45,10 +46,17 @@ var (
|
|||
}
|
||||
userAgent = fmt.Sprintf("Prometheus/%s", version.Version)
|
||||
matchContentType = regexp.MustCompile(`^(?i:application\/json(;\s*charset=("utf-8"|utf-8))?)$`)
|
||||
|
||||
failuresCount = prometheus.NewCounter(
|
||||
prometheus.CounterOpts{
|
||||
Name: "prometheus_sd_http_failures_total",
|
||||
Help: "Number of HTTP service discovery refresh failures.",
|
||||
})
|
||||
)
|
||||
|
||||
func init() {
|
||||
discovery.RegisterConfig(&SDConfig{})
|
||||
prometheus.MustRegister(failuresCount)
|
||||
}
|
||||
|
||||
// SDConfig is the configuration for HTTP based discovery.
|
||||
|
@ -145,6 +153,7 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
|
|||
|
||||
resp, err := d.client.Do(req.WithContext(ctx))
|
||||
if err != nil {
|
||||
failuresCount.Inc()
|
||||
return nil, err
|
||||
}
|
||||
defer func() {
|
||||
|
@ -153,26 +162,31 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
|
|||
}()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
failuresCount.Inc()
|
||||
return nil, errors.Errorf("server returned HTTP status %s", resp.Status)
|
||||
}
|
||||
|
||||
if !matchContentType.MatchString(strings.TrimSpace(resp.Header.Get("Content-Type"))) {
|
||||
failuresCount.Inc()
|
||||
return nil, errors.Errorf("unsupported content type %q", resp.Header.Get("Content-Type"))
|
||||
}
|
||||
|
||||
b, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
failuresCount.Inc()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var targetGroups []*targetgroup.Group
|
||||
|
||||
if err := json.Unmarshal(b, &targetGroups); err != nil {
|
||||
failuresCount.Inc()
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for i, tg := range targetGroups {
|
||||
if tg == nil {
|
||||
failuresCount.Inc()
|
||||
err = errors.New("nil target group item found")
|
||||
return nil, err
|
||||
}
|
||||
|
|
|
@ -22,6 +22,8 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/go-kit/log"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
dto "github.com/prometheus/client_model/go"
|
||||
"github.com/prometheus/common/config"
|
||||
"github.com/prometheus/common/model"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
@ -61,6 +63,7 @@ func TestHTTPValidRefresh(t *testing.T) {
|
|||
},
|
||||
}
|
||||
require.Equal(t, tgs, expectedTargets)
|
||||
require.Equal(t, 0.0, getFailureCount())
|
||||
}
|
||||
|
||||
func TestHTTPInvalidCode(t *testing.T) {
|
||||
|
@ -82,6 +85,7 @@ func TestHTTPInvalidCode(t *testing.T) {
|
|||
ctx := context.Background()
|
||||
_, err = d.refresh(ctx)
|
||||
require.EqualError(t, err, "server returned HTTP status 400 Bad Request")
|
||||
require.Equal(t, 1.0, getFailureCount())
|
||||
}
|
||||
|
||||
func TestHTTPInvalidFormat(t *testing.T) {
|
||||
|
@ -103,6 +107,32 @@ func TestHTTPInvalidFormat(t *testing.T) {
|
|||
ctx := context.Background()
|
||||
_, err = d.refresh(ctx)
|
||||
require.EqualError(t, err, `unsupported content type "text/plain; charset=utf-8"`)
|
||||
require.Equal(t, 1.0, getFailureCount())
|
||||
}
|
||||
|
||||
var lastFailureCount float64
|
||||
|
||||
func getFailureCount() float64 {
|
||||
failureChan := make(chan prometheus.Metric)
|
||||
|
||||
go func() {
|
||||
failuresCount.Collect(failureChan)
|
||||
close(failureChan)
|
||||
}()
|
||||
|
||||
var counter dto.Metric
|
||||
for {
|
||||
metric, ok := <-failureChan
|
||||
if ok == false {
|
||||
break
|
||||
}
|
||||
metric.Write(&counter)
|
||||
}
|
||||
|
||||
// account for failures in prior tests
|
||||
count := *counter.Counter.Value - lastFailureCount
|
||||
lastFailureCount = *counter.Counter.Value
|
||||
return count
|
||||
}
|
||||
|
||||
func TestContentTypeRegex(t *testing.T) {
|
||||
|
|
|
@ -1448,8 +1448,9 @@ Example response body:
|
|||
]
|
||||
```
|
||||
|
||||
The endpoint is queried periodically at the specified
|
||||
refresh interval.
|
||||
The endpoint is queried periodically at the specified refresh interval.
|
||||
The `prometheus_sd_http_failures_total` counter metric tracks the number of
|
||||
refresh failures.
|
||||
|
||||
Each target has a meta label `__meta_url` during the
|
||||
[relabeling phase](#relabel_config). Its value is set to the
|
||||
|
|
|
@ -40,7 +40,8 @@ an empty list `[]`. Target lists are unordered.
|
|||
|
||||
Prometheus caches target lists. If an error occurs while fetching an updated
|
||||
targets list, Prometheus keeps using the current targets list. The targets list
|
||||
is not saved across restart.
|
||||
is not saved across restart. The `prometheus_sd_http_failures_total` counter
|
||||
metric tracks the number of refresh failures.
|
||||
|
||||
The whole list of targets must be returned on every scrape. There is no support
|
||||
for incremental updates. A Prometheus instance does not send its hostname and it
|
||||
|
|
Loading…
Reference in a new issue