prometheus/retrieval/target.go
Bjoern Rabenstein 1909686789 Make metrics exported by the Prometheus server itself more consistent.
- Always spell out the time unit (e.g. milliseconds instead of ms).

- Remove "_total" from the names of metrics that are not counters.

- Make use of the "Namespace" and "Subsystem" fields in the options.

- Removed the "capacity" facet from all metrics about channels/queues.
  These are all fixed via command line flags and will never change
  during the runtime of a process. Also, they should not be part of
  the same metric family. I have added separate metrics for the
  capacity of queues as convenience. (They will never change and are
  only set once.)

- I left "metric_disk_latency_microseconds" unchanged, although that
  metric measures the latency of the storage device, even if it is not
  a spinning disk. "SSD" is read by many as "solid state disk", so
  it's not too far off. (It should be "solid state drive", of course,
  but "metric_drive_latency_microseconds" is probably confusing.)

- Brian suggested to not mix "failure" and "success" outcome in the
  same metric family (distinguished by labels). For now, I left it as
  it is. We are touching some bigger issue here, especially as other
  parts in the Prometheus ecosystem are following the same
  principle. We still need to come to terms here and then change
  things consistently everywhere.

Change-Id: If799458b450d18f78500f05990301c12525197d3
2014-11-25 17:02:00 +01:00

299 lines
8.4 KiB
Go

// Copyright 2013 Prometheus Team
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package retrieval
import (
"fmt"
"net/http"
"os"
"strings"
"time"
"github.com/golang/glog"
"github.com/prometheus/client_golang/extraction"
"github.com/prometheus/client_golang/prometheus"
clientmodel "github.com/prometheus/client_golang/model"
"github.com/prometheus/prometheus/utility"
)
const (
InstanceLabel clientmodel.LabelName = "instance"
// The metric name for the synthetic health variable.
ScrapeHealthMetricName clientmodel.LabelValue = "up"
// Constants for instrumentation.
namespace = "prometheus"
job = "target_job"
instance = "target_instance"
failure = "failure"
outcome = "outcome"
success = "success"
)
var (
localhostRepresentations = []string{"http://127.0.0.1", "http://localhost"}
targetOperationLatencies = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Namespace: namespace,
Name: "target_operation_latency_milliseconds",
Help: "The latencies for target operations.",
Objectives: []float64{0.01, 0.05, 0.5, 0.90, 0.99},
},
[]string{job, instance, outcome},
)
)
func init() {
prometheus.MustRegister(targetOperationLatencies)
}
// The state of the given Target.
type TargetState int
func (t TargetState) String() string {
switch t {
case UNKNOWN:
return "UNKNOWN"
case ALIVE:
return "ALIVE"
case UNREACHABLE:
return "UNREACHABLE"
}
panic("unknown state")
}
const (
// The Target has not been seen; we know nothing about it, except that it is
// on our docket for examination.
UNKNOWN TargetState = iota
// The Target has been found and successfully queried.
ALIVE
// The Target was either historically found or not found and then determined
// to be unhealthy by either not responding or disappearing.
UNREACHABLE
)
// A Target represents an endpoint that should be interrogated for metrics.
//
// The protocol described by this type will likely change in future iterations,
// as it offers no good support for aggregated targets and fan out. Thusly,
// it is likely that the current Target and target uses will be
// wrapped with some resolver type.
//
// For the future, the Target protocol will abstract away the exact means that
// metrics are retrieved and deserialized from the given instance to which it
// refers.
type Target interface {
// Retrieve values from this target.
Scrape(ingester extraction.Ingester) error
// Return the last encountered scrape error, if any.
LastError() error
// Return the health of the target.
State() TargetState
// Return the last time a scrape was attempted.
LastScrape() time.Time
// The address to which the Target corresponds. Out of all of the available
// points in this interface, this one is the best candidate to change given
// the ways to express the endpoint.
Address() string
// The address as seen from other hosts. References to localhost are resolved
// to the address of the prometheus server.
GlobalAddress() string
// Return the target's base labels.
BaseLabels() clientmodel.LabelSet
// Merge a new externally supplied target definition (e.g. with changed base
// labels) into an old target definition for the same endpoint. Preserve
// remaining information - like health state - from the old target.
Merge(newTarget Target)
}
// target is a Target that refers to a singular HTTP or HTTPS endpoint.
type target struct {
// The current health state of the target.
state TargetState
// The last encountered scrape error, if any.
lastError error
// The last time a scrape was attempted.
lastScrape time.Time
address string
// What is the deadline for the HTTP or HTTPS against this endpoint.
Deadline time.Duration
// Any base labels that are added to this target and its metrics.
baseLabels clientmodel.LabelSet
// The HTTP client used to scrape the target's endpoint.
httpClient *http.Client
}
// Furnish a reasonably configured target for querying.
func NewTarget(address string, deadline time.Duration, baseLabels clientmodel.LabelSet) Target {
target := &target{
address: address,
Deadline: deadline,
baseLabels: baseLabels,
httpClient: utility.NewDeadlineClient(deadline),
}
return target
}
func (t *target) recordScrapeHealth(ingester extraction.Ingester, timestamp clientmodel.Timestamp, healthy bool) {
metric := clientmodel.Metric{}
for label, value := range t.baseLabels {
metric[label] = value
}
metric[clientmodel.MetricNameLabel] = clientmodel.LabelValue(ScrapeHealthMetricName)
metric[InstanceLabel] = clientmodel.LabelValue(t.Address())
healthValue := clientmodel.SampleValue(0)
if healthy {
healthValue = clientmodel.SampleValue(1)
}
sample := &clientmodel.Sample{
Metric: metric,
Timestamp: timestamp,
Value: healthValue,
}
ingester.Ingest(&extraction.Result{
Err: nil,
Samples: clientmodel.Samples{sample},
})
}
func (t *target) Scrape(ingester extraction.Ingester) error {
now := clientmodel.Now()
err := t.scrape(now, ingester)
if err == nil {
t.state = ALIVE
t.recordScrapeHealth(ingester, now, true)
} else {
t.state = UNREACHABLE
t.recordScrapeHealth(ingester, now, false)
}
t.lastScrape = time.Now()
t.lastError = err
return err
}
const acceptHeader = `application/vnd.google.protobuf;proto=io.prometheus.client.MetricFamily;encoding=delimited;q=0.7,text/plain;version=0.0.4;q=0.3,application/json;schema=prometheus/telemetry;version=0.0.2;q=0.2,*/*;q=0.1`
func (t *target) scrape(timestamp clientmodel.Timestamp, ingester extraction.Ingester) (err error) {
defer func(start time.Time) {
ms := float64(time.Since(start)) / float64(time.Millisecond)
labels := prometheus.Labels{
job: string(t.baseLabels[clientmodel.JobLabel]),
instance: t.Address(),
outcome: success,
}
if err != nil {
labels[outcome] = failure
}
targetOperationLatencies.With(labels).Observe(ms)
}(time.Now())
req, err := http.NewRequest("GET", t.Address(), nil)
if err != nil {
panic(err)
}
req.Header.Add("Accept", acceptHeader)
resp, err := t.httpClient.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("server returned HTTP status %s", resp.Status)
}
processor, err := extraction.ProcessorForRequestHeader(resp.Header)
if err != nil {
return err
}
// XXX: This is a wart; we need to handle this more gracefully down the
// road, especially once we have service discovery support.
baseLabels := clientmodel.LabelSet{InstanceLabel: clientmodel.LabelValue(t.Address())}
for baseLabel, baseValue := range t.baseLabels {
baseLabels[baseLabel] = baseValue
}
i := &MergeLabelsIngester{
Labels: baseLabels,
CollisionPrefix: clientmodel.ExporterLabelPrefix,
Ingester: ingester,
}
processOptions := &extraction.ProcessOptions{
Timestamp: timestamp,
}
return processor.ProcessSingle(resp.Body, i, processOptions)
}
func (t *target) LastError() error {
return t.lastError
}
func (t *target) State() TargetState {
return t.state
}
func (t *target) LastScrape() time.Time {
return t.lastScrape
}
func (t *target) Address() string {
return t.address
}
func (t *target) GlobalAddress() string {
address := t.address
hostname, err := os.Hostname()
if err != nil {
glog.Warningf("Couldn't get hostname: %s, returning target.Address()", err)
return address
}
for _, localhostRepresentation := range localhostRepresentations {
address = strings.Replace(address, localhostRepresentation, fmt.Sprintf("http://%s", hostname), -1)
}
return address
}
func (t *target) BaseLabels() clientmodel.LabelSet {
return t.baseLabels
}
// Merge a new externally supplied target definition (e.g. with changed base
// labels) into an old target definition for the same endpoint. Preserve
// remaining information - like health state - from the old target.
func (t *target) Merge(newTarget Target) {
if t.Address() != newTarget.Address() {
panic("targets don't refer to the same endpoint")
}
t.baseLabels = newTarget.BaseLabels()
}
type targets []Target
func (t targets) Len() int {
return len(t)
}