mirror of
https://github.com/prometheus/node_exporter.git
synced 2024-11-09 23:24:09 -08:00
3762191e66
This collector is based on the adjtimex(2) system call. The collector returns three values: a status telling whether time is synchronised, the offset to the remote reference, and the local clock frequency adjustment. Values are taken from the kernel's timekeeping data structures to avoid depending on how the synchronisation is implemented. By that I mean one should not care whether time is updated using ntpd, systemd-timesyncd, ptpd, and so on. Since every time sync implementation always ends up telling the kernel what the time status is, one can simply omit the software in between and look at the results of the syncing. As a positive side effect this makes the collector very quick and conceptually specific: it does not monitor the availability of the NTP server, the network in between, DNS resolution, or other unrelated but necessary things. The minimum set of values to keep an eye on are the following three: The node_timex_sync_status tells whether the local clock is in sync with a remote clock. The value is set to zero when synchronisation to a reliable server is lost, or when the time sync software is misconfigured. The node_timex_offset_seconds tells how much the local clock is off when compared to the reference. In case of multiple time references this value is the outcome of the RFC 5905 adjustment algorithm. Ideally the offset should be close to zero, and it depends on the use case how large a value is acceptable. For example, a typical web server is probably fine if the offset is about 0.1 seconds or less, but that would not be good enough for a mobile phone base station operator. The node_timex_freq tells the amount of adjustment to the local clock tick frequency. For example, if the offset is one second and growing, the local clock will need an instruction to tick quicker. The numeric value itself is not very important, and occasional small adjustments are fine. When the frequency is unusually unstable one can assume that time stamps will not be accurate very far into the sub-second range.
Obviously, explaining why the local clock frequency behaves like a passenger on a roller coaster is a different matter. Explanations can vary from system load to environmental issues such as a machine being physically too hot. The rest of the measurements can help when debugging. If you run a clock server you probably want to collect and keep track of everything. Pull-request: https://github.com/prometheus/node_exporter/pull/664
189 lines
5.7 KiB
Go
189 lines
5.7 KiB
Go
// Copyright 2015 The Prometheus Authors
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package main
|
|
|
|
import (
|
|
"fmt"
|
|
"net/http"
|
|
_ "net/http/pprof"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
|
"github.com/prometheus/common/log"
|
|
"github.com/prometheus/common/version"
|
|
"github.com/prometheus/node_exporter/collector"
|
|
"gopkg.in/alecthomas/kingpin.v2"
|
|
)
|
|
|
|
// defaultCollectors is the comma-separated list of collector names that main
// passes through filterAvailableCollectors to build the default value of the
// collectors.enabled flag.
const defaultCollectors = "arp,bcache,conntrack,cpu,diskstats,entropy,edac,exec,filefd,filesystem,hwmon,infiniband,ipvs,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,timex,uname,vmstat,wifi,xfs,zfs"
|
|
|
|
var (
|
|
scrapeDurationDesc = prometheus.NewDesc(
|
|
prometheus.BuildFQName(collector.Namespace, "scrape", "collector_duration_seconds"),
|
|
"node_exporter: Duration of a collector scrape.",
|
|
[]string{"collector"},
|
|
nil,
|
|
)
|
|
scrapeSuccessDesc = prometheus.NewDesc(
|
|
prometheus.BuildFQName(collector.Namespace, "scrape", "collector_success"),
|
|
"node_exporter: Whether a collector succeeded.",
|
|
[]string{"collector"},
|
|
nil,
|
|
)
|
|
)
|
|
|
|
// NodeCollector implements the prometheus.Collector interface.
|
|
type NodeCollector struct {
|
|
collectors map[string]collector.Collector
|
|
}
|
|
|
|
// Describe implements the prometheus.Collector interface.
|
|
func (n NodeCollector) Describe(ch chan<- *prometheus.Desc) {
|
|
ch <- scrapeDurationDesc
|
|
ch <- scrapeSuccessDesc
|
|
}
|
|
|
|
// Collect implements the prometheus.Collector interface.
|
|
func (n NodeCollector) Collect(ch chan<- prometheus.Metric) {
|
|
wg := sync.WaitGroup{}
|
|
wg.Add(len(n.collectors))
|
|
for name, c := range n.collectors {
|
|
go func(name string, c collector.Collector) {
|
|
execute(name, c, ch)
|
|
wg.Done()
|
|
}(name, c)
|
|
}
|
|
wg.Wait()
|
|
}
|
|
|
|
func filterAvailableCollectors(collectors string) string {
|
|
var availableCollectors []string
|
|
for _, c := range strings.Split(collectors, ",") {
|
|
_, ok := collector.Factories[c]
|
|
if ok {
|
|
availableCollectors = append(availableCollectors, c)
|
|
}
|
|
}
|
|
return strings.Join(availableCollectors, ",")
|
|
}
|
|
|
|
func execute(name string, c collector.Collector, ch chan<- prometheus.Metric) {
|
|
begin := time.Now()
|
|
err := c.Update(ch)
|
|
duration := time.Since(begin)
|
|
var success float64
|
|
|
|
if err != nil {
|
|
log.Errorf("ERROR: %s collector failed after %fs: %s", name, duration.Seconds(), err)
|
|
success = 0
|
|
} else {
|
|
log.Debugf("OK: %s collector succeeded after %fs.", name, duration.Seconds())
|
|
success = 1
|
|
}
|
|
ch <- prometheus.MustNewConstMetric(scrapeDurationDesc, prometheus.GaugeValue, duration.Seconds(), name)
|
|
ch <- prometheus.MustNewConstMetric(scrapeSuccessDesc, prometheus.GaugeValue, success, name)
|
|
}
|
|
|
|
func loadCollectors(list string) (map[string]collector.Collector, error) {
|
|
collectors := map[string]collector.Collector{}
|
|
for _, name := range strings.Split(list, ",") {
|
|
fn, ok := collector.Factories[name]
|
|
if !ok {
|
|
return nil, fmt.Errorf("collector '%s' not available", name)
|
|
}
|
|
c, err := fn()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
collectors[name] = c
|
|
}
|
|
return collectors, nil
|
|
}
|
|
|
|
func init() {
|
|
prometheus.MustRegister(version.NewCollector("node_exporter"))
|
|
}
|
|
|
|
func main() {
|
|
var (
|
|
listenAddress = kingpin.Flag("web.listen-address", "Address on which to expose metrics and web interface.").Default(":9100").String()
|
|
metricsPath = kingpin.Flag("web.telemetry-path", "Path under which to expose metrics.").Default("/metrics").String()
|
|
enabledCollectors = kingpin.Flag("collectors.enabled", "Comma-separated list of collectors to use.").Default(filterAvailableCollectors(defaultCollectors)).String()
|
|
printCollectors = kingpin.Flag("collectors.print", "If true, print available collectors and exit.").Bool()
|
|
)
|
|
|
|
log.AddFlags(kingpin.CommandLine)
|
|
kingpin.Version(version.Print("node_exporter"))
|
|
kingpin.HelpFlag.Short('h')
|
|
kingpin.Parse()
|
|
|
|
log.Infoln("Starting node_exporter", version.Info())
|
|
log.Infoln("Build context", version.BuildContext())
|
|
|
|
if *printCollectors {
|
|
collectorNames := make(sort.StringSlice, 0, len(collector.Factories))
|
|
for n := range collector.Factories {
|
|
collectorNames = append(collectorNames, n)
|
|
}
|
|
collectorNames.Sort()
|
|
fmt.Printf("Available collectors:\n")
|
|
for _, n := range collectorNames {
|
|
fmt.Printf(" - %s\n", n)
|
|
}
|
|
return
|
|
}
|
|
collectors, err := loadCollectors(*enabledCollectors)
|
|
if err != nil {
|
|
log.Fatalf("Couldn't load collectors: %s", err)
|
|
}
|
|
|
|
log.Infof("Enabled collectors:")
|
|
for n := range collectors {
|
|
log.Infof(" - %s", n)
|
|
}
|
|
|
|
if err := prometheus.Register(NodeCollector{collectors: collectors}); err != nil {
|
|
log.Fatalf("Couldn't register collector: %s", err)
|
|
}
|
|
handler := promhttp.HandlerFor(prometheus.DefaultGatherer,
|
|
promhttp.HandlerOpts{
|
|
ErrorLog: log.NewErrorLogger(),
|
|
ErrorHandling: promhttp.ContinueOnError,
|
|
})
|
|
|
|
// TODO(ts): Remove deprecated and problematic InstrumentHandler usage.
|
|
http.Handle(*metricsPath, prometheus.InstrumentHandler("prometheus", handler))
|
|
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
|
|
w.Write([]byte(`<html>
|
|
<head><title>Node Exporter</title></head>
|
|
<body>
|
|
<h1>Node Exporter</h1>
|
|
<p><a href="` + *metricsPath + `">Metrics</a></p>
|
|
</body>
|
|
</html>`))
|
|
})
|
|
|
|
log.Infoln("Listening on", *listenAddress)
|
|
err = http.ListenAndServe(*listenAddress, nil)
|
|
if err != nil {
|
|
log.Fatal(err)
|
|
}
|
|
}
|