mirror of
https://github.com/prometheus/node_exporter.git
synced 2024-12-28 23:19:46 -08:00
40dce45d8d
Adds a new label called "type" to systemd_unit_state, which contains the Type field from the unit file. This applies only to the .service and .mount unit types; the other unit types do not include the optional Type field. Fixes #1210 Signed-off-by: Paul Gier <pgier@redhat.com>
423 lines
14 KiB
Go
423 lines
14 KiB
Go
// Copyright 2015 The Prometheus Authors
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// +build !nosystemd
|
|
|
|
package collector
|
|
|
|
import (
|
|
"fmt"
|
|
"math"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"github.com/coreos/go-systemd/dbus"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/common/log"
|
|
kingpin "gopkg.in/alecthomas/kingpin.v2"
|
|
)
|
|
|
|
var (
|
|
unitWhitelist = kingpin.Flag("collector.systemd.unit-whitelist", "Regexp of systemd units to whitelist. Units must both match whitelist and not match blacklist to be included.").Default(".+").String()
|
|
unitBlacklist = kingpin.Flag("collector.systemd.unit-blacklist", "Regexp of systemd units to blacklist. Units must both match whitelist and not match blacklist to be included.").Default(".+\\.scope").String()
|
|
systemdPrivate = kingpin.Flag("collector.systemd.private", "Establish a private, direct connection to systemd without dbus.").Bool()
|
|
)
|
|
|
|
type systemdCollector struct {
|
|
unitDesc *prometheus.Desc
|
|
unitStartTimeDesc *prometheus.Desc
|
|
unitTasksCurrentDesc *prometheus.Desc
|
|
unitTasksMaxDesc *prometheus.Desc
|
|
systemRunningDesc *prometheus.Desc
|
|
summaryDesc *prometheus.Desc
|
|
nRestartsDesc *prometheus.Desc
|
|
timerLastTriggerDesc *prometheus.Desc
|
|
socketAcceptedConnectionsDesc *prometheus.Desc
|
|
socketCurrentConnectionsDesc *prometheus.Desc
|
|
socketRefusedConnectionsDesc *prometheus.Desc
|
|
unitWhitelistPattern *regexp.Regexp
|
|
unitBlacklistPattern *regexp.Regexp
|
|
}
|
|
|
|
// unitStatesName lists the unit active states that are reported, both as
// values of the "state" label and as keys of the unit summary.
var unitStatesName = []string{
	"active",
	"activating",
	"deactivating",
	"inactive",
	"failed",
}
|
|
|
|
func init() {
|
|
registerCollector("systemd", defaultDisabled, NewSystemdCollector)
|
|
}
|
|
|
|
// NewSystemdCollector returns a new Collector exposing systemd statistics.
|
|
func NewSystemdCollector() (Collector, error) {
|
|
const subsystem = "systemd"
|
|
|
|
unitDesc := prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, subsystem, "unit_state"),
|
|
"Systemd unit", []string{"name", "state", "type"}, nil,
|
|
)
|
|
unitStartTimeDesc := prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, subsystem, "unit_start_time_seconds"),
|
|
"Start time of the unit since unix epoch in seconds.", []string{"name"}, nil,
|
|
)
|
|
unitTasksCurrentDesc := prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, subsystem, "unit_tasks_current"),
|
|
"Current number of tasks per Systemd unit", []string{"name"}, nil,
|
|
)
|
|
unitTasksMaxDesc := prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, subsystem, "unit_tasks_max"),
|
|
"Maximum number of tasks per Systemd unit", []string{"name"}, nil,
|
|
)
|
|
systemRunningDesc := prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, subsystem, "system_running"),
|
|
"Whether the system is operational (see 'systemctl is-system-running')",
|
|
nil, nil,
|
|
)
|
|
summaryDesc := prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, subsystem, "units"),
|
|
"Summary of systemd unit states", []string{"state"}, nil)
|
|
nRestartsDesc := prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, subsystem, "service_restart_total"),
|
|
"Service unit count of Restart triggers", []string{"state"}, nil)
|
|
timerLastTriggerDesc := prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, subsystem, "timer_last_trigger_seconds"),
|
|
"Seconds since epoch of last trigger.", []string{"name"}, nil)
|
|
socketAcceptedConnectionsDesc := prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, subsystem, "socket_accepted_connections_total"),
|
|
"Total number of accepted socket connections", []string{"name"}, nil)
|
|
socketCurrentConnectionsDesc := prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, subsystem, "socket_current_connections"),
|
|
"Current number of socket connections", []string{"name"}, nil)
|
|
socketRefusedConnectionsDesc := prometheus.NewDesc(
|
|
prometheus.BuildFQName(namespace, subsystem, "socket_refused_connections_total"),
|
|
"Total number of refused socket connections", []string{"name"}, nil)
|
|
unitWhitelistPattern := regexp.MustCompile(fmt.Sprintf("^(?:%s)$", *unitWhitelist))
|
|
unitBlacklistPattern := regexp.MustCompile(fmt.Sprintf("^(?:%s)$", *unitBlacklist))
|
|
|
|
return &systemdCollector{
|
|
unitDesc: unitDesc,
|
|
unitStartTimeDesc: unitStartTimeDesc,
|
|
unitTasksCurrentDesc: unitTasksCurrentDesc,
|
|
unitTasksMaxDesc: unitTasksMaxDesc,
|
|
systemRunningDesc: systemRunningDesc,
|
|
summaryDesc: summaryDesc,
|
|
nRestartsDesc: nRestartsDesc,
|
|
timerLastTriggerDesc: timerLastTriggerDesc,
|
|
socketAcceptedConnectionsDesc: socketAcceptedConnectionsDesc,
|
|
socketCurrentConnectionsDesc: socketCurrentConnectionsDesc,
|
|
socketRefusedConnectionsDesc: socketRefusedConnectionsDesc,
|
|
unitWhitelistPattern: unitWhitelistPattern,
|
|
unitBlacklistPattern: unitBlacklistPattern,
|
|
}, nil
|
|
}
|
|
|
|
func (c *systemdCollector) Update(ch chan<- prometheus.Metric) error {
|
|
allUnits, err := c.getAllUnits()
|
|
if err != nil {
|
|
return fmt.Errorf("couldn't get units: %s", err)
|
|
}
|
|
|
|
summary := summarizeUnits(allUnits)
|
|
c.collectSummaryMetrics(ch, summary)
|
|
|
|
units := filterUnits(allUnits, c.unitWhitelistPattern, c.unitBlacklistPattern)
|
|
c.collectUnitStatusMetrics(ch, units)
|
|
c.collectUnitStartTimeMetrics(ch, units)
|
|
c.collectUnitTasksCurrentMetrics(ch, units)
|
|
c.collectUnitTasksMaxMetrics(ch, units)
|
|
c.collectTimers(ch, units)
|
|
c.collectSockets(ch, units)
|
|
|
|
systemState, err := c.getSystemState()
|
|
if err != nil {
|
|
return fmt.Errorf("couldn't get system state: %s", err)
|
|
}
|
|
c.collectSystemState(ch, systemState)
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *systemdCollector) collectUnitStatusMetrics(ch chan<- prometheus.Metric, units []unit) {
|
|
for _, unit := range units {
|
|
for _, stateName := range unitStatesName {
|
|
isActive := 0.0
|
|
if stateName == unit.ActiveState {
|
|
isActive = 1.0
|
|
}
|
|
ch <- prometheus.MustNewConstMetric(
|
|
c.unitDesc, prometheus.GaugeValue, isActive,
|
|
unit.Name, stateName, unit.serviceType)
|
|
}
|
|
if strings.HasSuffix(unit.Name, ".service") && unit.nRestarts != nil {
|
|
ch <- prometheus.MustNewConstMetric(
|
|
c.nRestartsDesc, prometheus.CounterValue,
|
|
float64(*unit.nRestarts), unit.Name)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (c *systemdCollector) collectSockets(ch chan<- prometheus.Metric, units []unit) {
|
|
for _, unit := range units {
|
|
if !strings.HasSuffix(unit.Name, ".socket") {
|
|
continue
|
|
}
|
|
|
|
ch <- prometheus.MustNewConstMetric(
|
|
c.socketAcceptedConnectionsDesc, prometheus.CounterValue,
|
|
float64(unit.acceptedConnections), unit.Name)
|
|
ch <- prometheus.MustNewConstMetric(
|
|
c.socketCurrentConnectionsDesc, prometheus.GaugeValue,
|
|
float64(unit.currentConnections), unit.Name)
|
|
if unit.refusedConnections != nil {
|
|
ch <- prometheus.MustNewConstMetric(
|
|
c.socketRefusedConnectionsDesc, prometheus.GaugeValue,
|
|
float64(*unit.refusedConnections), unit.Name)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (c *systemdCollector) collectUnitStartTimeMetrics(ch chan<- prometheus.Metric, units []unit) {
|
|
for _, unit := range units {
|
|
ch <- prometheus.MustNewConstMetric(
|
|
c.unitStartTimeDesc, prometheus.GaugeValue,
|
|
float64(unit.startTimeUsec)/1e6, unit.Name)
|
|
}
|
|
}
|
|
|
|
func (c *systemdCollector) collectUnitTasksCurrentMetrics(ch chan<- prometheus.Metric, units []unit) {
|
|
for _, unit := range units {
|
|
if unit.tasksCurrent != nil {
|
|
ch <- prometheus.MustNewConstMetric(
|
|
c.unitTasksCurrentDesc, prometheus.GaugeValue,
|
|
float64(*unit.tasksCurrent), unit.Name)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (c *systemdCollector) collectUnitTasksMaxMetrics(ch chan<- prometheus.Metric, units []unit) {
|
|
for _, unit := range units {
|
|
if unit.tasksMax != nil {
|
|
ch <- prometheus.MustNewConstMetric(
|
|
c.unitTasksMaxDesc, prometheus.GaugeValue,
|
|
float64(*unit.tasksMax), unit.Name)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (c *systemdCollector) collectTimers(ch chan<- prometheus.Metric, units []unit) {
|
|
for _, unit := range units {
|
|
if !strings.HasSuffix(unit.Name, ".timer") {
|
|
continue
|
|
}
|
|
|
|
ch <- prometheus.MustNewConstMetric(
|
|
c.timerLastTriggerDesc, prometheus.GaugeValue,
|
|
float64(unit.lastTriggerUsec)/1e6, unit.Name)
|
|
}
|
|
}
|
|
|
|
func (c *systemdCollector) collectSummaryMetrics(ch chan<- prometheus.Metric, summary map[string]float64) {
|
|
for stateName, count := range summary {
|
|
ch <- prometheus.MustNewConstMetric(
|
|
c.summaryDesc, prometheus.GaugeValue, count, stateName)
|
|
}
|
|
}
|
|
|
|
func (c *systemdCollector) collectSystemState(ch chan<- prometheus.Metric, systemState string) {
|
|
isSystemRunning := 0.0
|
|
if systemState == `"running"` {
|
|
isSystemRunning = 1.0
|
|
}
|
|
ch <- prometheus.MustNewConstMetric(c.systemRunningDesc, prometheus.GaugeValue, isSystemRunning)
|
|
}
|
|
|
|
func (c *systemdCollector) newDbus() (*dbus.Conn, error) {
|
|
if *systemdPrivate {
|
|
return dbus.NewSystemdConnection()
|
|
}
|
|
return dbus.New()
|
|
}
|
|
|
|
type unit struct {
|
|
dbus.UnitStatus
|
|
lastTriggerUsec uint64
|
|
startTimeUsec uint64
|
|
tasksCurrent *uint64
|
|
tasksMax *uint64
|
|
nRestarts *uint32
|
|
serviceType string
|
|
acceptedConnections uint32
|
|
currentConnections uint32
|
|
refusedConnections *uint32
|
|
}
|
|
|
|
// unitType gets the suffix after the last "." in the
|
|
// unit name and capitalizes the first letter
|
|
func (u *unit) unitType() string {
|
|
suffixIndex := strings.LastIndex(u.Name, ".") + 1
|
|
if suffixIndex < 1 || suffixIndex > len(u.Name) {
|
|
return ""
|
|
}
|
|
return strings.Title(u.Name[suffixIndex:])
|
|
}
|
|
|
|
func (c *systemdCollector) getAllUnits() ([]unit, error) {
|
|
conn, err := c.newDbus()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("couldn't get dbus connection: %s", err)
|
|
}
|
|
defer conn.Close()
|
|
|
|
// Filter out any units that are not installed and are pulled in only as dependencies.
|
|
allUnits, err := conn.ListUnits()
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
result := make([]unit, 0, len(allUnits))
|
|
for _, status := range allUnits {
|
|
unit := unit{
|
|
UnitStatus: status,
|
|
}
|
|
unitType := unit.unitType()
|
|
if unitType == "Service" || unitType == "Mount" {
|
|
serviceType, err := conn.GetUnitTypeProperty(unit.Name, unitType, "Type")
|
|
if err != nil {
|
|
log.Debugf("couldn't get type for unit '%s': %s", unit.Name, err)
|
|
} else {
|
|
unit.serviceType = serviceType.Value.Value().(string)
|
|
}
|
|
}
|
|
if strings.HasSuffix(unit.Name, ".timer") {
|
|
lastTriggerValue, err := conn.GetUnitTypeProperty(unit.Name, "Timer", "LastTriggerUSec")
|
|
if err != nil {
|
|
log.Debugf("couldn't get unit '%s' LastTriggerUSec: %s", unit.Name, err)
|
|
continue
|
|
}
|
|
|
|
unit.lastTriggerUsec = lastTriggerValue.Value.Value().(uint64)
|
|
}
|
|
if strings.HasSuffix(unit.Name, ".service") {
|
|
// NRestarts wasn't added until systemd 235.
|
|
restartsCount, err := conn.GetUnitTypeProperty(unit.Name, "Service", "NRestarts")
|
|
if err != nil {
|
|
log.Debugf("couldn't get unit '%s' NRestarts: %s", unit.Name, err)
|
|
} else {
|
|
nRestarts := restartsCount.Value.Value().(uint32)
|
|
unit.nRestarts = &nRestarts
|
|
}
|
|
|
|
tasksCurrentCount, err := conn.GetUnitTypeProperty(unit.Name, "Service", "TasksCurrent")
|
|
if err != nil {
|
|
log.Debugf("couldn't get unit '%s' TasksCurrent: %s", unit.Name, err)
|
|
} else {
|
|
val := tasksCurrentCount.Value.Value().(uint64)
|
|
// Don't set if tasksCurrent if dbus reports MaxUint64.
|
|
if val != math.MaxUint64 {
|
|
unit.tasksCurrent = &val
|
|
}
|
|
}
|
|
|
|
tasksMaxCount, err := conn.GetUnitTypeProperty(unit.Name, "Service", "TasksMax")
|
|
if err != nil {
|
|
log.Debugf("couldn't get unit '%s' TasksMax: %s", unit.Name, err)
|
|
} else {
|
|
val := tasksMaxCount.Value.Value().(uint64)
|
|
// Don't set if tasksMax if dbus reports MaxUint64.
|
|
if val != math.MaxUint64 {
|
|
unit.tasksMax = &val
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
if strings.HasSuffix(unit.Name, ".socket") {
|
|
acceptedConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NAccepted")
|
|
if err != nil {
|
|
log.Debugf("couldn't get unit '%s' NAccepted: %s", unit.Name, err)
|
|
continue
|
|
}
|
|
|
|
unit.acceptedConnections = acceptedConnectionCount.Value.Value().(uint32)
|
|
|
|
currentConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NConnections")
|
|
if err != nil {
|
|
log.Debugf("couldn't get unit '%s' NConnections: %s", unit.Name, err)
|
|
continue
|
|
}
|
|
unit.currentConnections = currentConnectionCount.Value.Value().(uint32)
|
|
|
|
// NRefused wasn't added until systemd 239.
|
|
refusedConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NRefused")
|
|
if err != nil {
|
|
log.Debugf("couldn't get unit '%s' NRefused: %s", unit.Name, err)
|
|
} else {
|
|
nRefused := refusedConnectionCount.Value.Value().(uint32)
|
|
unit.refusedConnections = &nRefused
|
|
}
|
|
}
|
|
|
|
if unit.ActiveState != "active" {
|
|
unit.startTimeUsec = 0
|
|
} else {
|
|
timestampValue, err := conn.GetUnitProperty(unit.Name, "ActiveEnterTimestamp")
|
|
if err != nil {
|
|
log.Debugf("couldn't get unit '%s' StartTimeUsec: %s", unit.Name, err)
|
|
continue
|
|
}
|
|
|
|
unit.startTimeUsec = timestampValue.Value.Value().(uint64)
|
|
}
|
|
|
|
result = append(result, unit)
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
func summarizeUnits(units []unit) map[string]float64 {
|
|
summarized := make(map[string]float64)
|
|
|
|
for _, unitStateName := range unitStatesName {
|
|
summarized[unitStateName] = 0.0
|
|
}
|
|
|
|
for _, unit := range units {
|
|
summarized[unit.ActiveState] += 1.0
|
|
}
|
|
|
|
return summarized
|
|
}
|
|
|
|
func filterUnits(units []unit, whitelistPattern, blacklistPattern *regexp.Regexp) []unit {
|
|
filtered := make([]unit, 0, len(units))
|
|
for _, unit := range units {
|
|
if whitelistPattern.MatchString(unit.Name) && !blacklistPattern.MatchString(unit.Name) && unit.LoadState == "loaded" {
|
|
log.Debugf("Adding unit: %s", unit.Name)
|
|
filtered = append(filtered, unit)
|
|
} else {
|
|
log.Debugf("Ignoring unit: %s", unit.Name)
|
|
}
|
|
}
|
|
|
|
return filtered
|
|
}
|
|
|
|
func (c *systemdCollector) getSystemState() (state string, err error) {
|
|
conn, err := c.newDbus()
|
|
if err != nil {
|
|
return "", fmt.Errorf("couldn't get dbus connection: %s", err)
|
|
}
|
|
state, err = conn.GetManagerProperty("SystemState")
|
|
conn.Close()
|
|
return state, err
|
|
}
|