mirror of
https://github.com/prometheus/node_exporter.git
synced 2024-12-31 16:37:31 -08:00
Merge pull request #450 from roclark/add-infiniband
infiniband: Add new collector for InfiniBand statistics
This commit is contained in:
commit
38cd07ebb9
|
@ -32,6 +32,7 @@ The following individuals have contributed code to this repository
|
||||||
* Ken Herner <ken@modulus.io>
|
* Ken Herner <ken@modulus.io>
|
||||||
* Matt Layher <mdlayher@gmail.com>
|
* Matt Layher <mdlayher@gmail.com>
|
||||||
* Matthias Rampke <matthias@rampke.de>
|
* Matthias Rampke <matthias@rampke.de>
|
||||||
|
* Robert Clark <robert.d.clark@hpe.com>
|
||||||
* Siavash Safi <siavash.safi@gmail.com>
|
* Siavash Safi <siavash.safi@gmail.com>
|
||||||
* Stephen Shirley <kormat@gmail.com>
|
* Stephen Shirley <kormat@gmail.com>
|
||||||
* Steve Durrheimer <s.durrheimer@gmail.com>
|
* Steve Durrheimer <s.durrheimer@gmail.com>
|
||||||
|
|
|
@ -28,6 +28,7 @@ entropy | Exposes available entropy. | Linux
|
||||||
filefd | Exposes file descriptor statistics from `/proc/sys/fs/file-nr`. | Linux
|
filefd | Exposes file descriptor statistics from `/proc/sys/fs/file-nr`. | Linux
|
||||||
filesystem | Exposes filesystem statistics, such as disk space used. | Darwin, Dragonfly, FreeBSD, Linux, OpenBSD
|
filesystem | Exposes filesystem statistics, such as disk space used. | Darwin, Dragonfly, FreeBSD, Linux, OpenBSD
|
||||||
hwmon | Expose hardware monitoring and sensor data from `/sys/class/hwmon/`. | Linux
|
hwmon | Expose hardware monitoring and sensor data from `/sys/class/hwmon/`. | Linux
|
||||||
|
infiniband | Exposes network statistics specific to InfiniBand configurations. | Linux
|
||||||
loadavg | Exposes load average. | Darwin, Dragonfly, FreeBSD, Linux, NetBSD, OpenBSD, Solaris
|
loadavg | Exposes load average. | Darwin, Dragonfly, FreeBSD, Linux, NetBSD, OpenBSD, Solaris
|
||||||
mdadm | Exposes statistics about devices in `/proc/mdstat` (does nothing if no `/proc/mdstat` present). | Linux
|
mdadm | Exposes statistics about devices in `/proc/mdstat` (does nothing if no `/proc/mdstat` present). | Linux
|
||||||
meminfo | Exposes memory statistics. | Darwin, Dragonfly, FreeBSD, Linux
|
meminfo | Exposes memory statistics. | Darwin, Dragonfly, FreeBSD, Linux
|
||||||
|
|
|
@ -689,6 +689,38 @@ node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp2"} 84
|
||||||
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp3"} 84
|
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp3"} 84
|
||||||
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp4"} 84
|
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp4"} 84
|
||||||
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp5"} 84
|
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp5"} 84
|
||||||
|
# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
|
||||||
|
# TYPE node_infiniband_link_downed_total counter
|
||||||
|
node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
|
||||||
|
node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
|
||||||
|
# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
|
||||||
|
# TYPE node_infiniband_link_error_recovery_total counter
|
||||||
|
node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
|
||||||
|
node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
|
||||||
|
# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
|
||||||
|
# TYPE node_infiniband_multicast_packets_received_total counter
|
||||||
|
node_infiniband_multicast_packets_received_total{device="mlx4_0",port="1"} 93
|
||||||
|
node_infiniband_multicast_packets_received_total{device="mlx4_0",port="2"} 0
|
||||||
|
# HELP node_infiniband_multicast_packets_transmitted_total Number of multicast packets transmitted (including errors)
|
||||||
|
# TYPE node_infiniband_multicast_packets_transmitted_total counter
|
||||||
|
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
|
||||||
|
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
|
||||||
|
# HELP node_infiniband_port_data_received_bytes Number of data octets received on all links
|
||||||
|
# TYPE node_infiniband_port_data_received_bytes counter
|
||||||
|
node_infiniband_port_data_received_bytes{device="mlx4_0",port="1"} 4.631917e+06
|
||||||
|
node_infiniband_port_data_received_bytes{device="mlx4_0",port="2"} 0
|
||||||
|
# HELP node_infiniband_port_data_transmitted_bytes Number of data octets transmitted on all links
|
||||||
|
# TYPE node_infiniband_port_data_transmitted_bytes counter
|
||||||
|
node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="1"} 3.73344e+06
|
||||||
|
node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="2"} 0
|
||||||
|
# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
|
||||||
|
# TYPE node_infiniband_unicast_packets_received_total counter
|
||||||
|
node_infiniband_unicast_packets_received_total{device="mlx4_0",port="1"} 61148
|
||||||
|
node_infiniband_unicast_packets_received_total{device="mlx4_0",port="2"} 0
|
||||||
|
# HELP node_infiniband_unicast_packets_transmitted_total Number of unicast packets transmitted (including errors)
|
||||||
|
# TYPE node_infiniband_unicast_packets_transmitted_total counter
|
||||||
|
node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="1"} 61239
|
||||||
|
node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
|
||||||
# HELP node_intr Total number of interrupts serviced.
|
# HELP node_intr Total number of interrupts serviced.
|
||||||
# TYPE node_intr counter
|
# TYPE node_intr counter
|
||||||
node_intr 8.885917e+06
|
node_intr 8.885917e+06
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
0
|
|
@ -0,0 +1 @@
|
||||||
|
0
|
|
@ -0,0 +1 @@
|
||||||
|
93
|
|
@ -0,0 +1 @@
|
||||||
|
16
|
|
@ -0,0 +1 @@
|
||||||
|
4631917
|
|
@ -0,0 +1 @@
|
||||||
|
3733440
|
|
@ -0,0 +1 @@
|
||||||
|
61148
|
|
@ -0,0 +1 @@
|
||||||
|
61239
|
|
@ -0,0 +1 @@
|
||||||
|
0
|
|
@ -0,0 +1 @@
|
||||||
|
0
|
|
@ -0,0 +1 @@
|
||||||
|
0
|
|
@ -0,0 +1 @@
|
||||||
|
0
|
|
@ -0,0 +1 @@
|
||||||
|
0
|
|
@ -0,0 +1 @@
|
||||||
|
0
|
|
@ -0,0 +1 @@
|
||||||
|
0
|
|
@ -0,0 +1 @@
|
||||||
|
0
|
177
collector/infiniband_linux.go
Normal file
177
collector/infiniband_linux.go
Normal file
|
@ -0,0 +1,177 @@
|
||||||
|
// Copyright 2017 The Prometheus Authors
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
// +build linux
|
||||||
|
// +build !noinfiniband
|
||||||
|
|
||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"path/filepath"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"github.com/prometheus/common/log"
|
||||||
|
)
|
||||||
|
|
||||||
|
// infinibandPath is the InfiniBand class directory; callers pass it through
// sysFilePath to obtain the full path.
const infinibandPath = "class/infiniband"

// errInfinibandNoDevicesFound is returned by infinibandDevices when no device
// entry is found.
var errInfinibandNoDevicesFound = errors.New("no InfiniBand devices detected")

// errInfinibandNoPortsFound is returned by infinibandPorts when a device has
// no port entries.
var errInfinibandNoPortsFound = errors.New("no InfiniBand ports detected")
|
||||||
|
|
||||||
|
type infinibandCollector struct {
|
||||||
|
metricDescs map[string]*prometheus.Desc
|
||||||
|
counters map[string]infinibandMetric
|
||||||
|
}
|
||||||
|
|
||||||
|
// infinibandMetric describes one InfiniBand counter. The field order matters:
// the collector's counter table initialises it positionally.
type infinibandMetric struct {
	// File is the name of the counter file inside a port's "counters" directory.
	File string
	// Help is the help text of the exported metric.
	Help string
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
Factories["infiniband"] = NewInfiniBandCollector
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewInfiniBandCollector() (Collector, error) {
|
||||||
|
var i infinibandCollector
|
||||||
|
|
||||||
|
// Filenames of all InfiniBand counter metrics including a detailed description.
|
||||||
|
i.counters = map[string]infinibandMetric{
|
||||||
|
"link_downed_total": {"link_downed", "Number of times the link failed to recover from an error state and went down"},
|
||||||
|
"link_error_recovery_total": {"link_error_recovery", "Number of times the link successfully recovered from an error state"},
|
||||||
|
"multicast_packets_received_total": {"multicast_rcv_packets", "Number of multicast packets received (including errors)"},
|
||||||
|
"multicast_packets_transmitted_total": {"multicast_xmit_packets", "Number of multicast packets transmitted (including errors)"},
|
||||||
|
"port_data_received_bytes": {"port_rcv_data", "Number of data octets received on all links"},
|
||||||
|
"port_data_transmitted_bytes": {"port_xmit_data", "Number of data octets transmitted on all links"},
|
||||||
|
"unicast_packets_received_total": {"unicast_rcv_packets", "Number of unicast packets received (including errors)"},
|
||||||
|
"unicast_packets_transmitted_total": {"unicast_xmit_packets", "Number of unicast packets transmitted (including errors)"},
|
||||||
|
}
|
||||||
|
|
||||||
|
subsystem := "infiniband"
|
||||||
|
i.metricDescs = make(map[string]*prometheus.Desc)
|
||||||
|
|
||||||
|
for metricName, infinibandMetric := range i.counters {
|
||||||
|
i.metricDescs[metricName] = prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(Namespace, subsystem, metricName),
|
||||||
|
infinibandMetric.Help,
|
||||||
|
[]string{"device", "port"},
|
||||||
|
nil,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
return &i, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// infinibandDevices retrieves a list of InfiniBand devices.
|
||||||
|
func infinibandDevices(infinibandPath string) ([]string, error) {
|
||||||
|
devices, err := filepath.Glob(filepath.Join(infinibandPath, "/*"))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(devices) < 1 {
|
||||||
|
log.Debugf("Unable to detect InfiniBand devices")
|
||||||
|
err = errInfinibandNoDevicesFound
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract just the filenames which equate to the device names.
|
||||||
|
for i, device := range devices {
|
||||||
|
devices[i] = filepath.Base(device)
|
||||||
|
}
|
||||||
|
|
||||||
|
return devices, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Retrieve a list of ports for the InfiniBand device.
|
||||||
|
func infinibandPorts(infinibandPath, device string) ([]string, error) {
|
||||||
|
ports, err := filepath.Glob(filepath.Join(infinibandPath, device, "ports/*"))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(ports) < 1 {
|
||||||
|
log.Debugf("Unable to detect ports for %s", device)
|
||||||
|
err = errInfinibandNoPortsFound
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract just the filenames which equates to the port numbers.
|
||||||
|
for i, port := range ports {
|
||||||
|
ports[i] = filepath.Base(port)
|
||||||
|
}
|
||||||
|
|
||||||
|
return ports, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func readMetric(directory, metricFile string) (uint64, error) {
|
||||||
|
metric, err := readUintFromFile(filepath.Join(directory, metricFile))
|
||||||
|
if err != nil {
|
||||||
|
log.Debugf("Error reading %q file", metricFile)
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return metric, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) (err error) {
|
||||||
|
devices, err := infinibandDevices(sysFilePath(infinibandPath))
|
||||||
|
|
||||||
|
// If no devices are found or another error is raised while attempting to find devices,
|
||||||
|
// InfiniBand is likely not installed and the collector should be skipped.
|
||||||
|
switch err {
|
||||||
|
case nil:
|
||||||
|
case errInfinibandNoDevicesFound:
|
||||||
|
return nil
|
||||||
|
default:
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, device := range devices {
|
||||||
|
ports, err := infinibandPorts(sysFilePath(infinibandPath), device)
|
||||||
|
|
||||||
|
// If no ports are found for the specified device, skip to the next device.
|
||||||
|
switch err {
|
||||||
|
case nil:
|
||||||
|
case errInfinibandNoPortsFound:
|
||||||
|
continue
|
||||||
|
default:
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, port := range ports {
|
||||||
|
portFiles := sysFilePath(filepath.Join(infinibandPath, device, "ports", port))
|
||||||
|
|
||||||
|
// Add metrics for the InfiniBand counters.
|
||||||
|
for metricName, infinibandMetric := range c.counters {
|
||||||
|
metric, err := readMetric(filepath.Join(portFiles, "counters"), infinibandMetric.File)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
ch <- prometheus.MustNewConstMetric(
|
||||||
|
c.metricDescs[metricName],
|
||||||
|
prometheus.CounterValue,
|
||||||
|
float64(metric),
|
||||||
|
device,
|
||||||
|
port,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
40
collector/infiniband_linux_test.go
Normal file
40
collector/infiniband_linux_test.go
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
// Copyright 2017 The Prometheus Authors
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestInfiniBandDevices(t *testing.T) {
|
||||||
|
devices, err := infinibandDevices("fixtures/sys/class/infiniband")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if l := len(devices); l != 1 {
|
||||||
|
t.Fatal("Retrieved an unexpected number of InfiniBand devices: %d", l)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestInfiniBandPorts(t *testing.T) {
|
||||||
|
ports, err := infinibandPorts("fixtures/sys/class/infiniband", "mlx4_0")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if l := len(ports); l != 2 {
|
||||||
|
t.Fatal("Retrieved an unexpected number of InfiniBand ports: %d", l)
|
||||||
|
}
|
||||||
|
}
|
|
@ -11,6 +11,7 @@ collectors=$(cat << COLLECTORS
|
||||||
entropy
|
entropy
|
||||||
filefd
|
filefd
|
||||||
hwmon
|
hwmon
|
||||||
|
infiniband
|
||||||
ksmd
|
ksmd
|
||||||
loadavg
|
loadavg
|
||||||
mdadm
|
mdadm
|
||||||
|
|
|
@ -32,7 +32,7 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
defaultCollectors = "conntrack,cpu,diskstats,entropy,edac,filefd,filesystem,hwmon,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,wifi,zfs"
|
defaultCollectors = "conntrack,cpu,diskstats,entropy,edac,filefd,filesystem,hwmon,infiniband,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,wifi,zfs"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
|
Loading…
Reference in a new issue