Merge pull request #450 from roclark/add-infiniband

infiniband: Add new collector for InfiniBand statistics
This commit is contained in:
Ben Kochie 2017-02-16 14:33:19 +01:00 committed by GitHub
commit 38cd07ebb9
23 changed files with 269 additions and 1 deletions

View file

@ -32,6 +32,7 @@ The following individuals have contributed code to this repository
* Ken Herner <ken@modulus.io> * Ken Herner <ken@modulus.io>
* Matt Layher <mdlayher@gmail.com> * Matt Layher <mdlayher@gmail.com>
* Matthias Rampke <matthias@rampke.de> * Matthias Rampke <matthias@rampke.de>
* Robert Clark <robert.d.clark@hpe.com>
* Siavash Safi <siavash.safi@gmail.com> * Siavash Safi <siavash.safi@gmail.com>
* Stephen Shirley <kormat@gmail.com> * Stephen Shirley <kormat@gmail.com>
* Steve Durrheimer <s.durrheimer@gmail.com> * Steve Durrheimer <s.durrheimer@gmail.com>

View file

@ -28,6 +28,7 @@ entropy | Exposes available entropy. | Linux
filefd | Exposes file descriptor statistics from `/proc/sys/fs/file-nr`. | Linux filefd | Exposes file descriptor statistics from `/proc/sys/fs/file-nr`. | Linux
filesystem | Exposes filesystem statistics, such as disk space used. | Darwin, Dragonfly, FreeBSD, Linux, OpenBSD filesystem | Exposes filesystem statistics, such as disk space used. | Darwin, Dragonfly, FreeBSD, Linux, OpenBSD
hwmon | Expose hardware monitoring and sensor data from `/sys/class/hwmon/`. | Linux hwmon | Expose hardware monitoring and sensor data from `/sys/class/hwmon/`. | Linux
infiniband | Exposes network statistics specific to InfiniBand configurations. | Linux
loadavg | Exposes load average. | Darwin, Dragonfly, FreeBSD, Linux, NetBSD, OpenBSD, Solaris loadavg | Exposes load average. | Darwin, Dragonfly, FreeBSD, Linux, NetBSD, OpenBSD, Solaris
mdadm | Exposes statistics about devices in `/proc/mdstat` (does nothing if no `/proc/mdstat` present). | Linux mdadm | Exposes statistics about devices in `/proc/mdstat` (does nothing if no `/proc/mdstat` present). | Linux
meminfo | Exposes memory statistics. | Darwin, Dragonfly, FreeBSD, Linux meminfo | Exposes memory statistics. | Darwin, Dragonfly, FreeBSD, Linux

View file

@ -689,6 +689,38 @@ node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp2"} 84
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp3"} 84 node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp3"} 84
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp4"} 84 node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp4"} 84
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp5"} 84 node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp5"} 84
# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
# TYPE node_infiniband_link_downed_total counter
node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
# TYPE node_infiniband_link_error_recovery_total counter
node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
# TYPE node_infiniband_multicast_packets_received_total counter
node_infiniband_multicast_packets_received_total{device="mlx4_0",port="1"} 93
node_infiniband_multicast_packets_received_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_multicast_packets_transmitted_total Number of multicast packets transmitted (including errors)
# TYPE node_infiniband_multicast_packets_transmitted_total counter
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_port_data_received_bytes Number of data octets received on all links
# TYPE node_infiniband_port_data_received_bytes counter
node_infiniband_port_data_received_bytes{device="mlx4_0",port="1"} 4.631917e+06
node_infiniband_port_data_received_bytes{device="mlx4_0",port="2"} 0
# HELP node_infiniband_port_data_transmitted_bytes Number of data octets transmitted on all links
# TYPE node_infiniband_port_data_transmitted_bytes counter
node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="1"} 3.73344e+06
node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="2"} 0
# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
# TYPE node_infiniband_unicast_packets_received_total counter
node_infiniband_unicast_packets_received_total{device="mlx4_0",port="1"} 61148
node_infiniband_unicast_packets_received_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_unicast_packets_transmitted_total Number of unicast packets transmitted (including errors)
# TYPE node_infiniband_unicast_packets_transmitted_total counter
node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="1"} 61239
node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
# HELP node_intr Total number of interrupts serviced. # HELP node_intr Total number of interrupts serviced.
# TYPE node_intr counter # TYPE node_intr counter
node_intr 8.885917e+06 node_intr 8.885917e+06

View file

@ -0,0 +1,177 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build linux
// +build !noinfiniband
package collector
import (
"errors"
"path/filepath"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log"
)
// infinibandPath is the relative location of the InfiniBand class directory.
// Update passes it through sysFilePath before handing it to the discovery
// helpers.
const infinibandPath = "class/infiniband"
var (
// errInfinibandNoDevicesFound is returned by infinibandDevices when its glob
// matches nothing; Update treats it as "nothing to collect".
errInfinibandNoDevicesFound = errors.New("no InfiniBand devices detected")
// errInfinibandNoPortsFound is returned by infinibandPorts when a device has
// no entries under "ports"; Update skips such a device.
errInfinibandNoPortsFound = errors.New("no InfiniBand ports detected")
)
// infinibandCollector exposes per-port InfiniBand counters as Prometheus
// metrics.
type infinibandCollector struct {
// metricDescs holds the metric descriptor for each counter, keyed by the same
// metric name used in counters.
metricDescs map[string]*prometheus.Desc
// counters maps a metric name to the counter file it is read from and its
// help text.
counters map[string]infinibandMetric
}
// infinibandMetric describes a single InfiniBand counter.
type infinibandMetric struct {
// File is the name of the file, inside a port's "counters" directory, that
// holds the counter value.
File string
// Help is the help text attached to the exported metric.
Help string
}
// init registers NewInfiniBandCollector in Factories under the name
// "infiniband".
func init() {
Factories["infiniband"] = NewInfiniBandCollector
}
// NewInfiniBandCollector returns a Collector that exposes InfiniBand port
// counters. It builds the table of known counters and one metric descriptor
// per counter, each labelled by "device" and "port". It never returns an
// error.
func NewInfiniBandCollector() (Collector, error) {
	const subsystem = "infiniband"

	c := &infinibandCollector{
		// Metric name -> counter file name and help text.
		counters: map[string]infinibandMetric{
			"link_downed_total": {
				File: "link_downed",
				Help: "Number of times the link failed to recover from an error state and went down",
			},
			"link_error_recovery_total": {
				File: "link_error_recovery",
				Help: "Number of times the link successfully recovered from an error state",
			},
			"multicast_packets_received_total": {
				File: "multicast_rcv_packets",
				Help: "Number of multicast packets received (including errors)",
			},
			"multicast_packets_transmitted_total": {
				File: "multicast_xmit_packets",
				Help: "Number of multicast packets transmitted (including errors)",
			},
			"port_data_received_bytes": {
				File: "port_rcv_data",
				Help: "Number of data octets received on all links",
			},
			"port_data_transmitted_bytes": {
				File: "port_xmit_data",
				Help: "Number of data octets transmitted on all links",
			},
			"unicast_packets_received_total": {
				File: "unicast_rcv_packets",
				Help: "Number of unicast packets received (including errors)",
			},
			"unicast_packets_transmitted_total": {
				File: "unicast_xmit_packets",
				Help: "Number of unicast packets transmitted (including errors)",
			},
		},
		metricDescs: make(map[string]*prometheus.Desc),
	}

	for name, counter := range c.counters {
		fqName := prometheus.BuildFQName(Namespace, subsystem, name)
		c.metricDescs[name] = prometheus.NewDesc(
			fqName,
			counter.Help,
			[]string{"device", "port"},
			nil,
		)
	}

	return c, nil
}
// infinibandDevices returns the names of the entries found directly under
// infinibandPath, each of which is treated as an InfiniBand device. It returns
// errInfinibandNoDevicesFound when the glob matches nothing.
func infinibandDevices(infinibandPath string) ([]string, error) {
	pattern := filepath.Join(infinibandPath, "/*")
	matches, err := filepath.Glob(pattern)
	if err != nil {
		return nil, err
	}

	if len(matches) == 0 {
		log.Debugf("Unable to detect InfiniBand devices")
		return nil, errInfinibandNoDevicesFound
	}

	// The last element of each matched path is the device name.
	names := make([]string, 0, len(matches))
	for _, match := range matches {
		names = append(names, filepath.Base(match))
	}
	return names, nil
}
// infinibandPorts returns the names of the entries under
// <infinibandPath>/<device>/ports, each of which is treated as a port of the
// device. It returns errInfinibandNoPortsFound when the glob matches nothing.
func infinibandPorts(infinibandPath, device string) ([]string, error) {
	pattern := filepath.Join(infinibandPath, device, "ports/*")
	matches, err := filepath.Glob(pattern)
	if err != nil {
		return nil, err
	}

	if len(matches) == 0 {
		log.Debugf("Unable to detect ports for %s", device)
		return nil, errInfinibandNoPortsFound
	}

	// The last element of each matched path is the port number.
	names := make([]string, 0, len(matches))
	for _, match := range matches {
		names = append(names, filepath.Base(match))
	}
	return names, nil
}
// readMetric reads the value stored in metricFile inside directory using
// readUintFromFile. On failure it logs the file name at debug level and
// returns 0 together with the error.
func readMetric(directory, metricFile string) (uint64, error) {
	path := filepath.Join(directory, metricFile)
	value, err := readUintFromFile(path)
	if err == nil {
		return value, nil
	}

	log.Debugf("Error reading %q file", metricFile)
	return 0, err
}
// Update discovers the InfiniBand devices and their ports and sends one
// counter metric per entry of c.counters, per port, on ch. Each metric is
// labelled with the device name and the port.
//
// Finding no devices ends the update without an error, and a device without
// ports is skipped. Any other error, including a counter file that cannot be
// read, aborts the update and is returned.
func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) (err error) {
	devices, err := infinibandDevices(sysFilePath(infinibandPath))
	if err == errInfinibandNoDevicesFound {
		// No devices means there is nothing to report; this is not a failure.
		return nil
	}
	if err != nil {
		return err
	}

	for _, device := range devices {
		ports, err := infinibandPorts(sysFilePath(infinibandPath), device)
		if err == errInfinibandNoPortsFound {
			// Nothing to report for this device; move on to the next one.
			continue
		}
		if err != nil {
			return err
		}

		for _, port := range ports {
			portDir := sysFilePath(filepath.Join(infinibandPath, device, "ports", port))
			// Every counter of a port lives in the same directory, so build
			// its path once per port.
			countersDir := filepath.Join(portDir, "counters")

			for name, counter := range c.counters {
				value, err := readMetric(countersDir, counter.File)
				if err != nil {
					return err
				}

				ch <- prometheus.MustNewConstMetric(
					c.metricDescs[name],
					prometheus.CounterValue,
					float64(value),
					device,
					port,
				)
			}
		}
	}

	return nil
}

View file

@ -0,0 +1,40 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package collector
import (
"testing"
)
// TestInfiniBandDevices checks that exactly one InfiniBand device is
// discovered in the sysfs fixture tree.
func TestInfiniBandDevices(t *testing.T) {
	devices, err := infinibandDevices("fixtures/sys/class/infiniband")
	if err != nil {
		t.Fatal(err)
	}

	if l := len(devices); l != 1 {
		// Fatalf, not Fatal: the message carries a format verb.
		t.Fatalf("Retrieved an unexpected number of InfiniBand devices: %d", l)
	}
}
// TestInfiniBandPorts checks that exactly two ports are discovered for the
// "mlx4_0" device in the sysfs fixture tree.
func TestInfiniBandPorts(t *testing.T) {
	ports, err := infinibandPorts("fixtures/sys/class/infiniband", "mlx4_0")
	if err != nil {
		t.Fatal(err)
	}

	if l := len(ports); l != 2 {
		// Fatalf, not Fatal: the message carries a format verb.
		t.Fatalf("Retrieved an unexpected number of InfiniBand ports: %d", l)
	}
}

View file

@ -11,6 +11,7 @@ collectors=$(cat << COLLECTORS
entropy entropy
filefd filefd
hwmon hwmon
infiniband
ksmd ksmd
loadavg loadavg
mdadm mdadm

View file

@ -32,7 +32,7 @@ import (
) )
const ( const (
defaultCollectors = "conntrack,cpu,diskstats,entropy,edac,filefd,filesystem,hwmon,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,wifi,zfs" defaultCollectors = "conntrack,cpu,diskstats,entropy,edac,filefd,filesystem,hwmon,infiniband,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,wifi,zfs"
) )
var ( var (