diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt
index 9a08a561..87c4b012 100644
--- a/collector/fixtures/e2e-output.txt
+++ b/collector/fixtures/e2e-output.txt
@@ -654,6 +654,38 @@ node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp2"} 84
 node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp3"} 84
 node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp4"} 84
 node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp5"} 84
+# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
+# TYPE node_infiniband_link_downed_total counter
+node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
+node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
+# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
+# TYPE node_infiniband_link_error_recovery_total counter
+node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
+node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
+# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
+# TYPE node_infiniband_multicast_packets_received_total counter
+node_infiniband_multicast_packets_received_total{device="mlx4_0",port="1"} 93
+node_infiniband_multicast_packets_received_total{device="mlx4_0",port="2"} 0
+# HELP node_infiniband_multicast_packets_transmitted_total Number of multicast packets transmitted (including errors)
+# TYPE node_infiniband_multicast_packets_transmitted_total counter
+node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
+node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
+# HELP node_infiniband_port_data_received_bytes Number of data octets received on all links
+# TYPE node_infiniband_port_data_received_bytes counter
+node_infiniband_port_data_received_bytes{device="mlx4_0",port="1"} 1.8527668e+07
+node_infiniband_port_data_received_bytes{device="mlx4_0",port="2"} 0
+# HELP node_infiniband_port_data_transmitted_bytes Number of data octets transmitted on all links
+# TYPE node_infiniband_port_data_transmitted_bytes counter
+node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="1"} 1.493376e+07
+node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="2"} 0
+# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
+# TYPE node_infiniband_unicast_packets_received_total counter
+node_infiniband_unicast_packets_received_total{device="mlx4_0",port="1"} 61148
+node_infiniband_unicast_packets_received_total{device="mlx4_0",port="2"} 0
+# HELP node_infiniband_unicast_packets_transmitted_total Number of unicast packets transmitted (including errors)
+# TYPE node_infiniband_unicast_packets_transmitted_total counter
+node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="1"} 61239
+node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
 # HELP node_intr Total number of interrupts serviced.
 # TYPE node_intr counter
 node_intr 8.885917e+06
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_downed b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_downed
new file mode 100644
index 00000000..573541ac
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_downed
@@ -0,0 +1 @@
+0
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_error_recovery b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_error_recovery
new file mode 100644
index 00000000..573541ac
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/link_error_recovery
@@ -0,0 +1 @@
+0
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_rcv_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_rcv_packets
new file mode 100644
index 00000000..c67f579c
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_rcv_packets
@@ -0,0 +1 @@
+93
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_xmit_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_xmit_packets
new file mode 100644
index 00000000..b6a7d89c
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/multicast_xmit_packets
@@ -0,0 +1 @@
+16
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_data b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_data
new file mode 100644
index 00000000..496ea27d
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_data
@@ -0,0 +1 @@
+4631917
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_data b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_data
new file mode 100644
index 00000000..85ea8ebf
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_data
@@ -0,0 +1 @@
+3733440
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_rcv_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_rcv_packets
new file mode 100644
index 00000000..2406651b
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_rcv_packets
@@ -0,0 +1 @@
+61148
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_xmit_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_xmit_packets
new file mode 100644
index 00000000..6279bd6a
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/1/counters/unicast_xmit_packets
@@ -0,0 +1 @@
+61239
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_downed b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_downed
new file mode 100644
index 00000000..573541ac
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_downed
@@ -0,0 +1 @@
+0
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_error_recovery b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_error_recovery
new file mode 100644
index 00000000..573541ac
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/link_error_recovery
@@ -0,0 +1 @@
+0
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_rcv_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_rcv_packets
new file mode 100644
index 00000000..573541ac
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_rcv_packets
@@ -0,0 +1 @@
+0
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_xmit_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_xmit_packets
new file mode 100644
index 00000000..573541ac
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/multicast_xmit_packets
@@ -0,0 +1 @@
+0
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_data b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_data
new file mode 100644
index 00000000..573541ac
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_rcv_data
@@ -0,0 +1 @@
+0
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_xmit_data b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_xmit_data
new file mode 100644
index 00000000..573541ac
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/port_xmit_data
@@ -0,0 +1 @@
+0
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_rcv_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_rcv_packets
new file mode 100644
index 00000000..573541ac
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_rcv_packets
@@ -0,0 +1 @@
+0
diff --git a/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_xmit_packets b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_xmit_packets
new file mode 100644
index 00000000..573541ac
--- /dev/null
+++ b/collector/fixtures/sys/class/infiniband/mlx4_0/ports/2/counters/unicast_xmit_packets
@@ -0,0 +1 @@
+0
diff --git a/collector/infiniband_linux.go b/collector/infiniband_linux.go
new file mode 100644
index 00000000..34ce4dab
--- /dev/null
+++ b/collector/infiniband_linux.go
@@ -0,0 +1,213 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+// +build !noinfiniband
+
+package collector
+
+import (
+	"errors"
+	"path/filepath"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/common/log"
+)
+
+const (
+	// infinibandPath is the InfiniBand class directory. It is always passed
+	// through sysFilePath before being used.
+	infinibandPath = "class/infiniband"
+
+	// infinibandDataUnitOctets is the number of octets represented by one unit
+	// of the port_rcv_data and port_xmit_data counters: the kernel's sysfs ABI
+	// documentation defines both as "data octets, divided by 4".
+	infinibandDataUnitOctets = 4
+)
+
+var (
+	errInfinibandNoDevicesFound = errors.New("no InfiniBand devices detected")
+	errInfinibandNoPortsFound   = errors.New("no InfiniBand ports detected")
+)
+
+// infinibandCollector exports the counters of every InfiniBand port.
+type infinibandCollector struct {
+	// metricDescs holds one descriptor per entry of counters, keyed by metric name.
+	metricDescs map[string]*prometheus.Desc
+	// counters maps a metric name to the counter file it is read from.
+	counters map[string]infinibandMetric
+}
+
+// infinibandMetric describes one counter file.
+type infinibandMetric struct {
+	File string // Name of the file inside a port's "counters" directory.
+	Help string // Help text of the exported metric.
+}
+
+func init() {
+	Factories["infiniband"] = NewInfiniBandCollector
+}
+
+// NewInfiniBandCollector returns a Collector exposing InfiniBand port counters,
+// labelled by device and port.
+func NewInfiniBandCollector() (Collector, error) {
+	var i infinibandCollector
+
+	// Filenames of all InfiniBand counter metrics including a detailed description.
+	i.counters = map[string]infinibandMetric{
+		"link_downed_total":                   {"link_downed", "Number of times the link failed to recover from an error state and went down"},
+		"link_error_recovery_total":           {"link_error_recovery", "Number of times the link successfully recovered from an error state"},
+		"multicast_packets_received_total":    {"multicast_rcv_packets", "Number of multicast packets received (including errors)"},
+		"multicast_packets_transmitted_total": {"multicast_xmit_packets", "Number of multicast packets transmitted (including errors)"},
+		"port_data_received_bytes":            {"port_rcv_data", "Number of data octets received on all links"},
+		"port_data_transmitted_bytes":         {"port_xmit_data", "Number of data octets transmitted on all links"},
+		"unicast_packets_received_total":      {"unicast_rcv_packets", "Number of unicast packets received (including errors)"},
+		"unicast_packets_transmitted_total":   {"unicast_xmit_packets", "Number of unicast packets transmitted (including errors)"},
+	}
+
+	subsystem := "infiniband"
+	i.metricDescs = make(map[string]*prometheus.Desc, len(i.counters))
+
+	for metricName, counter := range i.counters {
+		i.metricDescs[metricName] = prometheus.NewDesc(
+			prometheus.BuildFQName(Namespace, subsystem, metricName),
+			counter.Help,
+			[]string{"device", "port"},
+			nil,
+		)
+	}
+
+	return &i, nil
+}
+
+// infinibandDevices returns the names of the InfiniBand devices found directly
+// below infinibandPath, or errInfinibandNoDevicesFound if there are none.
+func infinibandDevices(infinibandPath string) ([]string, error) {
+	devices, err := filepath.Glob(filepath.Join(infinibandPath, "*"))
+	if err != nil {
+		return nil, err
+	}
+
+	if len(devices) < 1 {
+		log.Debugf("Unable to detect InfiniBand devices")
+		return nil, errInfinibandNoDevicesFound
+	}
+
+	// Extract just the filenames, which equate to the device names.
+	for i, device := range devices {
+		devices[i] = filepath.Base(device)
+	}
+
+	return devices, nil
+}
+
+// infinibandPorts returns the names of the ports of the given InfiniBand
+// device, or errInfinibandNoPortsFound if the device has none.
+func infinibandPorts(infinibandPath, device string) ([]string, error) {
+	ports, err := filepath.Glob(filepath.Join(infinibandPath, device, "ports/*"))
+	if err != nil {
+		return nil, err
+	}
+
+	if len(ports) < 1 {
+		log.Debugf("Unable to detect ports for %s", device)
+		return nil, errInfinibandNoPortsFound
+	}
+
+	// Extract just the filenames, which equate to the port numbers.
+	for i, port := range ports {
+		ports[i] = filepath.Base(port)
+	}
+
+	return ports, nil
+}
+
+// readMetric reads the unsigned integer stored in metricFile within directory.
+func readMetric(directory, metricFile string) (uint64, error) {
+	metric, err := readUintFromFile(filepath.Join(directory, metricFile))
+	if err != nil {
+		log.Debugf("Error reading %q file", metricFile)
+		return 0, err
+	}
+
+	return metric, nil
+}
+
+// infinibandCounterValue converts the raw content of the counter file named
+// file into the value to export. port_rcv_data and port_xmit_data count units
+// of infinibandDataUnitOctets octets while the metrics read from them are
+// expressed in bytes, so those two are scaled; every other counter is
+// exported unchanged.
+func infinibandCounterValue(file string, raw uint64) float64 {
+	switch file {
+	case "port_rcv_data", "port_xmit_data":
+		// Scale after converting to float64 so that a large counter cannot
+		// wrap around the uint64 range.
+		return float64(raw) * infinibandDataUnitOctets
+	default:
+		return float64(raw)
+	}
+}
+
+// Update sends one counter metric to ch for every counter file of every port
+// of every InfiniBand device. It returns nil without sending anything when no
+// device is found, and stops at the first counter that cannot be read.
+func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) (err error) {
+	devices, err := infinibandDevices(sysFilePath(infinibandPath))
+
+	// If no devices are found, InfiniBand is likely not installed and there is
+	// nothing to export. Any other error is reported to the caller.
+	switch err {
+	case nil:
+	case errInfinibandNoDevicesFound:
+		return nil
+	default:
+		return err
+	}
+
+	for _, device := range devices {
+		ports, err := infinibandPorts(sysFilePath(infinibandPath), device)
+
+		// If no ports are found for the specified device, skip to the next device.
+		switch err {
+		case nil:
+		case errInfinibandNoPortsFound:
+			continue
+		default:
+			return err
+		}
+
+		for _, port := range ports {
+			portFiles := sysFilePath(filepath.Join(infinibandPath, device, "ports", port))
+			countersDir := filepath.Join(portFiles, "counters")
+
+			// Add metrics for the InfiniBand counters.
+			for metricName, counter := range c.counters {
+				raw, err := readMetric(countersDir, counter.File)
+				if err != nil {
+					return err
+				}
+
+				ch <- prometheus.MustNewConstMetric(
+					c.metricDescs[metricName],
+					prometheus.CounterValue,
+					infinibandCounterValue(counter.File, raw),
+					device,
+					port,
+				)
+			}
+		}
+	}
+
+	return nil
+}
diff --git a/end-to-end-test.sh b/end-to-end-test.sh
index 2e92dad5..718e2590 100755
--- a/end-to-end-test.sh
+++ b/end-to-end-test.sh
@@ -10,6 +10,7 @@ collectors=$(cat << COLLECTORS
   entropy
   filefd
   hwmon
+  infiniband
   ksmd
   loadavg
   mdadm