diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index edfb373e..f694567f 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt @@ -787,10 +787,12 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1 node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239 # HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down # TYPE node_infiniband_link_downed_total counter +node_infiniband_link_downed_total{device="i40iw0",port="1"} 0 node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state # TYPE node_infiniband_link_error_recovery_total counter +node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0 node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) @@ -803,10 +805,12 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16 node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links # TYPE node_infiniband_port_data_received_bytes_total counter +node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0 node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07 node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links # TYPE node_infiniband_port_data_transmitted_bytes_total counter +node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index 9e24adab..5f0b8ac8 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -787,10 +787,12 @@ node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="1 node_infiniband_legacy_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 61239 # HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down # TYPE node_infiniband_link_downed_total counter +node_infiniband_link_downed_total{device="i40iw0",port="1"} 0 node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0 node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state # TYPE node_infiniband_link_error_recovery_total counter +node_infiniband_link_error_recovery_total{device="i40iw0",port="1"} 0 node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0 node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors) @@ -803,10 +805,12 @@ node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16 node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links # TYPE node_infiniband_port_data_received_bytes_total counter +node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0 node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="1"} 1.8527668e+07 node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_port_data_transmitted_bytes_total Number of data octets transmitted on all links # TYPE node_infiniband_port_data_transmitted_bytes_total counter +node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 # HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar index d8982edb..b8e45464 100644 --- a/collector/fixtures/sys.ttar +++ b/collector/fixtures/sys.ttar @@ -109,6 +109,103 @@ Mode: 644 Directory: sys/class/infiniband Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/i40iw0 +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/i40iw0/ports +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/i40iw0/ports/1 +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/class/infiniband/i40iw0/ports/1/counters +Mode: 755 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/excessive_buffer_overrun_errors +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/link_downed +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/link_error_recovery +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/local_link_integrity_errors +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_constraint_errors +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_data +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_errors +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_packets +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_remote_physical_errors +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/port_rcv_switch_relay_errors +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_constraint_errors +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_data +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_discards +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_packets +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/port_xmit_wait +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/symbol_error +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/i40iw0/ports/1/counters/VL15_dropped +Lines: 1 +N/A (no PMA) +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Directory: sys/class/infiniband/mlx4_0 Mode: 755 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/collector/infiniband_linux.go b/collector/infiniband_linux.go index 7c219769..660e4bd9 100644 --- a/collector/infiniband_linux.go +++ b/collector/infiniband_linux.go @@ -20,6 +20,7 @@ import ( "errors" "os" "path/filepath" + "strings" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/log" @@ -144,6 +145,15 @@ func infinibandPorts(infinibandPath, device string) ([]string, error) { func readMetric(directory, metricFile string) (uint64, error) { metric, err := readUintFromFile(filepath.Join(directory, metricFile)) if err != nil { + // Ugly workaround for handling #966, when counters are + // `N/A (not available)`. + // This was already patched and submitted, see + // https://www.spinics.net/lists/linux-rdma/msg68596.html + // Remove this as soon as the fix lands in the enterprise distros. + if strings.Contains(err.Error(), "N/A (no PMA)") { + log.Debugf("%q value is N/A", metricFile) + return 0, nil + } log.Debugf("Error reading %q file", metricFile) return 0, err } diff --git a/collector/infiniband_linux_test.go b/collector/infiniband_linux_test.go index 68370c05..d2090f83 100644 --- a/collector/infiniband_linux_test.go +++ b/collector/infiniband_linux_test.go @@ -23,7 +23,7 @@ func TestInfiniBandDevices(t *testing.T) { t.Fatal(err) } - if l := len(devices); l != 1 { + if l := len(devices); l != 2 { t.Fatalf("Retrieved an unexpected number of InfiniBand devices: %d", l) } }