From bdc0e7e67807ef995a29c36ad9377aff15ae266c Mon Sep 17 00:00:00 2001 From: Patrick Date: Tue, 30 Oct 2018 16:54:09 -0400 Subject: [PATCH] Collect additional common Infiniband counters (#1120) * Collect additional common Infiniband counters Signed-off-by: Patrick Freeman --- CHANGELOG.md | 2 +- collector/fixtures/e2e-64k-page-output.txt | 31 +++++++++++++++ collector/fixtures/e2e-output.txt | 31 +++++++++++++++ collector/fixtures/sys.ttar | 45 ++++++++++++++++++++++ collector/infiniband_linux.go | 24 ++++++++---- 5 files changed, 124 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 129670c2..c6e79c83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ * [BUGFIX] * [CHANGE] -* [ENHANCEMENT] +* [ENHANCEMENT] Add Infiniband counters #1120 * [FEATURE] ## 0.17.0-rc.0 / 2018-10-19 diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index cf67e153..91e27a4b 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt @@ -826,6 +826,14 @@ node_infiniband_multicast_packets_received_total{device="mlx4_0",port="2"} 0 # TYPE node_infiniband_multicast_packets_transmitted_total counter node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16 node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_port_constraint_errors_received_total Number of packets received on the switch physical port that are discarded +# TYPE node_infiniband_port_constraint_errors_received_total counter +node_infiniband_port_constraint_errors_received_total{device="i40iw0",port="1"} 0 +node_infiniband_port_constraint_errors_received_total{device="mlx4_0",port="1"} 0 +# HELP node_infiniband_port_constraint_errors_transmitted_total Number of packets not transmitted from the switch physical port +# TYPE node_infiniband_port_constraint_errors_transmitted_total counter +node_infiniband_port_constraint_errors_transmitted_total{device="i40iw0",port="1"} 0 +node_infiniband_port_constraint_errors_transmitted_total{device="mlx4_0",port="1"} 0 # HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links # TYPE node_infiniband_port_data_received_bytes_total counter node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0 @@ -836,6 +844,29 @@ node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_port_discards_received_total Number of inbound packets discarded by the port because the port is down or congested +# TYPE node_infiniband_port_discards_received_total counter +node_infiniband_port_discards_received_total{device="mlx4_0",port="1"} 0 +# HELP node_infiniband_port_discards_transmitted_total Number of outbound packets discarded by the port because the port is down or congested +# TYPE node_infiniband_port_discards_transmitted_total counter +node_infiniband_port_discards_transmitted_total{device="i40iw0",port="1"} 0 +node_infiniband_port_discards_transmitted_total{device="mlx4_0",port="1"} 5 +# HELP node_infiniband_port_errors_received_total Number of packets containing an error that were received on this port +# TYPE node_infiniband_port_errors_received_total counter +node_infiniband_port_errors_received_total{device="i40iw0",port="1"} 0 +node_infiniband_port_errors_received_total{device="mlx4_0",port="1"} 0 +# HELP node_infiniband_port_packets_received_total Number of packets received on all VLs by this port (including errors) +# TYPE node_infiniband_port_packets_received_total counter +node_infiniband_port_packets_received_total{device="i40iw0",port="1"} 0 +node_infiniband_port_packets_received_total{device="mlx4_0",port="1"} 6.825908347e+09 +# HELP node_infiniband_port_packets_transmitted_total Number of packets transmitted on all VLs from this port (including errors) +# TYPE node_infiniband_port_packets_transmitted_total counter +node_infiniband_port_packets_transmitted_total{device="i40iw0",port="1"} 0 +node_infiniband_port_packets_transmitted_total{device="mlx4_0",port="1"} 6.235865e+06 +# HELP node_infiniband_port_transmit_wait_total Number of ticks during which the port had data to transmit but no data was sent during the entire tick +# TYPE node_infiniband_port_transmit_wait_total counter +node_infiniband_port_transmit_wait_total{device="i40iw0",port="1"} 0 +node_infiniband_port_transmit_wait_total{device="mlx4_0",port="1"} 4.294967295e+09 # HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) # TYPE node_infiniband_unicast_packets_received_total counter node_infiniband_unicast_packets_received_total{device="mlx4_0",port="1"} 61148 diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index 9cbfce16..2f381124 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -826,6 +826,14 @@ node_infiniband_multicast_packets_received_total{device="mlx4_0",port="2"} 0 # TYPE node_infiniband_multicast_packets_transmitted_total counter node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16 node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_port_constraint_errors_received_total Number of packets received on the switch physical port that are discarded +# TYPE node_infiniband_port_constraint_errors_received_total counter +node_infiniband_port_constraint_errors_received_total{device="i40iw0",port="1"} 0 +node_infiniband_port_constraint_errors_received_total{device="mlx4_0",port="1"} 0 +# HELP node_infiniband_port_constraint_errors_transmitted_total Number of packets not transmitted from the switch physical port +# TYPE node_infiniband_port_constraint_errors_transmitted_total counter +node_infiniband_port_constraint_errors_transmitted_total{device="i40iw0",port="1"} 0 +node_infiniband_port_constraint_errors_transmitted_total{device="mlx4_0",port="1"} 0 # HELP node_infiniband_port_data_received_bytes_total Number of data octets received on all links # TYPE node_infiniband_port_data_received_bytes_total counter node_infiniband_port_data_received_bytes_total{device="i40iw0",port="1"} 0 @@ -836,6 +844,29 @@ node_infiniband_port_data_received_bytes_total{device="mlx4_0",port="2"} 0 node_infiniband_port_data_transmitted_bytes_total{device="i40iw0",port="1"} 0 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="1"} 1.493376e+07 node_infiniband_port_data_transmitted_bytes_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_port_discards_received_total Number of inbound packets discarded by the port because the port is down or congested +# TYPE node_infiniband_port_discards_received_total counter +node_infiniband_port_discards_received_total{device="mlx4_0",port="1"} 0 +# HELP node_infiniband_port_discards_transmitted_total Number of outbound packets discarded by the port because the port is down or congested +# TYPE node_infiniband_port_discards_transmitted_total counter +node_infiniband_port_discards_transmitted_total{device="i40iw0",port="1"} 0 +node_infiniband_port_discards_transmitted_total{device="mlx4_0",port="1"} 5 +# HELP node_infiniband_port_errors_received_total Number of packets containing an error that were received on this port +# TYPE node_infiniband_port_errors_received_total counter +node_infiniband_port_errors_received_total{device="i40iw0",port="1"} 0 +node_infiniband_port_errors_received_total{device="mlx4_0",port="1"} 0 +# HELP node_infiniband_port_packets_received_total Number of packets received on all VLs by this port (including errors) +# TYPE node_infiniband_port_packets_received_total counter +node_infiniband_port_packets_received_total{device="i40iw0",port="1"} 0 +node_infiniband_port_packets_received_total{device="mlx4_0",port="1"} 6.825908347e+09 +# HELP node_infiniband_port_packets_transmitted_total Number of packets transmitted on all VLs from this port (including errors) +# TYPE node_infiniband_port_packets_transmitted_total counter +node_infiniband_port_packets_transmitted_total{device="i40iw0",port="1"} 0 +node_infiniband_port_packets_transmitted_total{device="mlx4_0",port="1"} 6.235865e+06 +# HELP node_infiniband_port_transmit_wait_total Number of ticks during which the port had data to transmit but no data was sent during the entire tick +# TYPE node_infiniband_port_transmit_wait_total counter +node_infiniband_port_transmit_wait_total{device="i40iw0",port="1"} 0 +node_infiniband_port_transmit_wait_total{device="mlx4_0",port="1"} 4.294967295e+09 # HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) # TYPE node_infiniband_unicast_packets_received_total counter node_infiniband_unicast_packets_received_total{device="mlx4_0",port="1"} 61148 diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar index f739de25..27b0cb40 100644 --- a/collector/fixtures/sys.ttar +++ b/collector/fixtures/sys.ttar @@ -238,16 +238,61 @@ Lines: 1 16 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_constraint_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_data Lines: 1 4631917 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_discards +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_discards +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_rcv_packets +Lines: 1 +6825908347 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_constraint_errors +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_data Lines: 1 3733440 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_discards +Lines: 1 +5 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_packets +Lines: 1 +6235865 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/infiniband/mlx4_0/ports/1/counters/port_xmit_wait +Lines: 1 +4294967295 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/class/infiniband/mlx4_0/ports/1/counters/unicast_rcv_packets Lines: 1 61148 diff --git a/collector/infiniband_linux.go b/collector/infiniband_linux.go index 660e4bd9..16134f3e 100644 --- a/collector/infiniband_linux.go +++ b/collector/infiniband_linux.go @@ -54,14 +54,22 @@ func NewInfiniBandCollector() (Collector, error) { // Filenames of all InfiniBand counter metrics including a detailed description. i.counters = map[string]infinibandMetric{ - "link_downed_total": {"link_downed", "Number of times the link failed to recover from an error state and went down"}, - "link_error_recovery_total": {"link_error_recovery", "Number of times the link successfully recovered from an error state"}, - "multicast_packets_received_total": {"multicast_rcv_packets", "Number of multicast packets received (including errors)"}, - "multicast_packets_transmitted_total": {"multicast_xmit_packets", "Number of multicast packets transmitted (including errors)"}, - "port_data_received_bytes_total": {"port_rcv_data", "Number of data octets received on all links"}, - "port_data_transmitted_bytes_total": {"port_xmit_data", "Number of data octets transmitted on all links"}, - "unicast_packets_received_total": {"unicast_rcv_packets", "Number of unicast packets received (including errors)"}, - "unicast_packets_transmitted_total": {"unicast_xmit_packets", "Number of unicast packets transmitted (including errors)"}, + "link_downed_total": {"link_downed", "Number of times the link failed to recover from an error state and went down"}, + "link_error_recovery_total": {"link_error_recovery", "Number of times the link successfully recovered from an error state"}, + "multicast_packets_received_total": {"multicast_rcv_packets", "Number of multicast packets received (including errors)"}, + "multicast_packets_transmitted_total": {"multicast_xmit_packets", "Number of multicast packets transmitted (including errors)"}, + "port_constraint_errors_received_total": {"port_rcv_constraint_errors", "Number of packets received on the switch physical port that are discarded"}, + "port_constraint_errors_transmitted_total": {"port_xmit_constraint_errors", "Number of packets not transmitted from the switch physical port"}, + "port_data_received_bytes_total": {"port_rcv_data", "Number of data octets received on all links"}, + "port_data_transmitted_bytes_total": {"port_xmit_data", "Number of data octets transmitted on all links"}, + "port_discards_received_total": {"port_rcv_discards", "Number of inbound packets discarded by the port because the port is down or congested"}, + "port_discards_transmitted_total": {"port_xmit_discards", "Number of outbound packets discarded by the port because the port is down or congested"}, + "port_errors_received_total": {"port_rcv_errors", "Number of packets containing an error that were received on this port"}, + "port_packets_received_total": {"port_rcv_packets", "Number of packets received on all VLs by this port (including errors)"}, + "port_packets_transmitted_total": {"port_xmit_packets", "Number of packets transmitted on all VLs from this port (including errors)"}, + "port_transmit_wait_total": {"port_xmit_wait", "Number of ticks during which the port had data to transmit but no data was sent during the entire tick"}, + "unicast_packets_received_total": {"unicast_rcv_packets", "Number of unicast packets received (including errors)"}, + "unicast_packets_transmitted_total": {"unicast_xmit_packets", "Number of unicast packets transmitted (including errors)"}, } // Deprecated counters for some older versions of InfiniBand drivers.