From 04fbcfffa194b5a77c8de60ca38d2530a8dfeae3 Mon Sep 17 00:00:00 2001 From: Benjamin Drung Date: Fri, 22 Nov 2019 22:52:17 +0100 Subject: [PATCH] Collect InfiniBand port state and physical state (#1357) Collect the InfiniBand port state, the physical state, and the maximum signal transfer rate. Signed-off-by: Benjamin Drung --- collector/fixtures/e2e-64k-page-output.txt | 15 +++++++++++++++ collector/fixtures/e2e-output.txt | 15 +++++++++++++++ collector/infiniband_linux.go | 7 +++++++ 3 files changed, 37 insertions(+) diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index 729deff9..9f75f082 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt @@ -840,6 +840,11 @@ node_infiniband_multicast_packets_received_total{device="mlx4_0",port="2"} 0 # TYPE node_infiniband_multicast_packets_transmitted_total counter node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16 node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_physical_state_id Physical state of the InfiniBand port (0: no change, 1: sleep, 2: polling, 3: disable, 4: shift, 5: link up, 6: link error recover, 7: phytest) +# TYPE node_infiniband_physical_state_id gauge +node_infiniband_physical_state_id{device="i40iw0",port="1"} 5 +node_infiniband_physical_state_id{device="mlx4_0",port="1"} 5 +node_infiniband_physical_state_id{device="mlx4_0",port="2"} 5 # HELP node_infiniband_port_constraint_errors_received_total Number of packets received on the switch physical port that are discarded # TYPE node_infiniband_port_constraint_errors_received_total counter node_infiniband_port_constraint_errors_received_total{device="mlx4_0",port="1"} 0 @@ -872,6 +877,16 @@ node_infiniband_port_packets_transmitted_total{device="mlx4_0",port="1"} 6.23586 # HELP node_infiniband_port_transmit_wait_total Number of ticks during which the port had data to transmit but no data was sent during the entire tick # TYPE node_infiniband_port_transmit_wait_total counter node_infiniband_port_transmit_wait_total{device="mlx4_0",port="1"} 4.294967295e+09 +# HELP node_infiniband_rate_bytes_per_second Maximum signal transfer rate +# TYPE node_infiniband_rate_bytes_per_second gauge +node_infiniband_rate_bytes_per_second{device="i40iw0",port="1"} 1.25e+09 +node_infiniband_rate_bytes_per_second{device="mlx4_0",port="1"} 5e+09 +node_infiniband_rate_bytes_per_second{device="mlx4_0",port="2"} 5e+09 +# HELP node_infiniband_state_id State of the InfiniBand port (0: no change, 1: down, 2: init, 3: armed, 4: active, 5: act defer) +# TYPE node_infiniband_state_id gauge +node_infiniband_state_id{device="i40iw0",port="1"} 4 +node_infiniband_state_id{device="mlx4_0",port="1"} 4 +node_infiniband_state_id{device="mlx4_0",port="2"} 4 # HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) # TYPE node_infiniband_unicast_packets_received_total counter node_infiniband_unicast_packets_received_total{device="mlx4_0",port="1"} 61148 diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index 41129c4a..c1633469 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -840,6 +840,11 @@ node_infiniband_multicast_packets_received_total{device="mlx4_0",port="2"} 0 # TYPE node_infiniband_multicast_packets_transmitted_total counter node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16 node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0 +# HELP node_infiniband_physical_state_id Physical state of the InfiniBand port (0: no change, 1: sleep, 2: polling, 3: disable, 4: shift, 5: link up, 6: link error recover, 7: phytest) +# TYPE node_infiniband_physical_state_id gauge +node_infiniband_physical_state_id{device="i40iw0",port="1"} 5 +node_infiniband_physical_state_id{device="mlx4_0",port="1"} 5 +node_infiniband_physical_state_id{device="mlx4_0",port="2"} 5 # HELP node_infiniband_port_constraint_errors_received_total Number of packets received on the switch physical port that are discarded # TYPE node_infiniband_port_constraint_errors_received_total counter node_infiniband_port_constraint_errors_received_total{device="mlx4_0",port="1"} 0 @@ -872,6 +877,16 @@ node_infiniband_port_packets_transmitted_total{device="mlx4_0",port="1"} 6.23586 # HELP node_infiniband_port_transmit_wait_total Number of ticks during which the port had data to transmit but no data was sent during the entire tick # TYPE node_infiniband_port_transmit_wait_total counter node_infiniband_port_transmit_wait_total{device="mlx4_0",port="1"} 4.294967295e+09 +# HELP node_infiniband_rate_bytes_per_second Maximum signal transfer rate +# TYPE node_infiniband_rate_bytes_per_second gauge +node_infiniband_rate_bytes_per_second{device="i40iw0",port="1"} 1.25e+09 +node_infiniband_rate_bytes_per_second{device="mlx4_0",port="1"} 5e+09 +node_infiniband_rate_bytes_per_second{device="mlx4_0",port="2"} 5e+09 +# HELP node_infiniband_state_id State of the InfiniBand port (0: no change, 1: down, 2: init, 3: armed, 4: active, 5: act defer) +# TYPE node_infiniband_state_id gauge +node_infiniband_state_id{device="i40iw0",port="1"} 4 +node_infiniband_state_id{device="mlx4_0",port="1"} 4 +node_infiniband_state_id{device="mlx4_0",port="2"} 4 # HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors) # TYPE node_infiniband_unicast_packets_received_total counter node_infiniband_unicast_packets_received_total{device="mlx4_0",port="1"} 61148 diff --git a/collector/infiniband_linux.go b/collector/infiniband_linux.go index 237a9133..828e40e5 100644 --- a/collector/infiniband_linux.go +++ b/collector/infiniband_linux.go @@ -57,6 +57,7 @@ func NewInfiniBandCollector() (Collector, error) { "link_error_recovery_total": "Number of times the link successfully recovered from an error state", "multicast_packets_received_total": "Number of multicast packets received (including errors)", "multicast_packets_transmitted_total": "Number of multicast packets transmitted (including errors)", + "physical_state_id": "Physical state of the InfiniBand port (0: no change, 1: sleep, 2: polling, 3: disable, 4: shift, 5: link up, 6: link error recover, 7: phytest)", "port_constraint_errors_received_total": "Number of packets received on the switch physical port that are discarded", "port_constraint_errors_transmitted_total": "Number of packets not transmitted from the switch physical port", "port_data_received_bytes_total": "Number of data octets received on all links", @@ -67,6 +68,8 @@ func NewInfiniBandCollector() (Collector, error) { "port_packets_received_total": "Number of packets received on all VLs by this port (including errors)", "port_packets_transmitted_total": "Number of packets transmitted on all VLs from this port (including errors)", "port_transmit_wait_total": "Number of ticks during which the port had data to transmit but no data was sent during the entire tick", + "rate_bytes_per_second": "Maximum signal transfer rate", + "state_id": "State of the InfiniBand port (0: no change, 1: down, 2: init, 3: armed, 4: active, 5: act defer)", "unicast_packets_received_total": "Number of unicast packets received (including errors)", "unicast_packets_transmitted_total": "Number of unicast packets transmitted (including errors)", } @@ -105,6 +108,10 @@ func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) error { for _, port := range device.Ports { portStr := strconv.FormatUint(uint64(port.Port), 10) + c.pushMetric(ch, "state_id", uint64(port.StateID), port.Name, portStr, prometheus.GaugeValue) + c.pushMetric(ch, "physical_state_id", uint64(port.PhysStateID), port.Name, portStr, prometheus.GaugeValue) + c.pushMetric(ch, "rate_bytes_per_second", port.Rate, port.Name, portStr, prometheus.GaugeValue) + c.pushCounter(ch, "legacy_multicast_packets_received_total", port.Counters.LegacyPortMulticastRcvPackets, port.Name, portStr) c.pushCounter(ch, "legacy_multicast_packets_transmitted_total", port.Counters.LegacyPortMulticastXmitPackets, port.Name, portStr) c.pushCounter(ch, "legacy_data_received_bytes_total", port.Counters.LegacyPortRcvData64, port.Name, portStr)