mirror of
				https://github.com/prometheus/node_exporter.git
				synced 2025-08-20 18:33:52 -07:00 
			
		
		
		
	* Add collector for PCIe devices with link information
The link status of PCIe devices sometimes changes,
like link or speed downgrades, and devices disappear.
This patch collects PCIe devices' link information to detect such failures.
As a first step, this collector exports PCIe devices'
- Device information (vendor_id, device_id, etc.)
- Parent PCIe device (e.g. PCIe bridge, PCIe switch)
- Link status (max_link_{transfers_per_second|width}, current_link_{transfers_per_second|width})
---------
Signed-off-by: Naoki MATSUMOTO <m.naoki9911@gmail.com>
		
	
			
		
			
				
	
	
		
			383 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable file
		
	
	
	
	
			
		
		
	
	
			383 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable file
		
	
	
	
	
#!/usr/bin/env bash

# Strict mode: stop on errors, on unset variables and on failed pipeline
# stages; noglob keeps unquoted expansions from being pathname-expanded.
set -o errexit -o nounset -o noglob -o pipefail

# GOHOSTOS can be preset in the environment for debugging purposes;
# otherwise it is taken from the Go toolchain.
GOHOSTOS="${GOHOSTOS:-$(go env GOHOSTOS)}"

# arch can be preset in the environment for debugging purposes;
# otherwise it is the machine name reported by uname.
arch="${arch:-$(uname -m)}"
 | |
| 
 | |
# Prints the space-separated list of source file names (relative to
# collector/) in which the given collector may be defined for the current
# host.
# Globals:   GOHOSTOS (read), arch (read)
# Arguments: $1 - collector name, e.g. "cpu"
# Outputs:   "<c>_common.go <c>.go <c>_<os>.go <c>_<os>_<arch>.go", followed
#            by " <c>_bsd.go" when GOHOSTOS is one of freebsd, openbsd,
#            netbsd, solaris or dragonfly.
maybe_flag_search_scope() {
  local collector=$1
  # Kept local so the helper does not leak state into the calling shell.
  local os_aux_os=""
  if [[ ${GOHOSTOS} =~ ^(freebsd|openbsd|netbsd|solaris|dragonfly)$ ]]; then
    os_aux_os=" ${collector}_bsd.go"
  fi
  echo "${collector}_common.go ${collector}.go ${collector}_${GOHOSTOS}.go ${collector}_${GOHOSTOS}_${arch}.go${os_aux_os}"
}
 | |
| 
 | |
# Filters a list of collector names down to those supported on this host.
# For each collector, the candidate files from maybe_flag_search_scope are
# tried in order; the first one accepted by `./tools/tools match` decides:
# the collector is supported only if that file contains a registerCollector(
# call.
# Arguments: $1 - whitespace-separated collector names
# Outputs:   the supported names, one per line, sorted and de-duplicated.
#            The accumulator starts with a separator, so the output begins
#            with an empty line; callers strip it with ${var:1}.
supported_collectors() {
  local collectors=$1
  local supported=""
  # Loop variables are local so a direct call cannot clobber same-named
  # globals used by the top-level script.
  local collector filename file
  # Unquoted on purpose: the list is split on whitespace.
  for collector in ${collectors}; do
    for filename in $(maybe_flag_search_scope "${collector}"); do
      file="collector/${filename}"
      if ./tools/tools match "${file}" > /dev/null 2>&1; then
        if grep -h -E -o -- "registerCollector\(" "${file}" > /dev/null 2>&1; then
          supported="${supported} ${collector}"
        fi
        # Only the first file that is part of the build is considered.
        break
      fi
    done
  done
  echo "${supported}" | tr ' ' '\n' | sort | uniq
}
 | |
| 
 | |
# Work from the directory that contains this script. This has to happen before
# supported_collectors is called, because it probes ./tools/tools and
# collector/* through relative paths; otherwise invoking the script from any
# other directory silently produces empty collector lists.
cd "$(dirname "$0")"

# Collectors that the test explicitly enables (when supported on this host).
enabled_collectors=$(cat << COLLECTORS
  arp
  bcache
  bonding
  btrfs
  buddyinfo
  cgroups
  conntrack
  cpu
  cpufreq
  cpu_vulnerabilities
  diskstats
  dmi
  drbd
  edac
  entropy
  fibrechannel
  filefd
  hwmon
  infiniband
  interrupts
  ipvs
  ksmd
  lnstat
  loadavg
  mdadm
  meminfo
  meminfo_numa
  mountstats
  netdev
  netstat
  nfs
  nfsd
  pcidevice
  pressure
  processes
  qdisc
  rapl
  schedstat
  slabinfo
  sockstat
  softirqs
  stat
  sysctl
  textfile
  thermal_zone
  udp_queues
  vmstat
  watchdog
  wifi
  xfrm
  xfs
  zfs
  zoneinfo
COLLECTORS
)
supported_enabled_collectors=$(supported_collectors "${enabled_collectors}")

# Collectors that the test explicitly disables (when supported on this host).
disabled_collectors=$(cat << COLLECTORS
  selinux
  filesystem
  timex
  uname
COLLECTORS
)
supported_disabled_collectors=$(supported_collectors "${disabled_collectors}")
 | |
| 
 | |
# Listen port picked at random from 10000-19999.
port=$(( 10000 + RANDOM % 10000 ))
# Scratch directory holding the exporter log, pid file and scraped metrics.
tmpdir="$(mktemp -d /tmp/node_exporter_e2e_test.XXXXXX)"

# Metrics matching this expression are dropped from the scraped output before
# it is compared with the fixture.
skip_re="^(go_|node_exporter_build_info|node_scrape_collector_duration_seconds|process_|node_textfile_mtime_seconds|node_time_(zone|seconds)|node_network_(receive|transmit)_(bytes|packets)_total)"

# aarch64 and ppc64le are compared against a dedicated 64k-page fixture.
if [[ "${arch}" == "aarch64" || "${arch}" == "ppc64le" ]]; then
  fixture_metrics='collector/fixtures/e2e-64k-page-output.txt'
else
  fixture_metrics='collector/fixtures/e2e-output.txt'
fi

# Only test CPU info collection on x86_64.
if [[ "${arch}" == "x86_64" ]]; then
  cpu_info_collector='--collector.cpu.info'
  cpu_info_bugs='^(cpu_meltdown|spectre_.*|mds)$'
  cpu_info_flags='^(aes|avx.?|constant_tsc)$'
else
  cpu_info_collector='--no-collector.cpu.info'
  cpu_info_bugs=''
  cpu_info_flags=''
fi
 | |
| 
 | |
# Command-line switches; all default to off.
keep=0
update=0
verbose=0
while getopts 'hkuv' opt; do
  case "${opt}" in
    k) keep=1 ;;
    u) update=1 ;;
    v)
      verbose=1
      # Trace every command from here on.
      set -x
      ;;
    *)
      # -h and any unknown option print the usage text and exit with 1.
      cat << USAGE
Usage: $0 [-k] [-u] [-v]
  -k: keep temporary files and leave node_exporter running
  -u: update fixture_metrics
  -v: verbose output
USAGE
      exit 1
      ;;
  esac
done
 | |
| 
 | |
# The test drives an already-built binary located next to this script.
if ! [[ -x ./node_exporter ]]; then
  echo './node_exporter not found. Consider running `go build` first.' >&2
  exit 1
fi
 | |
| 
 | |
# Candidate collector flags, one per line. The list is consumed by
# word-splitting, so the layout (two-space indent, newline separated) only
# matters for readability.
collector_flags="  ${cpu_info_collector}
  --collector.arp.device-exclude=nope
  --collector.bcache.priorityStats
  --collector.cpu.info.bugs-include=${cpu_info_bugs}
  --collector.cpu.info.flags-include=${cpu_info_flags}
  --collector.hwmon.chip-include=(applesmc|coretemp|hwmon4|nct6779)
  --collector.netclass.ignore-invalid-speed
  --collector.netclass.ignored-devices=(dmz|int)
  --collector.netdev.device-include=lo
  --collector.qdisc.device-include=(wlan0|eth0)
  --collector.qdisc.fixtures=collector/fixtures/qdisc/
  --collector.stat.softirq
  --collector.sysctl.include-info=kernel.seccomp.actions_avail
  --collector.sysctl.include=fs.file-nr
  --collector.sysctl.include=fs.file-nr:total,current,max
  --collector.sysctl.include=kernel.threads-max
  --collector.textfile.directory=collector/fixtures/textfile/two_metric_files/
  --collector.wifi.fixtures=collector/fixtures/wifi
  --no-collector.arp.netlink"
 | |
| 
 | |
# First pass over --[no-]collector.<name>... flags: keep a flag when its
# collector is enabled by default or appears in one of the supported lists.
# Nothing is hardcoded here; everything is derived from the collector sources.
_filtered_collector_flags=""
for flag in ${collector_flags}; do
  # The collector name is the second dot-separated field of the flag.
  collector=$(cut -d"." -f2 <<< "${flag}")

  # Look for a default-enabled registration in any candidate source file.
  enabled_by_default=0
  for filename in $(maybe_flag_search_scope "${collector}"); do
    file="collector/${filename}"
    if grep -h -E -o -- "registerCollector\(.*, defaultEnabled" "${file}" > /dev/null 2>&1; then
      enabled_by_default=1
      break
    fi
  done

  # Otherwise fall back to the supported enabled/disabled collector lists.
  if [[ ${enabled_by_default} -eq 1 ]] \
    || grep -q -w "${collector}" <<< "${supported_enabled_collectors} ${supported_disabled_collectors}"; then
    _filtered_collector_flags="${_filtered_collector_flags} ${flag}"
  fi
done
 | |
| 
 | |
# Second pass: keep a flag only if it is usable in the current build context.
#   --[no-]collector.<name>.<sub-flag>  must be declared through kingpin.Flag
#                                        in a source file that is part of the
#                                        build (per `./tools/tools match`);
#   --[no-]collector.<name>              only needs such a source file to exist.
# Each flag is added at most once: the search stops at the first file that
# accepts it, so a flag can never be handed to node_exporter twice.
filtered_collector_flags=""
for flag in ${_filtered_collector_flags}; do
  flag_collector="$(echo "${flag}" | cut -d"." -f2)"

  # Per-flag values, computed once rather than once per candidate file.
  has_sub_flag=0
  trimmed_flag=""
  if [[ -n "$(echo "${flag}" | cut -d"." -f3)" ]]; then
    has_sub_flag=1
    # Drop quotes/blanks, the "=value" part and the leading "--" ...
    trimmed_flag=$(echo "${flag}" | tr -d "\"' " | cut -d"=" -f1 | cut -c 3-)
    # ... as well as a leading "no-", leaving the name as declared in Go.
    trimmed_flag=${trimmed_flag#no-}
  fi

  # Iterate through all possible files where the flag may be defined.
  for filename in $(maybe_flag_search_scope "${flag_collector}"); do
    file="collector/${filename}"
    # Skip files that are not included under the build context.
    if ! ./tools/tools match "${file}" > /dev/null 2>&1; then
      continue
    fi
    # A sub-flag must be declared in this file; otherwise try the next one.
    if [[ ${has_sub_flag} -eq 1 ]] \
      && ! grep -h -E -o -- "kingpin.Flag\(\"${trimmed_flag}" "${file}" > /dev/null 2>&1; then
      continue
    fi
    filtered_collector_flags="${filtered_collector_flags} ${flag}"
    break
  done
done
 | |
| 
 | |
# Prints the flags from the first list that are missing from the second one.
# Flags are compared as whole words, so a flag that is merely a prefix of an
# accepted flag (e.g. "--x.include=a" vs "--x.include=a:b") is still reported.
# Arguments: $1 - whitespace-separated list of all candidate flags
#            $2 - whitespace-separated list of accepted flags
# Outputs:   the ignored flags on one line, each preceded by a single space
#            (an empty line when nothing is ignored).
list_ignored_flags() {
  local all_flags=$1
  # Normalise separators so every accepted flag is delimited by spaces.
  local accepted=" ${2//[$'\n\t']/ } "
  local flag
  local ignored=""
  # Unquoted on purpose: the list is split on whitespace.
  for flag in ${all_flags}; do
    case "${accepted}" in
      *" ${flag} "*) ;;
      *) ignored="${ignored} ${flag}" ;;
    esac
  done
  printf '%s\n' "${ignored}"
}

# Check for ignored flags.
ignored_flags=$(list_ignored_flags "${collector_flags}" "${filtered_collector_flags}")
 | |
| 
 | |
# Summary of what is about to be tested. The collector/flag lists carry one
# leading separator character, which ${var:1} removes before printing.
echo "ENABLED COLLECTORS======="
tr ' ' '\n' <<< "${supported_enabled_collectors:1}" | sort
echo "========================="

echo "DISABLED COLLECTORS======"
tr ' ' '\n' <<< "${supported_disabled_collectors:1}" | sort
echo "========================="

echo "IGNORED FLAGS============"
tr ' ' '\n' <<< "${ignored_flags:1}" | sort | uniq
echo "========================="
 | |
| 
 | |
# Assemble the exporter command line: fixture paths first, then one
# --collector.<name> / --no-collector.<name> switch per supported collector,
# then the filtered collector flags, then listen address and log level.
exporter_args=(
  --path.rootfs="collector/fixtures"
  --path.procfs="collector/fixtures/proc"
  --path.sysfs="collector/fixtures/sys"
  --path.udev.data="collector/fixtures/udev/data"
)
for c in ${supported_enabled_collectors}; do
  exporter_args+=("--collector.${c}")
done
for c in ${supported_disabled_collectors}; do
  exporter_args+=("--no-collector.${c}")
done
# Unquoted on purpose: every whitespace-separated flag becomes one argument.
# shellcheck disable=SC2206
exporter_args+=(${filtered_collector_flags})
exporter_args+=(--web.listen-address "127.0.0.1:${port}" --log.level="debug")

# Run in the background with all output captured in the log file.
./node_exporter "${exporter_args[@]}" > "${tmpdir}/node_exporter.log" 2>&1 &

# Record the pid of the background job for the cleanup handler.
echo $! > "${tmpdir}/node_exporter.pid"
 | |
| 
 | |
# File receiving the scraped metrics. The hosts listed below write to an
# OS-specific file and are compared against an OS-specific fixture
# ("<fixture>-<os>.txt").
generated_metrics="${tmpdir}/e2e-output.txt"
case "${GOHOSTOS}" in
  freebsd|openbsd|netbsd|solaris|dragonfly|darwin)
    generated_metrics="${tmpdir}/e2e-output-${GOHOSTOS}.txt"
    # ${var%.txt} instead of ${var::-4}: negative substring lengths need
    # Bash >= 4.2, which is not guaranteed on these hosts.
    fixture_metrics="${fixture_metrics%.txt}-${GOHOSTOS}.txt"
    ;;
esac
 | |
| 
 | |
# EXIT handler: reports, optionally updates the fixture, and cleans up.
# Globals:   verbose, update, keep, tmpdir, generated_metrics,
#            fixture_metrics (all read)
# Behaviour: - dumps the exporter log to stderr when the script failed or
#              -v was given;
#            - with -u, copies the generated metrics over the fixture;
#            - unless -k was given, kills node_exporter and removes tmpdir.
# Every step is best-effort: a failing step (e.g. no metrics generated yet,
# or node_exporter already gone) must not prevent the remaining ones.
finish() {
  # Must be the first statement: $? is the status the script is exiting with.
  local exit_status=$?
  local exporter_pid

  # With errexit still active, the first failing command would abort the
  # handler and leave the exporter running or the temporary directory behind.
  # This also silences the "Killed" message.
  set +e

  if [[ ${exit_status} -ne 0 || ${verbose} -ne 0 ]]; then
    cat << EOF >&2
LOG =====================
$(cat "${tmpdir}/node_exporter.log")
=========================
EOF
  fi

  if [[ ${update} -ne 0 ]]; then
    cp "${generated_metrics}" "${fixture_metrics}"
  fi

  if [[ ${keep} -eq 0 ]]; then
    exporter_pid=$(cat "${tmpdir}/node_exporter.pid")
    kill -9 "${exporter_pid}"
    wait "${exporter_pid}" > /dev/null 2>&1
    # :? refuses to run rm against an empty path.
    rm -rf -- "${tmpdir:?}"
  fi
}

trap finish EXIT
 | |
| 
 | |
# Fetches a URL and writes the response body to stdout, using curl when
# available and wget otherwise.
# Arguments: passed through unchanged to curl / wget (typically one URL)
# Outputs:   response body on stdout; diagnostics on stderr
# Exits:     with status 1 when neither tool is installed
get() {
  if command -v curl > /dev/null 2>&1; then
    curl -s -f "$@"
  elif command -v wget > /dev/null 2>&1; then
    wget -O - "$@"
  else
    # stdout is consumed as metrics by the caller, so the diagnostic has to
    # go to stderr.
    echo "Neither curl nor wget found" >&2
    exit 1
  fi
}
 | |
| 
 | |
# Wait a second before the first (and only) scrape.
sleep 1

# Scrape once and keep every line that does not match skip_re.
get "127.0.0.1:${port}/metrics" \
  | grep --text -E -v "${skip_re}" \
  > "${generated_metrics}"
 | |
| 
 | |
# The following ignore-list only applies to the VMs used to run the E2E tests
# on platforms for which no containerized environment is available. On those
# VMs some metrics produce non-deterministic samples, unlike their
# containerized counterparts (e.g. node_network_receive_bytes_total).
non_deterministic_metrics=$(cat << METRICS
  node_boot_time_seconds
  node_cpu_frequency_hertz
  node_cpu_frequency_max_hertz
  node_cpu_seconds_total
  node_disk_io_time_seconds_total
  node_disk_read_bytes_total
  node_disk_read_sectors_total
  node_disk_read_time_seconds_total
  node_disk_reads_completed_total
  node_disk_write_time_seconds_total
  node_disk_writes_completed_total
  node_disk_written_bytes_total
  node_disk_written_sectors_total
  node_exec_context_switches_total
  node_exec_device_interrupts_total
  node_exec_forks_total
  node_exec_software_interrupts_total
  node_exec_system_calls_total
  node_exec_traps_total
  node_interrupts_total
  node_load1
  node_load15
  node_load5
  node_memory_active_bytes
  node_memory_buffer_bytes
  node_memory_cache_bytes
  node_memory_compressed_bytes
  node_memory_free_bytes
  node_memory_inactive_bytes
  node_memory_internal_bytes
  node_memory_laundry_bytes
  node_memory_purgeable_bytes
  node_memory_size_bytes
  node_memory_swapped_in_bytes_total
  node_memory_swapped_out_bytes_total
  node_memory_wired_bytes
  node_netstat_tcp_receive_packets_total
  node_netstat_tcp_transmit_packets_total
  node_network_receive_bytes_total
  node_network_receive_multicast_total
  node_network_transmit_multicast_total
METRICS
)

# Deletes from a file every line that contains one of the given metric names
# (sample lines as well as their HELP/TYPE comments).
# Arguments: $1 - file to edit; remaining arguments - metric names
# The file is rewritten through a temporary copy instead of `sed -i`, whose
# syntax differs between GNU sed and the BSD/macOS sed found on the hosts
# this is meant for.
remove_metric_lines() {
  local target=$1
  shift
  local scratch="${target}.tmp"
  local metric
  for metric in "$@"; do
    sed -e "/${metric}/d" "${target}" > "${scratch}"
    mv "${scratch}" "${target}"
  done
}

# Remove non-deterministic metrics from the generated metrics file (as we run
# their workflows in VMs).
case "${GOHOSTOS}" in
  freebsd|openbsd|netbsd|solaris|dragonfly|darwin)
    # Unquoted on purpose: one argument per metric name.
    # shellcheck disable=SC2086
    remove_metric_lines "${generated_metrics}" ${non_deterministic_metrics}
    ;;
esac
 | |
| 
 | |
# Final verdict: the last command's status becomes the script's exit status,
# so any difference between fixture and scraped metrics fails the test.
diff -u "${fixture_metrics}" "${generated_metrics}"
 |