feat: Collect /sys/class/net PCIe AER counters

Signed-off-by: Diego Asturias <dasturias@arista.com>
This commit is contained in:
Diego Asturias 2024-12-10 01:24:02 +00:00
parent cf8c6891cc
commit 35dbec2263
3 changed files with 256 additions and 2 deletions

252
collector/aer_linux.go Normal file
View file

@ -0,0 +1,252 @@
// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//go:build !nonetclass && linux
// +build !nonetclass,linux
package collector
import (
"errors"
"fmt"
"log/slog"
"os"
"regexp"
"strconv"
"github.com/alecthomas/kingpin/v2"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs/sysfs"
)
// Command-line flag and metric descriptors used by the aer collector.
var (
	// aerIgnoredDevices is the regexp (as given on the command line) of
	// device names the collector skips.
	aerIgnoredDevices = kingpin.Flag(
		"collector.aer.ignored-devices",
		"Regexp of aer devices to ignore for aer collector.",
	).Default("^$").String()

	// Correctable error counters; each carries the "interface" label.
	aerCorrectableRxErr       = newAerCorrectableDesc("correctable_rx_err", "Count of correctable receiver errors")
	aerCorrectableBadTLP      = newAerCorrectableDesc("correctable_bad_tlp", "Count of correctable bad TLPs")
	aerCorrectableBadDLLP     = newAerCorrectableDesc("correctable_bad_dllp", "Count of correctable bad DLLPs")
	aerCorrectableRollover    = newAerCorrectableDesc("correctable_rollover", "Count of correctable rollovers")
	aerCorrectableTimeout     = newAerCorrectableDesc("correctable_timeout", "Count of correctable replay timer timeouts")
	aerCorrectableNonFatalErr = newAerCorrectableDesc("correctable_non_fatal_err", "Count of correctable advisory non-fatal errors")
	aerCorrectableCorrIntErr  = newAerCorrectableDesc("correctable_corr_int_err", "Count of correctable corrected internal errors")
	aerCorrectableHeaderOF    = newAerCorrectableDesc("correctable_header_of", "Count of correctable header log Overflows")

	// Uncorrectable error counters; each carries the "interface" and
	// "fatal" labels.
	aerUncorrectableUndefined        = newAerUncorrectableDesc("uncorrectable_undefined", "Count of uncorrectable undefined errors")
	aerUncorrectableDLP              = newAerUncorrectableDesc("uncorrectable_dlp", "Count of uncorrectable data link protocol errors")
	aerUncorrectableSDES             = newAerUncorrectableDesc("uncorrectable_sdes", "Count of uncorrectable surprise down errors")
	aerUncorrectableTLP              = newAerUncorrectableDesc("uncorrectable_tlp", "Count of uncorrectable poisoned TLPs")
	aerUncorrectableFCP              = newAerUncorrectableDesc("uncorrectable_fcp", "Count of uncorrectable flow control protocol errors")
	aerUncorrectableCmpltTO          = newAerUncorrectableDesc("uncorrectable_cmplt_to", "Count of uncorrectable completion timeouts")
	aerUncorrectableCmpltAbrt        = newAerUncorrectableDesc("uncorrectable_cmplt_abrt", "Count of uncorrectable completer aborts")
	aerUncorrectableUnxCmplt         = newAerUncorrectableDesc("uncorrectable_unx_cmplt", "Count of uncorrectable unexpected completion errors")
	aerUncorrectableRxOF             = newAerUncorrectableDesc("uncorrectable_rx_of", "Count of uncorrectable receiver overflows")
	aerUncorrectableMalfTLP          = newAerUncorrectableDesc("uncorrectable_malf_tlp", "Count of uncorrectable malformed TLPs")
	aerUncorrectableECRC             = newAerUncorrectableDesc("uncorrectable_ecrc", "Count of uncorrectable ECRCs")
	aerUncorrectableUnsupReq         = newAerUncorrectableDesc("uncorrectable_unsup_req", "Count of uncorrectable unsupported requests")
	aerUncorrectableACSViol          = newAerUncorrectableDesc("uncorrectable_acs_viol", "Count of uncorrectable ACS violations")
	aerUncorrectableUncorrIntErr     = newAerUncorrectableDesc("uncorrectable_uncorr_int_err", "Count of uncorrectable uncorrectable internal errors")
	aerUncorrectableBlockedTLP       = newAerUncorrectableDesc("uncorrectable_blocked_tlp", "Count of uncorrectable MC blocked TLPs")
	aerUncorrectableAtomicOpBlocked  = newAerUncorrectableDesc("uncorrectable_atomic_op_blocked", "Count of uncorrectable AtomicOp egress blocked errors")
	aerUncorrectableTLPBlockedErr    = newAerUncorrectableDesc("uncorrectable_tlp_blocked_err", "Count of uncorrectable TLP prefix blocked errors")
	aerUncorrectablePoisonTLPBlocked = newAerUncorrectableDesc("uncorrectable_poison_tlp_blocked", "Count of uncorrectable poison TLP prefix blocked errors")
)

// newAerCorrectableDesc builds a descriptor in the "aer" subsystem with the
// single variable label "interface".
func newAerCorrectableDesc(name, help string) *prometheus.Desc {
	return prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "aer", name),
		help,
		[]string{"interface"}, nil,
	)
}

// newAerUncorrectableDesc builds a descriptor in the "aer" subsystem with the
// variable labels "interface" and "fatal".
func newAerUncorrectableDesc(name, help string) *prometheus.Desc {
	return prometheus.NewDesc(
		prometheus.BuildFQName(namespace, "aer", name),
		help,
		[]string{"interface", "fatal"}, nil,
	)
}
// aerCollector bundles what the aer collector needs at scrape time: a sysfs
// handle, the compiled ignore pattern for device names, and a logger.
type aerCollector struct {
	// fs is the sysfs handle the AER counters are read from.
	fs sysfs.FS
	// ignoredDevicesPattern matches device names that Update skips.
	ignoredDevicesPattern *regexp.Regexp
	// logger receives the collector's log output.
	logger *slog.Logger
}
// init registers the collector under the name "aer" with the defaultDisabled
// state, using NewAerCollector as its factory.
func init() {
	registerCollector("aer", defaultDisabled, NewAerCollector)
}
// NewAerCollector returns a new Collector exposing aer stats.
//
// When construction fails the returned Collector is a literal nil. Returning
// the *aerCollector result directly would wrap a nil pointer in a non-nil
// interface value, so a caller comparing the Collector against nil would be
// misled.
func NewAerCollector(logger *slog.Logger) (Collector, error) {
	c, err := makeAerCollector(logger)
	if err != nil {
		return nil, err
	}
	return c, nil
}
// makeAerCollector builds the concrete aer collector.
//
// It opens sysfs at the configured sysPath and compiles the
// --collector.aer.ignored-devices regexp. An error is returned if sysfs
// cannot be opened or if the flag value is not a valid regular expression;
// an invalid user-supplied pattern is reported rather than causing a panic.
func makeAerCollector(logger *slog.Logger) (*aerCollector, error) {
	fs, err := sysfs.NewFS(*sysPath)
	if err != nil {
		return nil, fmt.Errorf("failed to open sysfs: %w", err)
	}

	if *aerIgnoredDevices != "" {
		logger.Info("Parsed flag --collector.aer.ignored-devices", "flag", *aerIgnoredDevices)
	}

	// The pattern comes straight from the command line, so compile it with
	// the error-returning variant instead of MustCompile.
	pattern, err := regexp.Compile(*aerIgnoredDevices)
	if err != nil {
		return nil, fmt.Errorf("invalid --collector.aer.ignored-devices pattern %q: %w", *aerIgnoredDevices, err)
	}

	return &aerCollector{
		fs:                    fs,
		ignoredDevicesPattern: pattern,
		logger:                logger,
	}, nil
}
// Update reads the AER counters through the sysfs handle and sends the
// correctable, fatal and non-fatal counter metrics for every device whose
// name does not match the ignore pattern.
//
// It returns ErrNoData when the counters cannot be read because the
// underlying files are missing or not readable, and a wrapped error for any
// other read failure.
func (c *aerCollector) Update(ch chan<- prometheus.Metric) error {
	counters, err := c.fs.AerCounters()
	if err != nil {
		if errors.Is(err, os.ErrNotExist) || errors.Is(err, os.ErrPermission) {
			c.logger.Debug("Could not read AER counters", "err", err)
			return ErrNoData
		}
		return fmt.Errorf("could not get AER counters: %w", err)
	}

	for deviceName, deviceCounters := range counters {
		if c.ignoredDevicesPattern.MatchString(deviceName) {
			continue
		}
		c.updateCorrectableCntrs(ch, deviceName, deviceCounters.Correctable)
		// Fatal and non-fatal counters share descriptors and are told apart
		// by the "fatal" label.
		c.updateUncorrectableCntrs(ch, deviceName, deviceCounters.Fatal, true)
		c.updateUncorrectableCntrs(ch, deviceName, deviceCounters.NonFatal, false)
	}
	return nil
}
// updateCorrectableCntrs sends one counter metric per correctable AER counter
// of the given device, labelled with the device name.
func (c *aerCollector) updateCorrectableCntrs(ch chan<- prometheus.Metric, deviceName string, counters sysfs.CorrectableAerCounters) {
	// Descriptor/value pairs, listed in emission order.
	samples := []struct {
		desc  *prometheus.Desc
		value float64
	}{
		{aerCorrectableRxErr, float64(counters.RxErr)},
		{aerCorrectableBadTLP, float64(counters.BadTLP)},
		{aerCorrectableBadDLLP, float64(counters.BadDLLP)},
		{aerCorrectableRollover, float64(counters.Rollover)},
		{aerCorrectableTimeout, float64(counters.Timeout)},
		{aerCorrectableNonFatalErr, float64(counters.NonFatalErr)},
		{aerCorrectableCorrIntErr, float64(counters.CorrIntErr)},
		{aerCorrectableHeaderOF, float64(counters.HeaderOF)},
	}

	for _, s := range samples {
		ch <- prometheus.MustNewConstMetric(s.desc, prometheus.CounterValue, s.value, deviceName)
	}
}
// updateUncorrectableCntrs sends one counter metric per uncorrectable AER
// counter of the given device. Each metric is labelled with the device name
// and with the fatal flag rendered as "true" or "false".
func (c *aerCollector) updateUncorrectableCntrs(ch chan<- prometheus.Metric, deviceName string, counters sysfs.UncorrectableAerCounters, fatal bool) {
	fatalLabel := strconv.FormatBool(fatal)

	// Descriptor/value pairs, listed in emission order.
	samples := []struct {
		desc  *prometheus.Desc
		value float64
	}{
		{aerUncorrectableUndefined, float64(counters.Undefined)},
		{aerUncorrectableDLP, float64(counters.DLP)},
		{aerUncorrectableSDES, float64(counters.SDES)},
		{aerUncorrectableTLP, float64(counters.TLP)},
		{aerUncorrectableFCP, float64(counters.FCP)},
		{aerUncorrectableCmpltTO, float64(counters.CmpltTO)},
		{aerUncorrectableCmpltAbrt, float64(counters.CmpltAbrt)},
		{aerUncorrectableUnxCmplt, float64(counters.UnxCmplt)},
		{aerUncorrectableRxOF, float64(counters.RxOF)},
		{aerUncorrectableMalfTLP, float64(counters.MalfTLP)},
		{aerUncorrectableECRC, float64(counters.ECRC)},
		{aerUncorrectableUnsupReq, float64(counters.UnsupReq)},
		{aerUncorrectableACSViol, float64(counters.ACSViol)},
		{aerUncorrectableUncorrIntErr, float64(counters.UncorrIntErr)},
		{aerUncorrectableBlockedTLP, float64(counters.BlockedTLP)},
		{aerUncorrectableAtomicOpBlocked, float64(counters.AtomicOpBlocked)},
		{aerUncorrectableTLPBlockedErr, float64(counters.TLPBlockedErr)},
		{aerUncorrectablePoisonTLPBlocked, float64(counters.PoisonTLPBlocked)},
	}

	for _, s := range samples {
		ch <- prometheus.MustNewConstMetric(s.desc, prometheus.CounterValue, s.value, deviceName, fatalLabel)
	}
}

2
go.mod
View file

@ -59,3 +59,5 @@ require (
google.golang.org/protobuf v1.34.2 // indirect google.golang.org/protobuf v1.34.2 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect
) )
replace github.com/prometheus/procfs => github.com/dasturiasArista/procfs v1.0.2

4
go.sum
View file

@ -12,6 +12,8 @@ github.com/cilium/ebpf v0.12.3 h1:8ht6F9MquybnY97at+VDZb3eQQr8ev79RueWeVaEcG4=
github.com/cilium/ebpf v0.12.3/go.mod h1:TctK1ivibvI3znr66ljgi4hqOT8EYQjz1KWBfb1UVgM= github.com/cilium/ebpf v0.12.3/go.mod h1:TctK1ivibvI3znr66ljgi4hqOT8EYQjz1KWBfb1UVgM=
github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/dasturiasArista/procfs v1.0.2 h1:vNFucKeBondLfu8afp7KAar4dkAO0GuoFEuuyt96ZZM=
github.com/dasturiasArista/procfs v1.0.2/go.mod h1:S2aFqsiJkGSmNV1vWOVVkGk3LOZXC6lHs6Gjq9u9v5g=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
@ -83,8 +85,6 @@ github.com/prometheus/common v0.60.1 h1:FUas6GcOw66yB/73KC+BOZoFJmbo/1pojoILArPA
github.com/prometheus/common v0.60.1/go.mod h1:h0LYf1R1deLSKtD4Vdg8gy4RuOvENW2J/h19V5NADQw= github.com/prometheus/common v0.60.1/go.mod h1:h0LYf1R1deLSKtD4Vdg8gy4RuOvENW2J/h19V5NADQw=
github.com/prometheus/exporter-toolkit v0.13.1 h1:Evsh0gWQo2bdOHlnz9+0Nm7/OFfIwhE2Ws4A2jIlR04= github.com/prometheus/exporter-toolkit v0.13.1 h1:Evsh0gWQo2bdOHlnz9+0Nm7/OFfIwhE2Ws4A2jIlR04=
github.com/prometheus/exporter-toolkit v0.13.1/go.mod h1:ujdv2YIOxtdFxxqtloLpbqmxd5J0Le6IITUvIRSWjj0= github.com/prometheus/exporter-toolkit v0.13.1/go.mod h1:ujdv2YIOxtdFxxqtloLpbqmxd5J0Le6IITUvIRSWjj0=
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/safchain/ethtool v0.4.1 h1:S6mEleTADqgynileXoiapt/nKnatyR6bmIHoF+h2ADo= github.com/safchain/ethtool v0.4.1 h1:S6mEleTADqgynileXoiapt/nKnatyR6bmIHoF+h2ADo=