mirror of
https://github.com/prometheus/node_exporter.git
synced 2025-01-13 06:48:05 -08:00
Add btrfs device error stats (#2193)
* Improve metrics filesystem scanning logic * Makes ioctl syscalls to load the device error stats. * Adds filesystem mountpoint labels to existing metrics for ease of use. Signed-off-by: Marcus Cobden <leth@users.noreply.github.com>
This commit is contained in:
parent
d7e89e769d
commit
135710d8b5
|
@ -18,8 +18,13 @@ package collector
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"path"
|
||||
"strings"
|
||||
"syscall"
|
||||
|
||||
dennwc "github.com/dennwc/btrfs"
|
||||
"github.com/go-kit/log"
|
||||
"github.com/go-kit/log/level"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/procfs/btrfs"
|
||||
)
|
||||
|
@ -52,19 +57,163 @@ func NewBtrfsCollector(logger log.Logger) (Collector, error) {
|
|||
func (c *btrfsCollector) Update(ch chan<- prometheus.Metric) error {
|
||||
stats, err := c.fs.Stats()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to retrieve Btrfs stats: %w", err)
|
||||
return fmt.Errorf("failed to retrieve Btrfs stats from procfs: %w", err)
|
||||
}
|
||||
|
||||
ioctlStatsMap, err := c.getIoctlStats()
|
||||
if err != nil {
|
||||
level.Debug(c.logger).Log(
|
||||
"msg", "Error querying btrfs device stats with ioctl",
|
||||
"err", err)
|
||||
ioctlStatsMap = make(map[string]*btrfsIoctlFsStats)
|
||||
}
|
||||
|
||||
for _, s := range stats {
|
||||
c.updateBtrfsStats(ch, s)
|
||||
// match up procfs and ioctl info by filesystem UUID (without dashes)
|
||||
var fsUUID = strings.Replace(s.UUID, "-", "", -1)
|
||||
ioctlStats := ioctlStatsMap[fsUUID]
|
||||
c.updateBtrfsStats(ch, s, ioctlStats)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
type btrfsIoctlFsDevStats struct {
|
||||
path string
|
||||
uuid string
|
||||
|
||||
bytesUsed uint64
|
||||
totalBytes uint64
|
||||
|
||||
// The error stats below match the following upstream lists:
|
||||
// https://github.com/dennwc/btrfs/blob/b3db0b2dedac3bf580f412034d77e0bf4b420167/btrfs.go#L132-L140
|
||||
// https://github.com/torvalds/linux/blob/70d605cbeecb408dd884b1f0cd3963eeeaac144c/include/uapi/linux/btrfs.h#L680-L692
|
||||
writeErrs uint64
|
||||
readErrs uint64
|
||||
flushErrs uint64
|
||||
corruptionErrs uint64
|
||||
generationErrs uint64
|
||||
}
|
||||
|
||||
type btrfsIoctlFsStats struct {
|
||||
uuid string
|
||||
devices []btrfsIoctlFsDevStats
|
||||
}
|
||||
|
||||
func (c *btrfsCollector) getIoctlStats() (map[string]*btrfsIoctlFsStats, error) {
|
||||
// Instead of introducing more ioctl calls to scan for all btrfs
|
||||
// filesytems re-use our mount point utils to find known mounts
|
||||
mountsList, err := mountPointDetails(c.logger)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// track devices we have successfully scanned, by device path
|
||||
devicesDone := make(map[string]struct{})
|
||||
// filesystems scann results by UUID
|
||||
fsStats := make(map[string]*btrfsIoctlFsStats)
|
||||
|
||||
for _, mount := range mountsList {
|
||||
if mount.fsType != "btrfs" {
|
||||
continue
|
||||
}
|
||||
|
||||
if _, found := devicesDone[mount.device]; found {
|
||||
// We already found this filesystem by another mount point
|
||||
continue
|
||||
}
|
||||
|
||||
fs, err := dennwc.Open(mount.mountPoint, true)
|
||||
if err != nil {
|
||||
// failed to open this mount point, maybe we didn't have permission
|
||||
// maybe we'll find another mount point for this FS later
|
||||
level.Debug(c.logger).Log(
|
||||
"msg", "Error inspecting btrfs mountpoint",
|
||||
"mountPoint", mount.mountPoint,
|
||||
"err", err)
|
||||
continue
|
||||
}
|
||||
|
||||
fsInfo, err := fs.Info()
|
||||
if err != nil {
|
||||
// Failed to get the FS info for some reason,
|
||||
// perhaps it'll work with a different mount point
|
||||
level.Debug(c.logger).Log(
|
||||
"msg", "Error querying btrfs filesystem",
|
||||
"mountPoint", mount.mountPoint,
|
||||
"err", err)
|
||||
continue
|
||||
}
|
||||
|
||||
fsID := fsInfo.FSID.String()
|
||||
if _, found := fsStats[fsID]; found {
|
||||
// We already found this filesystem by another mount point
|
||||
continue
|
||||
}
|
||||
|
||||
deviceStats, err := c.getIoctlDeviceStats(fs, &fsInfo)
|
||||
if err != nil {
|
||||
level.Debug(c.logger).Log(
|
||||
"msg", "Error querying btrfs device stats",
|
||||
"mountPoint", mount.mountPoint,
|
||||
"err", err)
|
||||
continue
|
||||
}
|
||||
|
||||
devicesDone[mount.device] = struct{}{}
|
||||
fsStats[fsID] = &btrfsIoctlFsStats{
|
||||
uuid: fsID,
|
||||
devices: deviceStats,
|
||||
}
|
||||
}
|
||||
|
||||
return fsStats, nil
|
||||
}
|
||||
|
||||
func (c *btrfsCollector) getIoctlDeviceStats(fs *dennwc.FS, fsInfo *dennwc.Info) ([]btrfsIoctlFsDevStats, error) {
|
||||
devices := make([]btrfsIoctlFsDevStats, 0, fsInfo.NumDevices)
|
||||
|
||||
for i := uint64(0); i <= fsInfo.MaxID; i++ {
|
||||
deviceInfo, err := fs.GetDevInfo(i)
|
||||
|
||||
if err != nil {
|
||||
if errno, ok := err.(syscall.Errno); ok && errno == syscall.ENODEV {
|
||||
// device IDs do not consistently start at 0, nor are ranges contiguous, so we expect this
|
||||
continue
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
deviceStats, err := fs.GetDevStats(i)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
devices = append(devices, btrfsIoctlFsDevStats{
|
||||
path: deviceInfo.Path,
|
||||
uuid: deviceInfo.UUID.String(),
|
||||
bytesUsed: deviceInfo.BytesUsed,
|
||||
totalBytes: deviceInfo.TotalBytes,
|
||||
|
||||
writeErrs: deviceStats.WriteErrs,
|
||||
readErrs: deviceStats.ReadErrs,
|
||||
flushErrs: deviceStats.FlushErrs,
|
||||
corruptionErrs: deviceStats.CorruptionErrs,
|
||||
generationErrs: deviceStats.GenerationErrs,
|
||||
})
|
||||
|
||||
if uint64(len(devices)) == fsInfo.NumDevices {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return devices, nil
|
||||
}
|
||||
|
||||
// btrfsMetric represents a single Btrfs metric that is converted into a Prometheus Metric.
|
||||
type btrfsMetric struct {
|
||||
name string
|
||||
metricType prometheus.ValueType
|
||||
desc string
|
||||
value float64
|
||||
extraLabel []string
|
||||
|
@ -72,14 +221,14 @@ type btrfsMetric struct {
|
|||
}
|
||||
|
||||
// updateBtrfsStats collects statistics for one bcache ID.
|
||||
func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.Stats) {
|
||||
func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) {
|
||||
const subsystem = "btrfs"
|
||||
|
||||
// Basic information about the filesystem.
|
||||
devLabels := []string{"uuid"}
|
||||
|
||||
// Retrieve the metrics.
|
||||
metrics := c.getMetrics(s)
|
||||
metrics := c.getMetrics(s, ioctlStats)
|
||||
|
||||
// Convert all gathered metrics to Prometheus Metrics and add to channel.
|
||||
for _, m := range metrics {
|
||||
|
@ -99,7 +248,7 @@ func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.
|
|||
|
||||
ch <- prometheus.MustNewConstMetric(
|
||||
desc,
|
||||
prometheus.GaugeValue,
|
||||
m.metricType,
|
||||
m.value,
|
||||
labelValues...,
|
||||
)
|
||||
|
@ -107,38 +256,104 @@ func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.
|
|||
}
|
||||
|
||||
// getMetrics returns metrics for the given Btrfs statistics.
|
||||
func (c *btrfsCollector) getMetrics(s *btrfs.Stats) []btrfsMetric {
|
||||
func (c *btrfsCollector) getMetrics(s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) []btrfsMetric {
|
||||
metrics := []btrfsMetric{
|
||||
{
|
||||
name: "info",
|
||||
desc: "Filesystem information",
|
||||
value: 1,
|
||||
metricType: prometheus.GaugeValue,
|
||||
extraLabel: []string{"label"},
|
||||
extraLabelValue: []string{s.Label},
|
||||
},
|
||||
{
|
||||
name: "global_rsv_size_bytes",
|
||||
desc: "Size of global reserve.",
|
||||
value: float64(s.Allocation.GlobalRsvSize),
|
||||
name: "global_rsv_size_bytes",
|
||||
desc: "Size of global reserve.",
|
||||
metricType: prometheus.GaugeValue,
|
||||
value: float64(s.Allocation.GlobalRsvSize),
|
||||
},
|
||||
}
|
||||
|
||||
// Information about devices.
|
||||
for n, dev := range s.Devices {
|
||||
metrics = append(metrics, btrfsMetric{
|
||||
name: "device_size_bytes",
|
||||
desc: "Size of a device that is part of the filesystem.",
|
||||
value: float64(dev.Size),
|
||||
extraLabel: []string{"device"},
|
||||
extraLabelValue: []string{n},
|
||||
})
|
||||
}
|
||||
|
||||
// Information about data, metadata and system data.
|
||||
metrics = append(metrics, c.getAllocationStats("data", s.Allocation.Data)...)
|
||||
metrics = append(metrics, c.getAllocationStats("metadata", s.Allocation.Metadata)...)
|
||||
metrics = append(metrics, c.getAllocationStats("system", s.Allocation.System)...)
|
||||
|
||||
// Information about devices.
|
||||
if ioctlStats == nil {
|
||||
for n, dev := range s.Devices {
|
||||
metrics = append(metrics, btrfsMetric{
|
||||
name: "device_size_bytes",
|
||||
desc: "Size of a device that is part of the filesystem.",
|
||||
metricType: prometheus.GaugeValue,
|
||||
value: float64(dev.Size),
|
||||
extraLabel: []string{"device"},
|
||||
extraLabelValue: []string{n},
|
||||
})
|
||||
}
|
||||
return metrics
|
||||
}
|
||||
|
||||
for _, dev := range ioctlStats.devices {
|
||||
// trim the path prefix from the device name so the value should match
|
||||
// the value used in the fallback branch above.
|
||||
// e.g. /dev/sda -> sda, /rootfs/dev/md1 -> md1
|
||||
_, device := path.Split(dev.path)
|
||||
|
||||
extraLabels := []string{"device", "btrfs_dev_uuid"}
|
||||
extraLabelValues := []string{device, dev.uuid}
|
||||
|
||||
metrics = append(metrics,
|
||||
btrfsMetric{
|
||||
name: "device_size_bytes",
|
||||
desc: "Size of a device that is part of the filesystem.",
|
||||
metricType: prometheus.GaugeValue,
|
||||
value: float64(dev.totalBytes),
|
||||
extraLabel: extraLabels,
|
||||
extraLabelValue: extraLabelValues,
|
||||
},
|
||||
// A bytes available metric is probably more useful than a
|
||||
// bytes used metric, because large numbers of bytes will
|
||||
// suffer from floating point representation issues
|
||||
// and we probably care more about the number when it's low anyway
|
||||
btrfsMetric{
|
||||
name: "device_unused_bytes",
|
||||
desc: "Unused bytes unused on a device that is part of the filesystem.",
|
||||
metricType: prometheus.GaugeValue,
|
||||
value: float64(dev.totalBytes - dev.bytesUsed),
|
||||
extraLabel: extraLabels,
|
||||
extraLabelValue: extraLabelValues,
|
||||
})
|
||||
|
||||
errorLabels := append([]string{"type"}, extraLabels...)
|
||||
values := []uint64{
|
||||
dev.writeErrs,
|
||||
dev.readErrs,
|
||||
dev.flushErrs,
|
||||
dev.corruptionErrs,
|
||||
dev.generationErrs,
|
||||
}
|
||||
btrfsErrorTypeNames := []string{
|
||||
"write",
|
||||
"read",
|
||||
"flush",
|
||||
"corruption",
|
||||
"generation",
|
||||
}
|
||||
|
||||
for i, errorType := range btrfsErrorTypeNames {
|
||||
metrics = append(metrics,
|
||||
btrfsMetric{
|
||||
name: "device_errors_total",
|
||||
desc: "Errors reported for the device",
|
||||
metricType: prometheus.CounterValue,
|
||||
value: float64(values[i]),
|
||||
extraLabel: errorLabels,
|
||||
extraLabelValue: append([]string{errorType}, extraLabelValues...),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return metrics
|
||||
}
|
||||
|
||||
|
@ -148,6 +363,7 @@ func (c *btrfsCollector) getAllocationStats(a string, s *btrfs.AllocationStats)
|
|||
{
|
||||
name: "reserved_bytes",
|
||||
desc: "Amount of space reserved for a data type",
|
||||
metricType: prometheus.GaugeValue,
|
||||
value: float64(s.ReservedBytes),
|
||||
extraLabel: []string{"block_group_type"},
|
||||
extraLabelValue: []string{a},
|
||||
|
@ -168,6 +384,7 @@ func (c *btrfsCollector) getLayoutStats(a, l string, s *btrfs.LayoutUsage) []btr
|
|||
{
|
||||
name: "used_bytes",
|
||||
desc: "Amount of used space by a layout/data type",
|
||||
metricType: prometheus.GaugeValue,
|
||||
value: float64(s.UsedBytes),
|
||||
extraLabel: []string{"block_group_type", "mode"},
|
||||
extraLabelValue: []string{a, l},
|
||||
|
@ -175,6 +392,7 @@ func (c *btrfsCollector) getLayoutStats(a, l string, s *btrfs.LayoutUsage) []btr
|
|||
{
|
||||
name: "size_bytes",
|
||||
desc: "Amount of space allocated for a layout/data type",
|
||||
metricType: prometheus.GaugeValue,
|
||||
value: float64(s.TotalBytes),
|
||||
extraLabel: []string{"block_group_type", "mode"},
|
||||
extraLabelValue: []string{a, l},
|
||||
|
@ -182,6 +400,7 @@ func (c *btrfsCollector) getLayoutStats(a, l string, s *btrfs.LayoutUsage) []btr
|
|||
{
|
||||
name: "allocation_ratio",
|
||||
desc: "Data allocation ratio for a layout/data type",
|
||||
metricType: prometheus.GaugeValue,
|
||||
value: s.Ratio,
|
||||
extraLabel: []string{"block_group_type", "mode"},
|
||||
extraLabelValue: []string{a, l},
|
||||
|
|
|
@ -27,8 +27,6 @@ var expectedBtrfsMetrics = [][]btrfsMetric{
|
|||
{
|
||||
{name: "info", value: 1, extraLabel: []string{"label"}, extraLabelValue: []string{"fixture"}},
|
||||
{name: "global_rsv_size_bytes", value: 1.6777216e+07},
|
||||
{name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop25"}},
|
||||
{name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop26"}},
|
||||
{name: "reserved_bytes", value: 0, extraLabel: []string{"block_group_type"}, extraLabelValue: []string{"data"}},
|
||||
{name: "used_bytes", value: 8.08189952e+08, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"data", "raid0"}},
|
||||
{name: "size_bytes", value: 2.147483648e+09, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"data", "raid0"}},
|
||||
|
@ -41,14 +39,12 @@ var expectedBtrfsMetrics = [][]btrfsMetric{
|
|||
{name: "used_bytes", value: 16384, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid1"}},
|
||||
{name: "size_bytes", value: 8.388608e+06, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid1"}},
|
||||
{name: "allocation_ratio", value: 2, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid1"}},
|
||||
{name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop25"}},
|
||||
{name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop26"}},
|
||||
},
|
||||
{
|
||||
{name: "info", value: 1, extraLabel: []string{"label"}, extraLabelValue: []string{""}},
|
||||
{name: "global_rsv_size_bytes", value: 1.6777216e+07},
|
||||
{name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop22"}},
|
||||
{name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop23"}},
|
||||
{name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop24"}},
|
||||
{name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop25"}},
|
||||
{name: "reserved_bytes", value: 0, extraLabel: []string{"block_group_type"}, extraLabelValue: []string{"data"}},
|
||||
{name: "used_bytes", value: 0, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"data", "raid5"}},
|
||||
{name: "size_bytes", value: 6.44087808e+08, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"data", "raid5"}},
|
||||
|
@ -61,6 +57,10 @@ var expectedBtrfsMetrics = [][]btrfsMetric{
|
|||
{name: "used_bytes", value: 16384, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid6"}},
|
||||
{name: "size_bytes", value: 1.6777216e+07, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid6"}},
|
||||
{name: "allocation_ratio", value: 2, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid6"}},
|
||||
{name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop22"}},
|
||||
{name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop23"}},
|
||||
{name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop24"}},
|
||||
{name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop25"}},
|
||||
},
|
||||
}
|
||||
|
||||
|
@ -104,7 +104,7 @@ func TestBtrfs(t *testing.T) {
|
|||
}
|
||||
|
||||
for i, s := range stats {
|
||||
metrics := collector.getMetrics(s)
|
||||
metrics := collector.getMetrics(s, nil)
|
||||
if len(metrics) != len(expectedBtrfsMetrics[i]) {
|
||||
t.Fatalf("Unexpected number of Btrfs metrics: expected %v, got %v", len(expectedBtrfsMetrics[i]), len(metrics))
|
||||
}
|
||||
|
|
1
go.mod
1
go.mod
|
@ -5,6 +5,7 @@ go 1.17
|
|||
require (
|
||||
github.com/beevik/ntp v0.3.0
|
||||
github.com/coreos/go-systemd/v22 v22.3.2
|
||||
github.com/dennwc/btrfs v0.0.0-20220403080356-b3db0b2dedac
|
||||
github.com/ema/qdisc v0.0.0-20200603082823-62d0308e3e00
|
||||
github.com/go-kit/log v0.2.1
|
||||
github.com/godbus/dbus/v5 v5.1.0
|
||||
|
|
4
go.sum
4
go.sum
|
@ -63,6 +63,10 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3
|
|||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dennwc/btrfs v0.0.0-20220403080356-b3db0b2dedac h1:cjfIEKdg/lZ+ewJk8FYoYZTN2HY/okVSPtmHLaYMAvE=
|
||||
github.com/dennwc/btrfs v0.0.0-20220403080356-b3db0b2dedac/go.mod h1:MYsOV9Dgsec3FFSOjywi0QK5r6TeBbdWxdrMGtiYXHA=
|
||||
github.com/dennwc/ioctl v1.0.0 h1:DsWAAjIxRqNcLn9x6mwfuf2pet3iB7aK90K4tF16rLg=
|
||||
github.com/dennwc/ioctl v1.0.0/go.mod h1:ellh2YB5ldny99SBU/VX7Nq0xiZbHphf1DrtHxxjMk0=
|
||||
github.com/ema/qdisc v0.0.0-20200603082823-62d0308e3e00 h1:0GHzegkDz/zSrt+Zph1OueNImPdUxoToypnkhhRYTjI=
|
||||
github.com/ema/qdisc v0.0.0-20200603082823-62d0308e3e00/go.mod h1:ix4kG2zvdUd8kEKSW0ZTr1XLks0epFpI4j745DXxlNE=
|
||||
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
|
||||
|
|
Loading…
Reference in a new issue