From c6162312f2891c67c16e167ce6a8f3961a30ab39 Mon Sep 17 00:00:00 2001 From: Ben Kochie Date: Wed, 12 Oct 2016 13:07:49 +0200 Subject: [PATCH] Add Linux NUMA "numastat" metrics (#249) * Add Linux NUMA "numastat" metrics Read the `numastat` metrics from /sys/devices/system/node/node* when reading NUMA meminfo metrics. * Update end-to-end test output. * Add `numastat` metrics as counters. * Add tests for error conditions. * Refactor meminfo numa metrics struct * Refactor meminfoKey into a simple struct of metric data. This makes it easier to pass slices of metrics around. * Refactor tests. * Fixup: Add suggested fixes. * Fixup: More fixes * Add another scanner.Err() return * Add "_total" to counter metrics. --- collector/fixtures/e2e-output.txt | 24 +++++ .../sys/devices/system/node/node0/numastat | 6 ++ .../sys/devices/system/node/node1/numastat | 6 ++ collector/meminfo_numa_linux.go | 90 ++++++++++++++----- collector/meminfo_numa_linux_test.go | 58 ++++++++++-- 5 files changed, 158 insertions(+), 26 deletions(-) create mode 100644 collector/fixtures/sys/devices/system/node/node0/numastat create mode 100644 collector/fixtures/sys/devices/system/node/node1/numastat diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index 54ff6795..73e9c08c 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -1011,6 +1011,30 @@ node_memory_numa_Writeback{node="1"} 0 # TYPE node_memory_numa_WritebackTmp gauge node_memory_numa_WritebackTmp{node="0"} 0 node_memory_numa_WritebackTmp{node="1"} 0 +# HELP node_memory_numa_interleave_hit_total Memory information field interleave_hit_total. +# TYPE node_memory_numa_interleave_hit_total counter +node_memory_numa_interleave_hit_total{node="0"} 57146 +node_memory_numa_interleave_hit_total{node="1"} 57286 +# HELP node_memory_numa_local_node_total Memory information field local_node_total. +# TYPE node_memory_numa_local_node_total counter +node_memory_numa_local_node_total{node="0"} 1.93454780853e+11 +node_memory_numa_local_node_total{node="1"} 3.2671904655e+11 +# HELP node_memory_numa_numa_foreign_total Memory information field numa_foreign_total. +# TYPE node_memory_numa_numa_foreign_total counter +node_memory_numa_numa_foreign_total{node="0"} 5.98586233e+10 +node_memory_numa_numa_foreign_total{node="1"} 1.2624528e+07 +# HELP node_memory_numa_numa_hit_total Memory information field numa_hit_total. +# TYPE node_memory_numa_numa_hit_total counter +node_memory_numa_numa_hit_total{node="0"} 1.93460335812e+11 +node_memory_numa_numa_hit_total{node="1"} 3.26720946761e+11 +# HELP node_memory_numa_numa_miss_total Memory information field numa_miss_total. +# TYPE node_memory_numa_numa_miss_total counter +node_memory_numa_numa_miss_total{node="0"} 1.2624528e+07 +node_memory_numa_numa_miss_total{node="1"} 5.9858626709e+10 +# HELP node_memory_numa_other_node_total Memory information field other_node_total. +# TYPE node_memory_numa_other_node_total counter +node_memory_numa_other_node_total{node="0"} 1.8179487e+07 +node_memory_numa_other_node_total{node="1"} 5.986052692e+10 # HELP node_net_bonding_slaves Number of configured slaves per bonding interface. # TYPE node_net_bonding_slaves gauge node_net_bonding_slaves{master="bond0"} 0 diff --git a/collector/fixtures/sys/devices/system/node/node0/numastat b/collector/fixtures/sys/devices/system/node/node0/numastat new file mode 100644 index 00000000..c17b6426 --- /dev/null +++ b/collector/fixtures/sys/devices/system/node/node0/numastat @@ -0,0 +1,6 @@ +numa_hit 193460335812 +numa_miss 12624528 +numa_foreign 59858623300 +interleave_hit 57146 +local_node 193454780853 +other_node 18179487 diff --git a/collector/fixtures/sys/devices/system/node/node1/numastat b/collector/fixtures/sys/devices/system/node/node1/numastat new file mode 100644 index 00000000..3187db14 --- /dev/null +++ b/collector/fixtures/sys/devices/system/node/node1/numastat @@ -0,0 +1,6 @@ +numa_hit 326720946761 +numa_miss 59858626709 +numa_foreign 12624528 +interleave_hit 57286 +local_node 326719046550 +other_node 59860526920 diff --git a/collector/meminfo_numa_linux.go b/collector/meminfo_numa_linux.go index 81466abf..f6310c65 100644 --- a/collector/meminfo_numa_linux.go +++ b/collector/meminfo_numa_linux.go @@ -33,8 +33,13 @@ const ( memInfoNumaSubsystem = "memory_numa" ) -type meminfoKey struct { - metricName, numaNode string +var meminfoNodeRE = regexp.MustCompile(`.*devices/system/node/node([0-9]*)`) + +type meminfoMetric struct { + metricName string + metricType prometheus.ValueType + numaNode string + value float64 } type meminfoNumaCollector struct { @@ -54,53 +59,70 @@ func NewMeminfoNumaCollector() (Collector, error) { } func (c *meminfoNumaCollector) Update(ch chan<- prometheus.Metric) (err error) { - memInfoNuma, err := getMemInfoNuma() + metrics, err := getMemInfoNuma() if err != nil { return fmt.Errorf("couldn't get NUMA meminfo: %s", err) } - for k, v := range memInfoNuma { - desc, ok := c.metricDescs[k.metricName] + for _, v := range metrics { + desc, ok := c.metricDescs[v.metricName] if !ok { desc = prometheus.NewDesc( - prometheus.BuildFQName(Namespace, memInfoNumaSubsystem, k.metricName), - fmt.Sprintf("Memory information field %s.", k.metricName), + prometheus.BuildFQName(Namespace, memInfoNumaSubsystem, v.metricName), + fmt.Sprintf("Memory information field %s.", v.metricName), []string{"node"}, nil) - c.metricDescs[k.metricName] = desc + c.metricDescs[v.metricName] = desc } - ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, v, k.numaNode) + ch <- prometheus.MustNewConstMetric(desc, v.metricType, v.value, v.numaNode) } return nil } -func getMemInfoNuma() (map[meminfoKey]float64, error) { - info := make(map[meminfoKey]float64) +func getMemInfoNuma() ([]meminfoMetric, error) { + var ( + metrics []meminfoMetric + ) nodes, err := filepath.Glob(sysFilePath("devices/system/node/node[0-9]*")) if err != nil { return nil, err } for _, node := range nodes { - file, err := os.Open(path.Join(node, "meminfo")) + meminfoFile, err := os.Open(path.Join(node, "meminfo")) if err != nil { return nil, err } - defer file.Close() + defer meminfoFile.Close() - numaInfo, err := parseMemInfoNuma(file) + numaInfo, err := parseMemInfoNuma(meminfoFile) if err != nil { return nil, err } - for k, v := range numaInfo { - info[k] = v + metrics = append(metrics, numaInfo...) + + numastatFile, err := os.Open(path.Join(node, "numastat")) + if err != nil { + return nil, err } + defer numastatFile.Close() + + nodeNumber := meminfoNodeRE.FindStringSubmatch(node) + if nodeNumber == nil { + return nil, fmt.Errorf("device node string didn't match regexp: %s", node) + } + + numaStat, err := parseMemInfoNumaStat(numastatFile, nodeNumber[1]) + if err != nil { + return nil, err + } + metrics = append(metrics, numaStat...) } - return info, nil + return metrics, nil } -func parseMemInfoNuma(r io.Reader) (map[meminfoKey]float64, error) { +func parseMemInfoNuma(r io.Reader) ([]meminfoMetric, error) { var ( - memInfo = map[meminfoKey]float64{} + memInfo []meminfoMetric scanner = bufio.NewScanner(r) re = regexp.MustCompile("\\((.*)\\)") ) @@ -127,8 +149,34 @@ func parseMemInfoNuma(r io.Reader) (map[meminfoKey]float64, error) { // Active(anon) -> Active_anon metric = re.ReplaceAllString(metric, "_${1}") - memInfo[meminfoKey{metric, parts[1]}] = fv + memInfo = append(memInfo, meminfoMetric{metric, prometheus.GaugeValue, parts[1], fv}) } - return memInfo, nil + return memInfo, scanner.Err() +} + +func parseMemInfoNumaStat(r io.Reader, nodeNumber string) ([]meminfoMetric, error) { + var ( + numaStat []meminfoMetric + scanner = bufio.NewScanner(r) + ) + + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + parts := strings.Fields(string(line)) + if len(parts) != 2 { + return nil, fmt.Errorf("line scan did not return 2 fields: %s", line) + } + + fv, err := strconv.ParseFloat(parts[1], 64) + if err != nil { + return nil, fmt.Errorf("invalid value in numastat: %s", err) + } + + numaStat = append(numaStat, meminfoMetric{parts[0] + "_total", prometheus.CounterValue, nodeNumber, fv}) + } + return numaStat, scanner.Err() } diff --git a/collector/meminfo_numa_linux_test.go b/collector/meminfo_numa_linux_test.go index 26779bc5..a17714e8 100644 --- a/collector/meminfo_numa_linux_test.go +++ b/collector/meminfo_numa_linux_test.go @@ -30,11 +30,15 @@ func TestMemInfoNuma(t *testing.T) { t.Fatal(err) } - if want, got := 707915776.0, memInfo[meminfoKey{"Active_anon", "0"}]; want != got { - t.Errorf("want memory Active(anon) %f, got %f", want, got) + if want, got := 707915776.0, memInfo[5].value; want != got { + t.Errorf("want memory Active(anon) value %f, got %f", want, got) } - if want, got := 150994944.0, memInfo[meminfoKey{"AnonHugePages", "0"}]; want != got { + if want, got := "Active_anon", memInfo[5].metricName; want != got { + t.Errorf("want metric Active(anon) metricName %s, got %s", want, got) + } + + if want, got := 150994944.0, memInfo[25].value; want != got { t.Errorf("want memory AnonHugePages %f, got %f", want, got) } @@ -49,11 +53,55 @@ func TestMemInfoNuma(t *testing.T) { t.Fatal(err) } - if want, got := 291930112.0, memInfo[meminfoKey{"Inactive_anon", "1"}]; want != got { + if want, got := 291930112.0, memInfo[6].value; want != got { t.Errorf("want memory Inactive(anon) %f, got %f", want, got) } - if want, got := 85585088512.0, memInfo[meminfoKey{"FilePages", "1"}]; want != got { + if want, got := 85585088512.0, memInfo[13].value; want != got { t.Errorf("want memory FilePages %f, got %f", want, got) } } + +func TestMemInfoNumaStat(t *testing.T) { + file, err := os.Open("fixtures/sys/devices/system/node/node0/numastat") + if err != nil { + t.Fatal(err) + } + defer file.Close() + + numaStat, err := parseMemInfoNumaStat(file, "0") + if err != nil { + t.Fatal(err) + } + + if want, got := 193460335812.0, numaStat[0].value; want != got { + t.Errorf("want numa stat numa_hit value %f, got %f", want, got) + } + + if want, got := "numa_hit_total", numaStat[0].metricName; want != got { + t.Errorf("want numa stat numa_hit metricName %s, got %s", want, got) + } + + if want, got := 193454780853.0, numaStat[4].value; want != got { + t.Errorf("want numa stat local_node %f, got %f", want, got) + } + + file, err = os.Open("fixtures/sys/devices/system/node/node1/numastat") + if err != nil { + t.Fatal(err) + } + defer file.Close() + + numaStat, err = parseMemInfoNumaStat(file, "1") + if err != nil { + t.Fatal(err) + } + + if want, got := 59858626709.0, numaStat[1].value; want != got { + t.Errorf("want numa stat numa_miss %f, got %f", want, got) + } + + if want, got := 59860526920.0, numaStat[5].value; want != got { + t.Errorf("want numa stat other_node %f, got %f", want, got) + } +}