Add Linux NUMA "numastat" metrics (#249)

* Add Linux NUMA "numastat" metrics
  Read the `numastat` metrics from /sys/devices/system/node/node* when reading NUMA meminfo metrics.
* Update end-to-end test output.
* Add `numastat` metrics as counters.
* Add tests for error conditions.
* Refactor meminfo numa metrics struct
* Refactor meminfoKey into a simple struct of metric data.
  This makes it easier to pass slices of metrics around.
* Refactor tests.
* Fixup: Add suggested fixes.
* Fixup: More fixes.
* Add another scanner.Err() return
* Add "_total" to counter metrics.
This commit is contained in:
Ben Kochie 2016-10-12 13:07:49 +02:00 committed by Tobias Schmidt
parent 081ecc5db0
commit c6162312f2
5 changed files with 158 additions and 26 deletions

View file

@ -1011,6 +1011,30 @@ node_memory_numa_Writeback{node="1"} 0
# TYPE node_memory_numa_WritebackTmp gauge
node_memory_numa_WritebackTmp{node="0"} 0
node_memory_numa_WritebackTmp{node="1"} 0
# HELP node_memory_numa_interleave_hit_total Memory information field interleave_hit_total.
# TYPE node_memory_numa_interleave_hit_total counter
node_memory_numa_interleave_hit_total{node="0"} 57146
node_memory_numa_interleave_hit_total{node="1"} 57286
# HELP node_memory_numa_local_node_total Memory information field local_node_total.
# TYPE node_memory_numa_local_node_total counter
node_memory_numa_local_node_total{node="0"} 1.93454780853e+11
node_memory_numa_local_node_total{node="1"} 3.2671904655e+11
# HELP node_memory_numa_numa_foreign_total Memory information field numa_foreign_total.
# TYPE node_memory_numa_numa_foreign_total counter
node_memory_numa_numa_foreign_total{node="0"} 5.98586233e+10
node_memory_numa_numa_foreign_total{node="1"} 1.2624528e+07
# HELP node_memory_numa_numa_hit_total Memory information field numa_hit_total.
# TYPE node_memory_numa_numa_hit_total counter
node_memory_numa_numa_hit_total{node="0"} 1.93460335812e+11
node_memory_numa_numa_hit_total{node="1"} 3.26720946761e+11
# HELP node_memory_numa_numa_miss_total Memory information field numa_miss_total.
# TYPE node_memory_numa_numa_miss_total counter
node_memory_numa_numa_miss_total{node="0"} 1.2624528e+07
node_memory_numa_numa_miss_total{node="1"} 5.9858626709e+10
# HELP node_memory_numa_other_node_total Memory information field other_node_total.
# TYPE node_memory_numa_other_node_total counter
node_memory_numa_other_node_total{node="0"} 1.8179487e+07
node_memory_numa_other_node_total{node="1"} 5.986052692e+10
# HELP node_net_bonding_slaves Number of configured slaves per bonding interface.
# TYPE node_net_bonding_slaves gauge
node_net_bonding_slaves{master="bond0"} 0

View file

@ -0,0 +1,6 @@
numa_hit 193460335812
numa_miss 12624528
numa_foreign 59858623300
interleave_hit 57146
local_node 193454780853
other_node 18179487

View file

@ -0,0 +1,6 @@
numa_hit 326720946761
numa_miss 59858626709
numa_foreign 12624528
interleave_hit 57286
local_node 326719046550
other_node 59860526920

View file

@ -33,8 +33,13 @@ const (
memInfoNumaSubsystem = "memory_numa"
)
type meminfoKey struct {
metricName, numaNode string
// meminfoNodeRE captures the digits that follow "devices/system/node/node"
// in a path; the first submatch is used as the NUMA node number.
var meminfoNodeRE = regexp.MustCompile(`.*devices/system/node/node([0-9]*)`)
// meminfoMetric is a single sample gathered for one NUMA node.
type meminfoMetric struct {
// metricName is the name the metric descriptor is built from.
metricName string
// metricType is the Prometheus value type the sample is exported with.
metricType prometheus.ValueType
// numaNode is the value used for the "node" label.
numaNode string
// value is the sample value.
value float64
}
type meminfoNumaCollector struct {
@ -54,53 +59,70 @@ func NewMeminfoNumaCollector() (Collector, error) {
}
// Update reads the NUMA memory metrics via getMemInfoNuma and sends one const
// metric per entry on ch, labelled with the entry's NUMA node.
// Descriptors are looked up in c.metricDescs by metric name and created on
// demand. An error from getMemInfoNuma is returned with added context.
func (c *meminfoNumaCollector) Update(ch chan<- prometheus.Metric) (err error) {
memInfoNuma, err := getMemInfoNuma()
metrics, err := getMemInfoNuma()
if err != nil {
return fmt.Errorf("couldn't get NUMA meminfo: %s", err)
}
for k, v := range memInfoNuma {
desc, ok := c.metricDescs[k.metricName]
for _, v := range metrics {
desc, ok := c.metricDescs[v.metricName]
// No cached descriptor for this metric name yet: build one with a single
// "node" label and remember it for later lookups.
if !ok {
desc = prometheus.NewDesc(
prometheus.BuildFQName(Namespace, memInfoNumaSubsystem, k.metricName),
fmt.Sprintf("Memory information field %s.", k.metricName),
prometheus.BuildFQName(Namespace, memInfoNumaSubsystem, v.metricName),
fmt.Sprintf("Memory information field %s.", v.metricName),
[]string{"node"}, nil)
c.metricDescs[k.metricName] = desc
c.metricDescs[v.metricName] = desc
}
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, v, k.numaNode)
ch <- prometheus.MustNewConstMetric(desc, v.metricType, v.value, v.numaNode)
}
return nil
}
// getMemInfoNuma gathers metrics for every path matching
// devices/system/node/node[0-9]* (resolved through sysFilePath).
// For each node directory it parses the "meminfo" file with parseMemInfoNuma
// and the "numastat" file with parseMemInfoNumaStat, accumulating the results.
// The first glob, open, or parse error aborts the collection and is returned
// with a nil result.
func getMemInfoNuma() (map[meminfoKey]float64, error) {
info := make(map[meminfoKey]float64)
func getMemInfoNuma() ([]meminfoMetric, error) {
var (
metrics []meminfoMetric
)
nodes, err := filepath.Glob(sysFilePath("devices/system/node/node[0-9]*"))
if err != nil {
return nil, err
}
for _, node := range nodes {
file, err := os.Open(path.Join(node, "meminfo"))
meminfoFile, err := os.Open(path.Join(node, "meminfo"))
if err != nil {
return nil, err
}
// NOTE(review): this defer sits inside the loop, so the file is closed when
// getMemInfoNuma returns rather than at the end of the iteration.
defer file.Close()
defer meminfoFile.Close()
numaInfo, err := parseMemInfoNuma(file)
numaInfo, err := parseMemInfoNuma(meminfoFile)
if err != nil {
return nil, err
}
for k, v := range numaInfo {
info[k] = v
metrics = append(metrics, numaInfo...)
numastatFile, err := os.Open(path.Join(node, "numastat"))
if err != nil {
return nil, err
}
// NOTE(review): same as above — deferred until function return.
defer numastatFile.Close()
// The node number is the first capture group of meminfoNodeRE applied to
// the node directory path; a non-matching path is reported as an error.
nodeNumber := meminfoNodeRE.FindStringSubmatch(node)
if nodeNumber == nil {
return nil, fmt.Errorf("device node string didn't match regexp: %s", node)
}
numaStat, err := parseMemInfoNumaStat(numastatFile, nodeNumber[1])
if err != nil {
return nil, err
}
metrics = append(metrics, numaStat...)
}
return info, nil
return metrics, nil
}
func parseMemInfoNuma(r io.Reader) (map[meminfoKey]float64, error) {
func parseMemInfoNuma(r io.Reader) ([]meminfoMetric, error) {
var (
memInfo = map[meminfoKey]float64{}
memInfo []meminfoMetric
scanner = bufio.NewScanner(r)
re = regexp.MustCompile("\\((.*)\\)")
)
@ -127,8 +149,34 @@ func parseMemInfoNuma(r io.Reader) (map[meminfoKey]float64, error) {
// Active(anon) -> Active_anon
metric = re.ReplaceAllString(metric, "_${1}")
memInfo[meminfoKey{metric, parts[1]}] = fv
memInfo = append(memInfo, meminfoMetric{metric, prometheus.GaugeValue, parts[1], fv})
}
return memInfo, nil
return memInfo, scanner.Err()
}
// parseMemInfoNumaStat reads "name value" lines from r and turns each into a
// counter metric named "<name>_total" for the given NUMA node.
//
// Blank lines are skipped. A line that does not split into exactly two
// whitespace-separated fields, or whose second field is not a valid float,
// makes the function return a nil slice and an error. Any error reported by
// the scanner itself is returned alongside the metrics collected so far.
func parseMemInfoNumaStat(r io.Reader, nodeNumber string) ([]meminfoMetric, error) {
	var stats []meminfoMetric

	sc := bufio.NewScanner(r)
	for sc.Scan() {
		text := strings.TrimSpace(sc.Text())
		if len(text) == 0 {
			continue
		}

		fields := strings.Fields(text)
		if len(fields) != 2 {
			return nil, fmt.Errorf("line scan did not return 2 fields: %s", text)
		}
		name, raw := fields[0], fields[1]

		val, err := strconv.ParseFloat(raw, 64)
		if err != nil {
			return nil, fmt.Errorf("invalid value in numastat: %s", err)
		}

		stats = append(stats, meminfoMetric{
			metricName: name + "_total",
			metricType: prometheus.CounterValue,
			numaNode:   nodeNumber,
			value:      val,
		})
	}
	return stats, sc.Err()
}

View file

@ -30,11 +30,15 @@ func TestMemInfoNuma(t *testing.T) {
t.Fatal(err)
}
if want, got := 707915776.0, memInfo[meminfoKey{"Active_anon", "0"}]; want != got {
t.Errorf("want memory Active(anon) %f, got %f", want, got)
if want, got := 707915776.0, memInfo[5].value; want != got {
t.Errorf("want memory Active(anon) value %f, got %f", want, got)
}
if want, got := 150994944.0, memInfo[meminfoKey{"AnonHugePages", "0"}]; want != got {
if want, got := "Active_anon", memInfo[5].metricName; want != got {
t.Errorf("want metric Active(anon) metricName %s, got %s", want, got)
}
if want, got := 150994944.0, memInfo[25].value; want != got {
t.Errorf("want memory AnonHugePages %f, got %f", want, got)
}
@ -49,11 +53,55 @@ func TestMemInfoNuma(t *testing.T) {
t.Fatal(err)
}
if want, got := 291930112.0, memInfo[meminfoKey{"Inactive_anon", "1"}]; want != got {
if want, got := 291930112.0, memInfo[6].value; want != got {
t.Errorf("want memory Inactive(anon) %f, got %f", want, got)
}
if want, got := 85585088512.0, memInfo[meminfoKey{"FilePages", "1"}]; want != got {
if want, got := 85585088512.0, memInfo[13].value; want != got {
t.Errorf("want memory FilePages %f, got %f", want, got)
}
}
// TestMemInfoNumaStat parses the numastat fixtures for node0 and node1 and
// verifies selected entries by their position in the returned slice.
//
// Each expectation is bounds-checked first so that a short parse result is
// reported as a test failure instead of an index-out-of-range panic, and each
// fixture file is closed as soon as it has been parsed.
func TestMemInfoNumaStat(t *testing.T) {
	type expectation struct {
		index      int
		metricName string
		value      float64
	}

	cases := []struct {
		path string
		node string
		want []expectation
	}{
		{
			path: "fixtures/sys/devices/system/node/node0/numastat",
			node: "0",
			want: []expectation{
				{0, "numa_hit_total", 193460335812.0},
				{4, "local_node_total", 193454780853.0},
			},
		},
		{
			path: "fixtures/sys/devices/system/node/node1/numastat",
			node: "1",
			want: []expectation{
				{1, "numa_miss_total", 59858626709.0},
				{5, "other_node_total", 59860526920.0},
			},
		},
	}

	for _, tc := range cases {
		file, err := os.Open(tc.path)
		if err != nil {
			t.Fatal(err)
		}
		numaStat, err := parseMemInfoNumaStat(file, tc.node)
		// Close right away rather than deferring inside the loop.
		file.Close()
		if err != nil {
			t.Fatal(err)
		}

		for _, w := range tc.want {
			if w.index >= len(numaStat) {
				t.Fatalf("%s: want at least %d numa stat entries, got %d", tc.path, w.index+1, len(numaStat))
			}
			got := numaStat[w.index]
			if got.value != w.value {
				t.Errorf("%s: want numa stat %s value %f, got %f", tc.path, w.metricName, w.value, got.value)
			}
			if got.metricName != w.metricName {
				t.Errorf("%s: want numa stat metricName %s, got %s", tc.path, w.metricName, got.metricName)
			}
		}
	}
}