node_exporter/collector/filesystem_linux.go
Ben Kochie 4abc6fba7d
Add fallback for missing /proc/1/mounts (#1172)
* Add fallback for missing /proc/1/mounts

On some systems, `/proc/1/mounts` is hidden from non-root users due to
the `hidepid` procfs feature. Attempt to fallback to `/proc/mounts` if
`/proc/1/mounts` is not found.

Signed-off-by: Ben Kochie <superq@gmail.com>

* Add tests.

Signed-off-by: Ben Kochie <superq@gmail.com>

* Add CHANGELOG entry.

Signed-off-by: Ben Kochie <superq@gmail.com>
2018-11-30 14:01:55 +01:00

177 lines
5.1 KiB
Go

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !nofilesystem
package collector
import (
"bufio"
"fmt"
"io"
"os"
"strings"
"sync"
"syscall"
"time"
"github.com/prometheus/common/log"
)
const (
defIgnoredMountPoints = "^/(dev|proc|sys|var/lib/docker/.+)($|/)"
defIgnoredFSTypes = "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$"
mountTimeout = 30 * time.Second
)
var stuckMounts = make(map[string]struct{})
var stuckMountsMtx = &sync.Mutex{}
// GetStats returns filesystem stats.
func (c *filesystemCollector) GetStats() ([]filesystemStats, error) {
mps, err := mountPointDetails()
if err != nil {
return nil, err
}
stats := []filesystemStats{}
for _, labels := range mps {
if c.ignoredMountPointsPattern.MatchString(labels.mountPoint) {
log.Debugf("Ignoring mount point: %s", labels.mountPoint)
continue
}
if c.ignoredFSTypesPattern.MatchString(labels.fsType) {
log.Debugf("Ignoring fs type: %s", labels.fsType)
continue
}
stuckMountsMtx.Lock()
if _, ok := stuckMounts[labels.mountPoint]; ok {
stats = append(stats, filesystemStats{
labels: labels,
deviceError: 1,
})
log.Debugf("Mount point %q is in an unresponsive state", labels.mountPoint)
stuckMountsMtx.Unlock()
continue
}
stuckMountsMtx.Unlock()
// The success channel is used do tell the "watcher" that the stat
// finished successfully. The channel is closed on success.
success := make(chan struct{})
go stuckMountWatcher(labels.mountPoint, success)
buf := new(syscall.Statfs_t)
err = syscall.Statfs(rootfsFilePath(labels.mountPoint), buf)
stuckMountsMtx.Lock()
close(success)
// If the mount has been marked as stuck, unmark it and log it's recovery.
if _, ok := stuckMounts[labels.mountPoint]; ok {
log.Debugf("Mount point %q has recovered, monitoring will resume", labels.mountPoint)
delete(stuckMounts, labels.mountPoint)
}
stuckMountsMtx.Unlock()
if err != nil {
stats = append(stats, filesystemStats{
labels: labels,
deviceError: 1,
})
log.Debugf("Error on statfs() system call for %q: %s", rootfsFilePath(labels.mountPoint), err)
continue
}
var ro float64
for _, option := range strings.Split(labels.options, ",") {
if option == "ro" {
ro = 1
break
}
}
stats = append(stats, filesystemStats{
labels: labels,
size: float64(buf.Blocks) * float64(buf.Bsize),
free: float64(buf.Bfree) * float64(buf.Bsize),
avail: float64(buf.Bavail) * float64(buf.Bsize),
files: float64(buf.Files),
filesFree: float64(buf.Ffree),
ro: ro,
})
}
return stats, nil
}
// stuckMountWatcher listens on the given success channel and if the channel closes
// then the watcher does nothing. If instead the timeout is reached, the
// mount point that is being watched is marked as stuck.
func stuckMountWatcher(mountPoint string, success chan struct{}) {
select {
case <-success:
// Success
case <-time.After(mountTimeout):
// Timed out, mark mount as stuck
stuckMountsMtx.Lock()
select {
case <-success:
// Success came in just after the timeout was reached, don't label the mount as stuck
default:
log.Debugf("Mount point %q timed out, it is being labeled as stuck and will not be monitored", mountPoint)
stuckMounts[mountPoint] = struct{}{}
}
stuckMountsMtx.Unlock()
}
}
func mountPointDetails() ([]filesystemLabels, error) {
file, err := os.Open(procFilePath("1/mounts"))
if os.IsNotExist(err) {
// Fallback to `/proc/mounts` if `/proc/1/mounts` is missing due hidepid.
log.Debugf("Got %q reading root mounts, falling back to system mounts", err)
file, err = os.Open(procFilePath("mounts"))
}
if err != nil {
return nil, err
}
defer file.Close()
return parseFilesystemLabels(file)
}
func parseFilesystemLabels(r io.Reader) ([]filesystemLabels, error) {
var filesystems []filesystemLabels
scanner := bufio.NewScanner(r)
for scanner.Scan() {
parts := strings.Fields(scanner.Text())
if len(parts) < 4 {
return nil, fmt.Errorf("malformed mount point information: %q", scanner.Text())
}
// Ensure we handle the translation of \040 and \011
// as per fstab(5).
parts[1] = strings.Replace(parts[1], "\\040", " ", -1)
parts[1] = strings.Replace(parts[1], "\\011", "\t", -1)
filesystems = append(filesystems, filesystemLabels{
device: parts[0],
mountPoint: parts[1],
fsType: parts[2],
options: parts[3],
})
}
return filesystems, scanner.Err()
}