From 09b4305090a6af1e69c528539fba63fc5ff781c6 Mon Sep 17 00:00:00 2001
From: mknapphrt <39998367+mknapphrt@users.noreply.github.com>
Date: Sat, 14 Jul 2018 05:10:28 -0400
Subject: [PATCH] Changed the way that stuck mounts are handled. If a mount
 fails to return, it will stop being queried until it returns. (#997)

Fixed spelling mistakes.

Update transport_generic.go

Changed to a mutex approach instead of channels and added a timeout before declaring a mount stuck.

Removed unnecessary lock channel and clarified some var names.

Fixed style nits.

Signed-off-by: Mark Knapp <mknapp@hudson-trading.com>
---
 collector/filesystem_linux.go | 55 ++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/collector/filesystem_linux.go b/collector/filesystem_linux.go
index e434c04d..78e0aea0 100644
--- a/collector/filesystem_linux.go
+++ b/collector/filesystem_linux.go
@@ -19,7 +19,9 @@ import (
 	"bufio"
 	"os"
 	"strings"
+	"sync"
 	"syscall"
+	"time"
 
 	"github.com/prometheus/common/log"
 )
@@ -28,8 +30,12 @@ const (
 	defIgnoredMountPoints = "^/(dev|proc|sys|var/lib/docker)($|/)"
 	defIgnoredFSTypes     = "^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$"
 	readOnly              = 0x1 // ST_RDONLY
+	mountTimeout          = 30 * time.Second
 )
 
+var stuckMounts = make(map[string]struct{})
+var stuckMountsMtx = &sync.Mutex{}
+
 // GetStats returns filesystem stats.
 func (c *filesystemCollector) GetStats() ([]filesystemStats, error) {
 	mps, err := mountPointDetails()
@@ -46,9 +52,35 @@ func (c *filesystemCollector) GetStats() ([]filesystemStats, error) {
 			log.Debugf("Ignoring fs type: %s", labels.fsType)
 			continue
 		}
+		stuckMountsMtx.Lock()
+		if _, ok := stuckMounts[labels.mountPoint]; ok {
+			stats = append(stats, filesystemStats{
+				labels:      labels,
+				deviceError: 1,
+			})
+			log.Debugf("Mount point %q is in an unresponsive state", labels.mountPoint)
+			stuckMountsMtx.Unlock()
+			continue
+		}
+		stuckMountsMtx.Unlock()
+
+		// The success channel is used to tell the "watcher" that the stat
+		// finished successfully. The channel is closed on success.
+		success := make(chan struct{})
+		go stuckMountWatcher(labels.mountPoint, success)
 
 		buf := new(syscall.Statfs_t)
-		err := syscall.Statfs(labels.mountPoint, buf)
+		err = syscall.Statfs(labels.mountPoint, buf)
+
+		stuckMountsMtx.Lock()
+		close(success)
+		// If the mount has been marked as stuck, unmark it and log its recovery.
+		if _, ok := stuckMounts[labels.mountPoint]; ok {
+			log.Debugf("Mount point %q has recovered, monitoring will resume", labels.mountPoint)
+			delete(stuckMounts, labels.mountPoint)
+		}
+		stuckMountsMtx.Unlock()
+
 		if err != nil {
 			stats = append(stats, filesystemStats{
 				labels:      labels,
@@ -76,6 +108,27 @@ func (c *filesystemCollector) GetStats() ([]filesystemStats, error) {
 	return stats, nil
 }
 
+// stuckMountWatcher waits for either the success channel to be closed or
+// mountTimeout to elapse. If the channel closes first it does nothing;
+// otherwise it records mountPoint in stuckMounts while holding stuckMountsMtx.
+func stuckMountWatcher(mountPoint string, success chan struct{}) {
+	select {
+	case <-success:
+		// The channel was closed before the timeout; nothing to record.
+	case <-time.After(mountTimeout):
+		// Timed out: take the lock, then re-check success before marking the mount as stuck.
+		stuckMountsMtx.Lock()
+		select {
+		case <-success:
+			// Success came in just after the timeout was reached, so don't label the mount as stuck.
+		default:
+			log.Debugf("Mount point %q timed out, it is being labeled as stuck and will not be monitored", mountPoint)
+			stuckMounts[mountPoint] = struct{}{}
+		}
+		stuckMountsMtx.Unlock()
+	}
+}
+
 func mountPointDetails() ([]filesystemLabels, error) {
 	file, err := os.Open(procFilePath("mounts"))
 	if err != nil {