Add prometheus_tsdb_clean_start metric (#8824)

Add cleanup of the lockfile when the db is cleanly closed

The metric describes the status of the lockfile on startup
0: Already existed
1: Did not exist
-1: Disabled

Therefore, if the min value over time of this metric is 0, that means that executions have exited uncleanly
We can then use that metric to have a much lower threshold on the crashlooping alert:

If the metric exists and it has been zero, two restarts is enough to trigger the alarm
If it does not exist (old prom version for example), the current five restarts threshold remains

Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>

* Change metric name + set unset value to -1

Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>

* Only check the last value of the clean start alert

Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>

* Fix test + nit

Signed-off-by: Julien Duchesne <julien.duchesne@grafana.com>
This commit is contained in:
Julien Duchesne 2021-06-16 05:33:02 -04:00 committed by GitHub
parent b59822904d
commit 8855c2e626
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 119 additions and 17 deletions

View file

@ -386,6 +386,21 @@
{
alert: 'PrometheusHAGroupCrashlooping',
expr: |||
(
prometheus_tsdb_clean_start{%(prometheusSelector)s} == 0
and
(
count by (%(prometheusHAGroupLabels)s) (
changes(process_start_time_seconds{%(prometheusSelector)s}[30m]) > 1
)
/
count by (%(prometheusHAGroupLabels)s) (
up{%(prometheusSelector)s}
)
)
> 0.5
)
or
(
count by (%(prometheusHAGroupLabels)s) (
changes(process_start_time_seconds{%(prometheusSelector)s}[30m]) > 4
@ -403,7 +418,7 @@
},
annotations: {
summary: 'More than half of the Prometheus instances within the same HA group are crashlooping.',
description: '{{ $value | humanizePercentage }} of Prometheus instances within the %(prometheusHAGroupName)s HA group have restarted at least 5 times in the last 30m.' % $._config,
description: '{{ $value | humanizePercentage }} of Prometheus instances within the %(prometheusHAGroupName)s HA group have had at least 5 total restarts or 2 unclean restarts in the last 30m.' % $._config,
},
},
],

View file

@ -58,6 +58,10 @@ const (
tmpForCreationBlockDirSuffix = ".tmp-for-creation"
// Pre-2.21 tmp dir suffix, used in clean-up functions.
tmpLegacy = ".tmp"
lockfileDisabled = -1
lockfileReplaced = 0
lockfileCreatedCleanly = 1
)
var (
@ -153,8 +157,9 @@ type BlocksToDeleteFunc func(blocks []*Block) map[ulid.ULID]struct{}
// DB handles reads and writes of time series falling into
// a hashed partition of a seriedb.
type DB struct {
dir string
lockf fileutil.Releaser
dir string
lockf fileutil.Releaser
lockfPath string
logger log.Logger
metrics *dbMetrics
@ -186,19 +191,20 @@ type DB struct {
}
type dbMetrics struct {
loadedBlocks prometheus.GaugeFunc
symbolTableSize prometheus.GaugeFunc
reloads prometheus.Counter
reloadsFailed prometheus.Counter
compactionsFailed prometheus.Counter
compactionsTriggered prometheus.Counter
compactionsSkipped prometheus.Counter
sizeRetentionCount prometheus.Counter
timeRetentionCount prometheus.Counter
startTime prometheus.GaugeFunc
tombCleanTimer prometheus.Histogram
blocksBytes prometheus.Gauge
maxBytes prometheus.Gauge
loadedBlocks prometheus.GaugeFunc
symbolTableSize prometheus.GaugeFunc
reloads prometheus.Counter
reloadsFailed prometheus.Counter
compactionsFailed prometheus.Counter
compactionsTriggered prometheus.Counter
compactionsSkipped prometheus.Counter
sizeRetentionCount prometheus.Counter
timeRetentionCount prometheus.Counter
startTime prometheus.GaugeFunc
tombCleanTimer prometheus.Histogram
blocksBytes prometheus.Gauge
maxBytes prometheus.Gauge
lockfileCreatedCleanly prometheus.Gauge
}
func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
@ -276,6 +282,10 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
Name: "prometheus_tsdb_size_retentions_total",
Help: "The number of times that blocks were deleted because the maximum number of bytes was exceeded.",
})
m.lockfileCreatedCleanly = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "prometheus_tsdb_clean_start",
Help: "-1: lockfile is disabled. 0: a lockfile from a previous execution was replaced. 1: lockfile creation was clean",
})
if r != nil {
r.MustRegister(
@ -292,6 +302,7 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
m.tombCleanTimer,
m.blocksBytes,
m.maxBytes,
m.lockfileCreatedCleanly,
)
}
return m
@ -653,12 +664,22 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
db.blocksToDelete = DefaultBlocksToDelete(db)
}
lockfileCreationStatus := lockfileDisabled
if !opts.NoLockfile {
absdir, err := filepath.Abs(dir)
if err != nil {
return nil, err
}
lockf, _, err := fileutil.Flock(filepath.Join(absdir, "lock"))
db.lockfPath = filepath.Join(absdir, "lock")
if _, err := os.Stat(db.lockfPath); err == nil {
level.Warn(db.logger).Log("msg", "A TSDB lockfile from a previous execution already existed. It was replaced", "file", db.lockfPath)
lockfileCreationStatus = lockfileReplaced
} else {
lockfileCreationStatus = lockfileCreatedCleanly
}
lockf, _, err := fileutil.Flock(db.lockfPath)
if err != nil {
return nil, errors.Wrap(err, "lock DB directory")
}
@ -707,6 +728,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
if maxBytes < 0 {
maxBytes = 0
}
db.metrics.lockfileCreatedCleanly.Set(float64(lockfileCreationStatus))
db.metrics.maxBytes.Set(float64(maxBytes))
if err := db.reload(); err != nil {
@ -1418,6 +1440,7 @@ func (db *DB) Close() error {
errs := tsdb_errors.NewMulti(g.Wait())
if db.lockf != nil {
errs.Add(db.lockf.Release())
errs.Add(os.Remove(db.lockfPath))
}
if db.head != nil {
errs.Add(db.head.Close())

View file

@ -3125,6 +3125,70 @@ func TestNoPanicOnTSDBOpenError(t *testing.T) {
require.NoError(t, lockf.Release())
}
func TestLockfileMetric(t *testing.T) {
cases := []struct {
fileAlreadyExists bool
lockFileDisabled bool
expectedValue int
}{
{
fileAlreadyExists: false,
lockFileDisabled: false,
expectedValue: lockfileCreatedCleanly,
},
{
fileAlreadyExists: true,
lockFileDisabled: false,
expectedValue: lockfileReplaced,
},
{
fileAlreadyExists: true,
lockFileDisabled: true,
expectedValue: lockfileDisabled,
},
{
fileAlreadyExists: false,
lockFileDisabled: true,
expectedValue: lockfileDisabled,
},
}
for _, c := range cases {
t.Run(fmt.Sprintf("%+v", c), func(t *testing.T) {
tmpdir, err := ioutil.TempDir("", "test")
require.NoError(t, err)
t.Cleanup(func() {
require.NoError(t, os.RemoveAll(tmpdir))
})
absdir, err := filepath.Abs(tmpdir)
require.NoError(t, err)
// Test preconditions (file already exists + lockfile option)
lockfilePath := filepath.Join(absdir, "lock")
if c.fileAlreadyExists {
err = ioutil.WriteFile(lockfilePath, []byte{}, 0644)
require.NoError(t, err)
}
opts := DefaultOptions()
opts.NoLockfile = c.lockFileDisabled
// Create the DB, this should create a lockfile and the metrics
db, err := Open(tmpdir, nil, nil, opts)
require.NoError(t, err)
require.Equal(t, float64(c.expectedValue), prom_testutil.ToFloat64(db.metrics.lockfileCreatedCleanly))
// Close the DB, this should delete the lockfile
require.NoError(t, db.Close())
// Check that the lockfile is always deleted
if !c.lockFileDisabled {
_, err = os.Stat(lockfilePath)
require.Error(t, err, "lockfile was not deleted")
}
})
}
}
func TestQuerier_ShouldNotPanicIfHeadChunkIsTruncatedWhileReadingQueriedChunks(t *testing.T) {
t.Skip("TODO: investigate why process crash in CI")