From 882162d5b95ea23b26642192049e9590fc121242 Mon Sep 17 00:00:00 2001 From: Krasi Georgiev Date: Thu, 30 May 2019 13:57:28 +0200 Subject: [PATCH] fix the "failed compaction" metric. (#613) Signed-off-by: Krasi Georgiev --- compact.go | 9 --------- compact_test.go | 3 +++ db.go | 11 +++++++++++ 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/compact.go b/compact.go index c0948bbf3..4a56f585c 100644 --- a/compact.go +++ b/compact.go @@ -84,7 +84,6 @@ type LeveledCompactor struct { type compactorMetrics struct { ran prometheus.Counter populatingBlocks prometheus.Gauge - failed prometheus.Counter overlappingBlocks prometheus.Counter duration prometheus.Histogram chunkSize prometheus.Histogram @@ -103,10 +102,6 @@ func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics { Name: "prometheus_tsdb_compaction_populating_block", Help: "Set to 1 when a block is currently being written to the disk.", }) - m.failed = prometheus.NewCounter(prometheus.CounterOpts{ - Name: "prometheus_tsdb_compactions_failed_total", - Help: "Total number of compactions that failed for the partition.", - }) m.overlappingBlocks = prometheus.NewCounter(prometheus.CounterOpts{ Name: "prometheus_tsdb_vertical_compactions_total", Help: "Total number of compactions done on overlapping blocks.", @@ -136,7 +131,6 @@ func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics { r.MustRegister( m.ran, m.populatingBlocks, - m.failed, m.overlappingBlocks, m.duration, m.chunkRange, @@ -541,9 +535,6 @@ func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blocks ...BlockRe if err := os.RemoveAll(tmp); err != nil { level.Error(c.logger).Log("msg", "removed tmp folder after failed compaction", "err", err.Error()) } - if err != nil { - c.metrics.failed.Inc() - } c.metrics.ran.Inc() c.metrics.duration.Observe(time.Since(t).Seconds()) }(time.Now()) diff --git a/compact_test.go b/compact_test.go index 258047856..545dedbfc 100644 --- a/compact_test.go +++ b/compact_test.go @@ -1042,6 +1042,7 @@ func TestDeleteCompactionBlockAfterFailedReload(t *testing.T) { testutil.Equals(t, 0.0, prom_testutil.ToFloat64(db.metrics.reloadsFailed), "initial 'failed db reload' count metrics mismatch") testutil.Equals(t, 0.0, prom_testutil.ToFloat64(db.compactor.(*LeveledCompactor).metrics.ran), "initial `compactions` count metric mismatch") + testutil.Equals(t, 0.0, prom_testutil.ToFloat64(db.metrics.compactionsFailed), "initial `compactions failed` count metric mismatch") // Do the compaction and check the metrics. // Compaction should succeed, but the reload should fail and @@ -1049,6 +1050,8 @@ func TestDeleteCompactionBlockAfterFailedReload(t *testing.T) { testutil.NotOk(t, db.compact()) testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.metrics.reloadsFailed), "'failed db reload' count metrics mismatch") testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.compactor.(*LeveledCompactor).metrics.ran), "`compaction` count metric mismatch") + testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.metrics.compactionsFailed), "`compactions failed` count metric mismatch") + actBlocks, err = blockDirs(db.Dir()) testutil.Ok(t, err) testutil.Equals(t, expBlocks, len(actBlocks)-1, "block count should be the same as before the compaction") // -1 to exclude the corrupted block. diff --git a/db.go b/db.go index 52b21c2fd..f8e6f5232 100644 --- a/db.go +++ b/db.go @@ -147,6 +147,7 @@ type dbMetrics struct { reloads prometheus.Counter reloadsFailed prometheus.Counter compactionsTriggered prometheus.Counter + compactionsFailed prometheus.Counter timeRetentionCount prometheus.Counter compactionsSkipped prometheus.Counter startTime prometheus.GaugeFunc @@ -191,6 +192,10 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics { Name: "prometheus_tsdb_compactions_triggered_total", Help: "Total number of triggered compactions for the partition.", }) + m.compactionsFailed = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_compactions_failed_total", + Help: "Total number of compactions that failed for the partition.", + }) m.timeRetentionCount = prometheus.NewCounter(prometheus.CounterOpts{ Name: "prometheus_tsdb_time_retentions_total", Help: "The number of times that blocks were deleted because the maximum time limit was exceeded.", @@ -231,6 +236,7 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics { m.reloadsFailed, m.timeRetentionCount, m.compactionsTriggered, + m.compactionsFailed, m.startTime, m.tombCleanTimer, m.blocksBytes, @@ -411,6 +417,11 @@ func (a dbAppender) Commit() error { func (db *DB) compact() (err error) { db.cmtx.Lock() defer db.cmtx.Unlock() + defer func() { + if err != nil { + db.metrics.compactionsFailed.Inc() + } + }() // Check whether we have pending head blocks that are ready to be persisted. // They have the highest priority. for {