fix the "failed compaction" metric. (#613)

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
This commit is contained in:
Krasi Georgiev 2019-05-30 13:57:28 +02:00 committed by GitHub
parent 13c80a5979
commit 882162d5b9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 14 additions and 9 deletions

View file

@@ -84,7 +84,6 @@ type LeveledCompactor struct {
 type compactorMetrics struct {
 	ran               prometheus.Counter
 	populatingBlocks  prometheus.Gauge
-	failed            prometheus.Counter
 	overlappingBlocks prometheus.Counter
 	duration          prometheus.Histogram
 	chunkSize         prometheus.Histogram
@@ -103,10 +102,6 @@ func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics {
 		Name: "prometheus_tsdb_compaction_populating_block",
 		Help: "Set to 1 when a block is currently being written to the disk.",
 	})
-	m.failed = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "prometheus_tsdb_compactions_failed_total",
-		Help: "Total number of compactions that failed for the partition.",
-	})
 	m.overlappingBlocks = prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "prometheus_tsdb_vertical_compactions_total",
 		Help: "Total number of compactions done on overlapping blocks.",
@@ -136,7 +131,6 @@ func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics {
 		r.MustRegister(
 			m.ran,
 			m.populatingBlocks,
-			m.failed,
 			m.overlappingBlocks,
 			m.duration,
 			m.chunkRange,
@@ -541,9 +535,6 @@ func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blocks ...BlockRe
 			if err := os.RemoveAll(tmp); err != nil {
 				level.Error(c.logger).Log("msg", "removed tmp folder after failed compaction", "err", err.Error())
 			}
-			if err != nil {
-				c.metrics.failed.Inc()
-			}
 			c.metrics.ran.Inc()
 			c.metrics.duration.Observe(time.Since(t).Seconds())
 		}(time.Now())

View file

@@ -1042,6 +1042,7 @@ func TestDeleteCompactionBlockAfterFailedReload(t *testing.T) {
 	testutil.Equals(t, 0.0, prom_testutil.ToFloat64(db.metrics.reloadsFailed), "initial 'failed db reload' count metrics mismatch")
 	testutil.Equals(t, 0.0, prom_testutil.ToFloat64(db.compactor.(*LeveledCompactor).metrics.ran), "initial `compactions` count metric mismatch")
+	testutil.Equals(t, 0.0, prom_testutil.ToFloat64(db.metrics.compactionsFailed), "initial `compactions failed` count metric mismatch")

 	// Do the compaction and check the metrics.
 	// Compaction should succeed, but the reload should fail and
@@ -1049,6 +1050,8 @@ func TestDeleteCompactionBlockAfterFailedReload(t *testing.T) {
 	testutil.NotOk(t, db.compact())
 	testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.metrics.reloadsFailed), "'failed db reload' count metrics mismatch")
 	testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.compactor.(*LeveledCompactor).metrics.ran), "`compaction` count metric mismatch")
+	testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.metrics.compactionsFailed), "`compactions failed` count metric mismatch")
+
 	actBlocks, err = blockDirs(db.Dir())
 	testutil.Ok(t, err)
 	testutil.Equals(t, expBlocks, len(actBlocks)-1, "block count should be the same as before the compaction") // -1 to exclude the corrupted block.

11
db.go
View file

@@ -147,6 +147,7 @@ type dbMetrics struct {
 	reloads              prometheus.Counter
 	reloadsFailed        prometheus.Counter
 	compactionsTriggered prometheus.Counter
+	compactionsFailed    prometheus.Counter
 	timeRetentionCount   prometheus.Counter
 	compactionsSkipped   prometheus.Counter
 	startTime            prometheus.GaugeFunc
@@ -191,6 +192,10 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
 		Name: "prometheus_tsdb_compactions_triggered_total",
 		Help: "Total number of triggered compactions for the partition.",
 	})
+	m.compactionsFailed = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "prometheus_tsdb_compactions_failed_total",
+		Help: "Total number of compactions that failed for the partition.",
+	})
 	m.timeRetentionCount = prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "prometheus_tsdb_time_retentions_total",
 		Help: "The number of times that blocks were deleted because the maximum time limit was exceeded.",
@@ -231,6 +236,7 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
 			m.reloadsFailed,
 			m.timeRetentionCount,
 			m.compactionsTriggered,
+			m.compactionsFailed,
 			m.startTime,
 			m.tombCleanTimer,
 			m.blocksBytes,
@@ -411,6 +417,11 @@ func (a dbAppender) Commit() error {
 func (db *DB) compact() (err error) {
 	db.cmtx.Lock()
 	defer db.cmtx.Unlock()
+	defer func() {
+		if err != nil {
+			db.metrics.compactionsFailed.Inc()
+		}
+	}()
 	// Check whether we have pending head blocks that are ready to be persisted.
 	// They have the highest priority.
 	for {