mirror of
https://github.com/prometheus/prometheus.git
synced 2024-11-09 23:24:05 -08:00
fix the "failed compaction" metric. (#613)
Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
This commit is contained in:
parent
13c80a5979
commit
882162d5b9
|
@ -84,7 +84,6 @@ type LeveledCompactor struct {
|
||||||
type compactorMetrics struct {
|
type compactorMetrics struct {
|
||||||
ran prometheus.Counter
|
ran prometheus.Counter
|
||||||
populatingBlocks prometheus.Gauge
|
populatingBlocks prometheus.Gauge
|
||||||
failed prometheus.Counter
|
|
||||||
overlappingBlocks prometheus.Counter
|
overlappingBlocks prometheus.Counter
|
||||||
duration prometheus.Histogram
|
duration prometheus.Histogram
|
||||||
chunkSize prometheus.Histogram
|
chunkSize prometheus.Histogram
|
||||||
|
@ -103,10 +102,6 @@ func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics {
|
||||||
Name: "prometheus_tsdb_compaction_populating_block",
|
Name: "prometheus_tsdb_compaction_populating_block",
|
||||||
Help: "Set to 1 when a block is currently being written to the disk.",
|
Help: "Set to 1 when a block is currently being written to the disk.",
|
||||||
})
|
})
|
||||||
m.failed = prometheus.NewCounter(prometheus.CounterOpts{
|
|
||||||
Name: "prometheus_tsdb_compactions_failed_total",
|
|
||||||
Help: "Total number of compactions that failed for the partition.",
|
|
||||||
})
|
|
||||||
m.overlappingBlocks = prometheus.NewCounter(prometheus.CounterOpts{
|
m.overlappingBlocks = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
Name: "prometheus_tsdb_vertical_compactions_total",
|
Name: "prometheus_tsdb_vertical_compactions_total",
|
||||||
Help: "Total number of compactions done on overlapping blocks.",
|
Help: "Total number of compactions done on overlapping blocks.",
|
||||||
|
@ -136,7 +131,6 @@ func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics {
|
||||||
r.MustRegister(
|
r.MustRegister(
|
||||||
m.ran,
|
m.ran,
|
||||||
m.populatingBlocks,
|
m.populatingBlocks,
|
||||||
m.failed,
|
|
||||||
m.overlappingBlocks,
|
m.overlappingBlocks,
|
||||||
m.duration,
|
m.duration,
|
||||||
m.chunkRange,
|
m.chunkRange,
|
||||||
|
@ -541,9 +535,6 @@ func (c *LeveledCompactor) write(dest string, meta *BlockMeta, blocks ...BlockRe
|
||||||
if err := os.RemoveAll(tmp); err != nil {
|
if err := os.RemoveAll(tmp); err != nil {
|
||||||
level.Error(c.logger).Log("msg", "removed tmp folder after failed compaction", "err", err.Error())
|
level.Error(c.logger).Log("msg", "removed tmp folder after failed compaction", "err", err.Error())
|
||||||
}
|
}
|
||||||
if err != nil {
|
|
||||||
c.metrics.failed.Inc()
|
|
||||||
}
|
|
||||||
c.metrics.ran.Inc()
|
c.metrics.ran.Inc()
|
||||||
c.metrics.duration.Observe(time.Since(t).Seconds())
|
c.metrics.duration.Observe(time.Since(t).Seconds())
|
||||||
}(time.Now())
|
}(time.Now())
|
||||||
|
|
|
@ -1042,6 +1042,7 @@ func TestDeleteCompactionBlockAfterFailedReload(t *testing.T) {
|
||||||
|
|
||||||
testutil.Equals(t, 0.0, prom_testutil.ToFloat64(db.metrics.reloadsFailed), "initial 'failed db reload' count metrics mismatch")
|
testutil.Equals(t, 0.0, prom_testutil.ToFloat64(db.metrics.reloadsFailed), "initial 'failed db reload' count metrics mismatch")
|
||||||
testutil.Equals(t, 0.0, prom_testutil.ToFloat64(db.compactor.(*LeveledCompactor).metrics.ran), "initial `compactions` count metric mismatch")
|
testutil.Equals(t, 0.0, prom_testutil.ToFloat64(db.compactor.(*LeveledCompactor).metrics.ran), "initial `compactions` count metric mismatch")
|
||||||
|
testutil.Equals(t, 0.0, prom_testutil.ToFloat64(db.metrics.compactionsFailed), "initial `compactions failed` count metric mismatch")
|
||||||
|
|
||||||
// Do the compaction and check the metrics.
|
// Do the compaction and check the metrics.
|
||||||
// Compaction should succeed, but the reload should fail and
|
// Compaction should succeed, but the reload should fail and
|
||||||
|
@ -1049,6 +1050,8 @@ func TestDeleteCompactionBlockAfterFailedReload(t *testing.T) {
|
||||||
testutil.NotOk(t, db.compact())
|
testutil.NotOk(t, db.compact())
|
||||||
testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.metrics.reloadsFailed), "'failed db reload' count metrics mismatch")
|
testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.metrics.reloadsFailed), "'failed db reload' count metrics mismatch")
|
||||||
testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.compactor.(*LeveledCompactor).metrics.ran), "`compaction` count metric mismatch")
|
testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.compactor.(*LeveledCompactor).metrics.ran), "`compaction` count metric mismatch")
|
||||||
|
testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.metrics.compactionsFailed), "`compactions failed` count metric mismatch")
|
||||||
|
|
||||||
actBlocks, err = blockDirs(db.Dir())
|
actBlocks, err = blockDirs(db.Dir())
|
||||||
testutil.Ok(t, err)
|
testutil.Ok(t, err)
|
||||||
testutil.Equals(t, expBlocks, len(actBlocks)-1, "block count should be the same as before the compaction") // -1 to exclude the corrupted block.
|
testutil.Equals(t, expBlocks, len(actBlocks)-1, "block count should be the same as before the compaction") // -1 to exclude the corrupted block.
|
||||||
|
|
11
db.go
11
db.go
|
@ -147,6 +147,7 @@ type dbMetrics struct {
|
||||||
reloads prometheus.Counter
|
reloads prometheus.Counter
|
||||||
reloadsFailed prometheus.Counter
|
reloadsFailed prometheus.Counter
|
||||||
compactionsTriggered prometheus.Counter
|
compactionsTriggered prometheus.Counter
|
||||||
|
compactionsFailed prometheus.Counter
|
||||||
timeRetentionCount prometheus.Counter
|
timeRetentionCount prometheus.Counter
|
||||||
compactionsSkipped prometheus.Counter
|
compactionsSkipped prometheus.Counter
|
||||||
startTime prometheus.GaugeFunc
|
startTime prometheus.GaugeFunc
|
||||||
|
@ -191,6 +192,10 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
|
||||||
Name: "prometheus_tsdb_compactions_triggered_total",
|
Name: "prometheus_tsdb_compactions_triggered_total",
|
||||||
Help: "Total number of triggered compactions for the partition.",
|
Help: "Total number of triggered compactions for the partition.",
|
||||||
})
|
})
|
||||||
|
m.compactionsFailed = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Name: "prometheus_tsdb_compactions_failed_total",
|
||||||
|
Help: "Total number of compactions that failed for the partition.",
|
||||||
|
})
|
||||||
m.timeRetentionCount = prometheus.NewCounter(prometheus.CounterOpts{
|
m.timeRetentionCount = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
Name: "prometheus_tsdb_time_retentions_total",
|
Name: "prometheus_tsdb_time_retentions_total",
|
||||||
Help: "The number of times that blocks were deleted because the maximum time limit was exceeded.",
|
Help: "The number of times that blocks were deleted because the maximum time limit was exceeded.",
|
||||||
|
@ -231,6 +236,7 @@ func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
|
||||||
m.reloadsFailed,
|
m.reloadsFailed,
|
||||||
m.timeRetentionCount,
|
m.timeRetentionCount,
|
||||||
m.compactionsTriggered,
|
m.compactionsTriggered,
|
||||||
|
m.compactionsFailed,
|
||||||
m.startTime,
|
m.startTime,
|
||||||
m.tombCleanTimer,
|
m.tombCleanTimer,
|
||||||
m.blocksBytes,
|
m.blocksBytes,
|
||||||
|
@ -411,6 +417,11 @@ func (a dbAppender) Commit() error {
|
||||||
func (db *DB) compact() (err error) {
|
func (db *DB) compact() (err error) {
|
||||||
db.cmtx.Lock()
|
db.cmtx.Lock()
|
||||||
defer db.cmtx.Unlock()
|
defer db.cmtx.Unlock()
|
||||||
|
defer func() {
|
||||||
|
if err != nil {
|
||||||
|
db.metrics.compactionsFailed.Inc()
|
||||||
|
}
|
||||||
|
}()
|
||||||
// Check whether we have pending head blocks that are ready to be persisted.
|
// Check whether we have pending head blocks that are ready to be persisted.
|
||||||
// They have the highest priority.
|
// They have the highest priority.
|
||||||
for {
|
for {
|
||||||
|
|
Loading…
Reference in a new issue