diff --git a/checkpoint.go b/checkpoint.go index f45f3791f0..5559452a18 100644 --- a/checkpoint.go +++ b/checkpoint.go @@ -26,6 +26,7 @@ import ( "github.com/go-kit/kit/log" "github.com/go-kit/kit/log/level" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/tsdb/fileutil" "github.com/prometheus/tsdb/wal" ) @@ -102,7 +103,7 @@ const checkpointPrefix = "checkpoint." // it with the original WAL. // // Non-critical errors are logged and not returned. -func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bool, mint int64) (*CheckpointStats, error) { +func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bool, mint int64, checkpointDeleteFail prometheus.Counter) (*CheckpointStats, error) { if logger == nil { logger = log.NewNopLogger() } @@ -283,6 +284,7 @@ func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bo // occupying disk space. // They will just be ignored since a higher checkpoint exists. level.Error(logger).Log("msg", "delete old checkpoints", "err", err) + checkpointDeleteFail.Add(float64(1)) } return stats, nil } diff --git a/checkpoint_test.go b/checkpoint_test.go index daa54df194..97130c2912 100644 --- a/checkpoint_test.go +++ b/checkpoint_test.go @@ -20,6 +20,7 @@ import ( "path/filepath" "testing" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/tsdb/fileutil" "github.com/prometheus/tsdb/labels" "github.com/prometheus/tsdb/testutil" @@ -139,7 +140,7 @@ func TestCheckpoint(t *testing.T) { _, err = Checkpoint(nil, w, 100, 106, func(x uint64) bool { return x%2 == 0 - }, last/2) + }, last/2, prometheus.NewCounter(prometheus.CounterOpts{})) testutil.Ok(t, err) // Only the new checkpoint should be left. diff --git a/head.go b/head.go index bc8cdfbe4b..e5afccffdb 100644 --- a/head.go +++ b/head.go @@ -76,19 +76,20 @@ type Head struct { } type headMetrics struct { - activeAppenders prometheus.Gauge - series prometheus.Gauge - seriesCreated prometheus.Counter - seriesRemoved prometheus.Counter - seriesNotFound prometheus.Counter - chunks prometheus.Gauge - chunksCreated prometheus.Counter - chunksRemoved prometheus.Counter - gcDuration prometheus.Summary - minTime prometheus.GaugeFunc - maxTime prometheus.GaugeFunc - samplesAppended prometheus.Counter - walTruncateDuration prometheus.Summary + activeAppenders prometheus.Gauge + series prometheus.Gauge + seriesCreated prometheus.Counter + seriesRemoved prometheus.Counter + seriesNotFound prometheus.Counter + chunks prometheus.Gauge + chunksCreated prometheus.Counter + chunksRemoved prometheus.Counter + gcDuration prometheus.Summary + minTime prometheus.GaugeFunc + maxTime prometheus.GaugeFunc + samplesAppended prometheus.Counter + walTruncateDuration prometheus.Summary + checkpointDeleteFail prometheus.Counter } func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { @@ -150,6 +151,10 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { Name: "prometheus_tsdb_head_samples_appended_total", Help: "Total number of appended samples.", }) + m.checkpointDeleteFail = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_checkpoint_delete_fail", + Help: "Number of times deletion of old checkpoint failed.", + }) if r != nil { r.MustRegister( @@ -166,6 +171,7 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { m.gcDuration, m.walTruncateDuration, m.samplesAppended, + m.checkpointDeleteFail, ) } return m @@ -469,7 +475,7 @@ func (h *Head) Truncate(mint int64) error { keep := func(id uint64) bool { return h.series.getByID(id) != nil } - if _, err = Checkpoint(h.logger, h.wal, m, n, keep, mint); err != nil { + if _, err = Checkpoint(h.logger, h.wal, m, n, keep, mint, h.metrics.checkpointDeleteFail); err != nil { return errors.Wrap(err, "create checkpoint") } h.metrics.walTruncateDuration.Observe(time.Since(start).Seconds()) diff --git a/wal/wal.go b/wal/wal.go index aa52738fa2..ead1d546bc 100644 --- a/wal/wal.go +++ b/wal/wal.go @@ -162,6 +162,7 @@ type WAL struct { fsyncDuration prometheus.Summary pageFlushes prometheus.Counter pageCompletions prometheus.Counter + truncateFail prometheus.Counter } // New returns a new WAL over the given directory. @@ -201,8 +202,12 @@ func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSi Name: "prometheus_tsdb_wal_completed_pages_total", Help: "Total number of completed pages.", }) + w.truncateFail = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wal_truncate_fail", + Help: "Number of times WAL truncation failed.", + }) if reg != nil { - reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions) + reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions, w.truncateFail) } _, j, err := w.Segments() @@ -530,6 +535,7 @@ func (w *WAL) Segments() (m, n int, err error) { func (w *WAL) Truncate(i int) error { refs, err := listSegments(w.dir) if err != nil { + w.truncateFail.Add(float64(1)) return err } for _, r := range refs { @@ -537,6 +543,7 @@ func (w *WAL) Truncate(i int) error { break } if err := os.Remove(filepath.Join(w.dir, r.s)); err != nil { + w.truncateFail.Add(float64(1)) return err } }