Add new metrics.

1. 'prometheus_tsdb_wal_truncate_fail' for failed WAL truncation.
2. 'prometheus_tsdb_checkpoint_delete_fail' for failed deletion of old checkpoints.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
This commit is contained in:
Ganesh Vernekar 2018-09-25 17:19:09 +05:30
parent a971f52ac8
commit 632dfb349e
No known key found for this signature in database
GPG key ID: 0241A11211763456
4 changed files with 33 additions and 17 deletions

View file

@ -26,6 +26,7 @@ import (
"github.com/go-kit/kit/log" "github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level" "github.com/go-kit/kit/log/level"
"github.com/pkg/errors" "github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/tsdb/fileutil" "github.com/prometheus/tsdb/fileutil"
"github.com/prometheus/tsdb/wal" "github.com/prometheus/tsdb/wal"
) )
@ -102,7 +103,7 @@ const checkpointPrefix = "checkpoint."
// it with the original WAL. // it with the original WAL.
// //
// Non-critical errors are logged and not returned. // Non-critical errors are logged and not returned.
func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bool, mint int64) (*CheckpointStats, error) { func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bool, mint int64, checkpointDeleteFail prometheus.Counter) (*CheckpointStats, error) {
if logger == nil { if logger == nil {
logger = log.NewNopLogger() logger = log.NewNopLogger()
} }
@ -283,6 +284,7 @@ func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bo
// occupying disk space. // occupying disk space.
// They will just be ignored since a higher checkpoint exists. // They will just be ignored since a higher checkpoint exists.
level.Error(logger).Log("msg", "delete old checkpoints", "err", err) level.Error(logger).Log("msg", "delete old checkpoints", "err", err)
checkpointDeleteFail.Add(float64(1))
} }
return stats, nil return stats, nil
} }

View file

@ -20,6 +20,7 @@ import (
"path/filepath" "path/filepath"
"testing" "testing"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/tsdb/fileutil" "github.com/prometheus/tsdb/fileutil"
"github.com/prometheus/tsdb/labels" "github.com/prometheus/tsdb/labels"
"github.com/prometheus/tsdb/testutil" "github.com/prometheus/tsdb/testutil"
@ -139,7 +140,7 @@ func TestCheckpoint(t *testing.T) {
_, err = Checkpoint(nil, w, 100, 106, func(x uint64) bool { _, err = Checkpoint(nil, w, 100, 106, func(x uint64) bool {
return x%2 == 0 return x%2 == 0
}, last/2) }, last/2, prometheus.NewCounter(prometheus.CounterOpts{}))
testutil.Ok(t, err) testutil.Ok(t, err)
// Only the new checkpoint should be left. // Only the new checkpoint should be left.

View file

@ -89,6 +89,7 @@ type headMetrics struct {
maxTime prometheus.GaugeFunc maxTime prometheus.GaugeFunc
samplesAppended prometheus.Counter samplesAppended prometheus.Counter
walTruncateDuration prometheus.Summary walTruncateDuration prometheus.Summary
checkpointDeleteFail prometheus.Counter
} }
func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
@ -150,6 +151,10 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
Name: "prometheus_tsdb_head_samples_appended_total", Name: "prometheus_tsdb_head_samples_appended_total",
Help: "Total number of appended samples.", Help: "Total number of appended samples.",
}) })
m.checkpointDeleteFail = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_checkpoint_delete_fail",
Help: "Number of times deletion of old checkpoint failed.",
})
if r != nil { if r != nil {
r.MustRegister( r.MustRegister(
@ -166,6 +171,7 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
m.gcDuration, m.gcDuration,
m.walTruncateDuration, m.walTruncateDuration,
m.samplesAppended, m.samplesAppended,
m.checkpointDeleteFail,
) )
} }
return m return m
@ -469,7 +475,7 @@ func (h *Head) Truncate(mint int64) error {
keep := func(id uint64) bool { keep := func(id uint64) bool {
return h.series.getByID(id) != nil return h.series.getByID(id) != nil
} }
if _, err = Checkpoint(h.logger, h.wal, m, n, keep, mint); err != nil { if _, err = Checkpoint(h.logger, h.wal, m, n, keep, mint, h.metrics.checkpointDeleteFail); err != nil {
return errors.Wrap(err, "create checkpoint") return errors.Wrap(err, "create checkpoint")
} }
h.metrics.walTruncateDuration.Observe(time.Since(start).Seconds()) h.metrics.walTruncateDuration.Observe(time.Since(start).Seconds())

View file

@ -162,6 +162,7 @@ type WAL struct {
fsyncDuration prometheus.Summary fsyncDuration prometheus.Summary
pageFlushes prometheus.Counter pageFlushes prometheus.Counter
pageCompletions prometheus.Counter pageCompletions prometheus.Counter
truncateFail prometheus.Counter
} }
// New returns a new WAL over the given directory. // New returns a new WAL over the given directory.
@ -201,8 +202,12 @@ func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSi
Name: "prometheus_tsdb_wal_completed_pages_total", Name: "prometheus_tsdb_wal_completed_pages_total",
Help: "Total number of completed pages.", Help: "Total number of completed pages.",
}) })
w.truncateFail = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_truncate_fail",
Help: "Number of times WAL truncation failed.",
})
if reg != nil { if reg != nil {
reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions) reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions, w.truncateFail)
} }
_, j, err := w.Segments() _, j, err := w.Segments()
@ -530,6 +535,7 @@ func (w *WAL) Segments() (m, n int, err error) {
func (w *WAL) Truncate(i int) error { func (w *WAL) Truncate(i int) error {
refs, err := listSegments(w.dir) refs, err := listSegments(w.dir)
if err != nil { if err != nil {
w.truncateFail.Add(float64(1))
return err return err
} }
for _, r := range refs { for _, r := range refs {
@ -537,6 +543,7 @@ func (w *WAL) Truncate(i int) error {
break break
} }
if err := os.Remove(filepath.Join(w.dir, r.s)); err != nil { if err := os.Remove(filepath.Join(w.dir, r.s)); err != nil {
w.truncateFail.Add(float64(1))
return err return err
} }
} }