Add new metrics.

1. 'prometheus_tsdb_wal_truncate_fail' for failed WAL truncation.
2. 'prometheus_tsdb_checkpoint_delete_fail' for failed deletion of old checkpoints.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
This commit is contained in:
Ganesh Vernekar 2018-09-25 17:19:09 +05:30
parent a971f52ac8
commit 632dfb349e
No known key found for this signature in database
GPG key ID: 0241A11211763456
4 changed files with 33 additions and 17 deletions

View file

@ -26,6 +26,7 @@ import (
"github.com/go-kit/kit/log" "github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level" "github.com/go-kit/kit/log/level"
"github.com/pkg/errors" "github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/tsdb/fileutil" "github.com/prometheus/tsdb/fileutil"
"github.com/prometheus/tsdb/wal" "github.com/prometheus/tsdb/wal"
) )
@ -102,7 +103,7 @@ const checkpointPrefix = "checkpoint."
// it with the original WAL. // it with the original WAL.
// //
// Non-critical errors are logged and not returned. // Non-critical errors are logged and not returned.
func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bool, mint int64) (*CheckpointStats, error) { func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bool, mint int64, checkpointDeleteFail prometheus.Counter) (*CheckpointStats, error) {
if logger == nil { if logger == nil {
logger = log.NewNopLogger() logger = log.NewNopLogger()
} }
@ -283,6 +284,7 @@ func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bo
// occupying disk space. // occupying disk space.
// They will just be ignored since a higher checkpoint exists. // They will just be ignored since a higher checkpoint exists.
level.Error(logger).Log("msg", "delete old checkpoints", "err", err) level.Error(logger).Log("msg", "delete old checkpoints", "err", err)
checkpointDeleteFail.Add(float64(1))
} }
return stats, nil return stats, nil
} }

View file

@ -20,6 +20,7 @@ import (
"path/filepath" "path/filepath"
"testing" "testing"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/tsdb/fileutil" "github.com/prometheus/tsdb/fileutil"
"github.com/prometheus/tsdb/labels" "github.com/prometheus/tsdb/labels"
"github.com/prometheus/tsdb/testutil" "github.com/prometheus/tsdb/testutil"
@ -139,7 +140,7 @@ func TestCheckpoint(t *testing.T) {
_, err = Checkpoint(nil, w, 100, 106, func(x uint64) bool { _, err = Checkpoint(nil, w, 100, 106, func(x uint64) bool {
return x%2 == 0 return x%2 == 0
}, last/2) }, last/2, prometheus.NewCounter(prometheus.CounterOpts{}))
testutil.Ok(t, err) testutil.Ok(t, err)
// Only the new checkpoint should be left. // Only the new checkpoint should be left.

34
head.go
View file

@ -76,19 +76,20 @@ type Head struct {
} }
type headMetrics struct { type headMetrics struct {
activeAppenders prometheus.Gauge activeAppenders prometheus.Gauge
series prometheus.Gauge series prometheus.Gauge
seriesCreated prometheus.Counter seriesCreated prometheus.Counter
seriesRemoved prometheus.Counter seriesRemoved prometheus.Counter
seriesNotFound prometheus.Counter seriesNotFound prometheus.Counter
chunks prometheus.Gauge chunks prometheus.Gauge
chunksCreated prometheus.Counter chunksCreated prometheus.Counter
chunksRemoved prometheus.Counter chunksRemoved prometheus.Counter
gcDuration prometheus.Summary gcDuration prometheus.Summary
minTime prometheus.GaugeFunc minTime prometheus.GaugeFunc
maxTime prometheus.GaugeFunc maxTime prometheus.GaugeFunc
samplesAppended prometheus.Counter samplesAppended prometheus.Counter
walTruncateDuration prometheus.Summary walTruncateDuration prometheus.Summary
checkpointDeleteFail prometheus.Counter
} }
func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
@ -150,6 +151,10 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
Name: "prometheus_tsdb_head_samples_appended_total", Name: "prometheus_tsdb_head_samples_appended_total",
Help: "Total number of appended samples.", Help: "Total number of appended samples.",
}) })
m.checkpointDeleteFail = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_checkpoint_delete_fail",
Help: "Number of times deletion of old checkpoint failed.",
})
if r != nil { if r != nil {
r.MustRegister( r.MustRegister(
@ -166,6 +171,7 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
m.gcDuration, m.gcDuration,
m.walTruncateDuration, m.walTruncateDuration,
m.samplesAppended, m.samplesAppended,
m.checkpointDeleteFail,
) )
} }
return m return m
@ -469,7 +475,7 @@ func (h *Head) Truncate(mint int64) error {
keep := func(id uint64) bool { keep := func(id uint64) bool {
return h.series.getByID(id) != nil return h.series.getByID(id) != nil
} }
if _, err = Checkpoint(h.logger, h.wal, m, n, keep, mint); err != nil { if _, err = Checkpoint(h.logger, h.wal, m, n, keep, mint, h.metrics.checkpointDeleteFail); err != nil {
return errors.Wrap(err, "create checkpoint") return errors.Wrap(err, "create checkpoint")
} }
h.metrics.walTruncateDuration.Observe(time.Since(start).Seconds()) h.metrics.walTruncateDuration.Observe(time.Since(start).Seconds())

View file

@ -162,6 +162,7 @@ type WAL struct {
fsyncDuration prometheus.Summary fsyncDuration prometheus.Summary
pageFlushes prometheus.Counter pageFlushes prometheus.Counter
pageCompletions prometheus.Counter pageCompletions prometheus.Counter
truncateFail prometheus.Counter
} }
// New returns a new WAL over the given directory. // New returns a new WAL over the given directory.
@ -201,8 +202,12 @@ func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSi
Name: "prometheus_tsdb_wal_completed_pages_total", Name: "prometheus_tsdb_wal_completed_pages_total",
Help: "Total number of completed pages.", Help: "Total number of completed pages.",
}) })
w.truncateFail = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_truncate_fail",
Help: "Number of times WAL truncation failed.",
})
if reg != nil { if reg != nil {
reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions) reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions, w.truncateFail)
} }
_, j, err := w.Segments() _, j, err := w.Segments()
@ -530,6 +535,7 @@ func (w *WAL) Segments() (m, n int, err error) {
func (w *WAL) Truncate(i int) error { func (w *WAL) Truncate(i int) error {
refs, err := listSegments(w.dir) refs, err := listSegments(w.dir)
if err != nil { if err != nil {
w.truncateFail.Add(float64(1))
return err return err
} }
for _, r := range refs { for _, r := range refs {
@ -537,6 +543,7 @@ func (w *WAL) Truncate(i int) error {
break break
} }
if err := os.Remove(filepath.Join(w.dir, r.s)); err != nil { if err := os.Remove(filepath.Join(w.dir, r.s)); err != nil {
w.truncateFail.Add(float64(1))
return err return err
} }
} }