mirror of
https://github.com/prometheus/prometheus.git
synced 2024-12-24 21:24:05 -08:00
Add new metrics.
1. 'prometheus_tsdb_wal_truncate_fail' counts failed WAL truncations.
2. 'prometheus_tsdb_checkpoint_delete_fail' counts failed deletions of old checkpoints.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
This commit is contained in:
parent
a971f52ac8
commit
632dfb349e
|
@ -26,6 +26,7 @@ import (
|
||||||
"github.com/go-kit/kit/log"
|
"github.com/go-kit/kit/log"
|
||||||
"github.com/go-kit/kit/log/level"
|
"github.com/go-kit/kit/log/level"
|
||||||
"github.com/pkg/errors"
|
"github.com/pkg/errors"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"github.com/prometheus/tsdb/fileutil"
|
"github.com/prometheus/tsdb/fileutil"
|
||||||
"github.com/prometheus/tsdb/wal"
|
"github.com/prometheus/tsdb/wal"
|
||||||
)
|
)
|
||||||
|
@ -102,7 +103,7 @@ const checkpointPrefix = "checkpoint."
|
||||||
// it with the original WAL.
|
// it with the original WAL.
|
||||||
//
|
//
|
||||||
// Non-critical errors are logged and not returned.
|
// Non-critical errors are logged and not returned.
|
||||||
func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bool, mint int64) (*CheckpointStats, error) {
|
func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bool, mint int64, checkpointDeleteFail prometheus.Counter) (*CheckpointStats, error) {
|
||||||
if logger == nil {
|
if logger == nil {
|
||||||
logger = log.NewNopLogger()
|
logger = log.NewNopLogger()
|
||||||
}
|
}
|
||||||
|
@ -283,6 +284,7 @@ func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bo
|
||||||
// occupying disk space.
|
// occupying disk space.
|
||||||
// They will just be ignored since a higher checkpoint exists.
|
// They will just be ignored since a higher checkpoint exists.
|
||||||
level.Error(logger).Log("msg", "delete old checkpoints", "err", err)
|
level.Error(logger).Log("msg", "delete old checkpoints", "err", err)
|
||||||
|
checkpointDeleteFail.Add(float64(1))
|
||||||
}
|
}
|
||||||
return stats, nil
|
return stats, nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,7 @@ import (
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"github.com/prometheus/tsdb/fileutil"
|
"github.com/prometheus/tsdb/fileutil"
|
||||||
"github.com/prometheus/tsdb/labels"
|
"github.com/prometheus/tsdb/labels"
|
||||||
"github.com/prometheus/tsdb/testutil"
|
"github.com/prometheus/tsdb/testutil"
|
||||||
|
@ -139,7 +140,7 @@ func TestCheckpoint(t *testing.T) {
|
||||||
|
|
||||||
_, err = Checkpoint(nil, w, 100, 106, func(x uint64) bool {
|
_, err = Checkpoint(nil, w, 100, 106, func(x uint64) bool {
|
||||||
return x%2 == 0
|
return x%2 == 0
|
||||||
}, last/2)
|
}, last/2, prometheus.NewCounter(prometheus.CounterOpts{}))
|
||||||
testutil.Ok(t, err)
|
testutil.Ok(t, err)
|
||||||
|
|
||||||
// Only the new checkpoint should be left.
|
// Only the new checkpoint should be left.
|
||||||
|
|
8
head.go
8
head.go
|
@ -89,6 +89,7 @@ type headMetrics struct {
|
||||||
maxTime prometheus.GaugeFunc
|
maxTime prometheus.GaugeFunc
|
||||||
samplesAppended prometheus.Counter
|
samplesAppended prometheus.Counter
|
||||||
walTruncateDuration prometheus.Summary
|
walTruncateDuration prometheus.Summary
|
||||||
|
checkpointDeleteFail prometheus.Counter
|
||||||
}
|
}
|
||||||
|
|
||||||
func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
|
func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
|
||||||
|
@ -150,6 +151,10 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
|
||||||
Name: "prometheus_tsdb_head_samples_appended_total",
|
Name: "prometheus_tsdb_head_samples_appended_total",
|
||||||
Help: "Total number of appended samples.",
|
Help: "Total number of appended samples.",
|
||||||
})
|
})
|
||||||
|
m.checkpointDeleteFail = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Name: "prometheus_tsdb_checkpoint_delete_fail",
|
||||||
|
Help: "Number of times deletion of old checkpoint failed.",
|
||||||
|
})
|
||||||
|
|
||||||
if r != nil {
|
if r != nil {
|
||||||
r.MustRegister(
|
r.MustRegister(
|
||||||
|
@ -166,6 +171,7 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
|
||||||
m.gcDuration,
|
m.gcDuration,
|
||||||
m.walTruncateDuration,
|
m.walTruncateDuration,
|
||||||
m.samplesAppended,
|
m.samplesAppended,
|
||||||
|
m.checkpointDeleteFail,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
return m
|
return m
|
||||||
|
@ -469,7 +475,7 @@ func (h *Head) Truncate(mint int64) error {
|
||||||
keep := func(id uint64) bool {
|
keep := func(id uint64) bool {
|
||||||
return h.series.getByID(id) != nil
|
return h.series.getByID(id) != nil
|
||||||
}
|
}
|
||||||
if _, err = Checkpoint(h.logger, h.wal, m, n, keep, mint); err != nil {
|
if _, err = Checkpoint(h.logger, h.wal, m, n, keep, mint, h.metrics.checkpointDeleteFail); err != nil {
|
||||||
return errors.Wrap(err, "create checkpoint")
|
return errors.Wrap(err, "create checkpoint")
|
||||||
}
|
}
|
||||||
h.metrics.walTruncateDuration.Observe(time.Since(start).Seconds())
|
h.metrics.walTruncateDuration.Observe(time.Since(start).Seconds())
|
||||||
|
|
|
@ -162,6 +162,7 @@ type WAL struct {
|
||||||
fsyncDuration prometheus.Summary
|
fsyncDuration prometheus.Summary
|
||||||
pageFlushes prometheus.Counter
|
pageFlushes prometheus.Counter
|
||||||
pageCompletions prometheus.Counter
|
pageCompletions prometheus.Counter
|
||||||
|
truncateFail prometheus.Counter
|
||||||
}
|
}
|
||||||
|
|
||||||
// New returns a new WAL over the given directory.
|
// New returns a new WAL over the given directory.
|
||||||
|
@ -201,8 +202,12 @@ func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSi
|
||||||
Name: "prometheus_tsdb_wal_completed_pages_total",
|
Name: "prometheus_tsdb_wal_completed_pages_total",
|
||||||
Help: "Total number of completed pages.",
|
Help: "Total number of completed pages.",
|
||||||
})
|
})
|
||||||
|
w.truncateFail = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Name: "prometheus_tsdb_wal_truncate_fail",
|
||||||
|
Help: "Number of times WAL truncation failed.",
|
||||||
|
})
|
||||||
if reg != nil {
|
if reg != nil {
|
||||||
reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions)
|
reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions, w.truncateFail)
|
||||||
}
|
}
|
||||||
|
|
||||||
_, j, err := w.Segments()
|
_, j, err := w.Segments()
|
||||||
|
@ -530,6 +535,7 @@ func (w *WAL) Segments() (m, n int, err error) {
|
||||||
func (w *WAL) Truncate(i int) error {
|
func (w *WAL) Truncate(i int) error {
|
||||||
refs, err := listSegments(w.dir)
|
refs, err := listSegments(w.dir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
w.truncateFail.Add(float64(1))
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
for _, r := range refs {
|
for _, r := range refs {
|
||||||
|
@ -537,6 +543,7 @@ func (w *WAL) Truncate(i int) error {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
if err := os.Remove(filepath.Join(w.dir, r.s)); err != nil {
|
if err := os.Remove(filepath.Join(w.dir, r.s)); err != nil {
|
||||||
|
w.truncateFail.Add(float64(1))
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue