Add new metrics.

1. 'prometheus_tsdb_wal_truncate_fail' for failed WAL truncation.
2. 'prometheus_tsdb_checkpoint_delete_fail' for failed deletion of old checkpoints.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
This commit is contained in:
parent a971f52ac8
commit 632dfb349e
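Both new counters follow the standard client_golang pattern that the hunks below implement: create a Counter, register it against the Registerer (when one is supplied), and Add(1) on each failure path. A minimal, self-contained sketch of that pattern; the metric name and the cleanup function here are illustrative, not part of this commit:

package main

import (
    "errors"
    "log"

    "github.com/prometheus/client_golang/prometheus"
)

// exampleFail mirrors the counters added in this commit: it only ever goes up,
// once per failed cleanup attempt. The name below is made up for illustration.
var exampleFail = prometheus.NewCounter(prometheus.CounterOpts{
    Name: "example_cleanup_fail_total",
    Help: "Number of times cleanup failed.",
})

func cleanup() error {
    // Stand-in for the real work (WAL truncation, old checkpoint deletion).
    return errors.New("disk error")
}

func main() {
    prometheus.MustRegister(exampleFail)

    if err := cleanup(); err != nil {
        // The diff uses Add(float64(1)); Inc() is equivalent for a single step.
        exampleFail.Add(1)
        log.Println("cleanup failed:", err)
    }
}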
@@ -26,6 +26,7 @@ import (
     "github.com/go-kit/kit/log"
     "github.com/go-kit/kit/log/level"
     "github.com/pkg/errors"
+    "github.com/prometheus/client_golang/prometheus"
     "github.com/prometheus/tsdb/fileutil"
     "github.com/prometheus/tsdb/wal"
 )
@@ -102,7 +103,7 @@ const checkpointPrefix = "checkpoint."
 // it with the original WAL.
 //
 // Non-critical errors are logged and not returned.
-func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bool, mint int64) (*CheckpointStats, error) {
+func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bool, mint int64, checkpointDeleteFail prometheus.Counter) (*CheckpointStats, error) {
     if logger == nil {
         logger = log.NewNopLogger()
     }
@@ -283,6 +284,7 @@ func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bo
         // occupying disk space.
         // They will just be ignored since a higher checkpoint exists.
         level.Error(logger).Log("msg", "delete old checkpoints", "err", err)
+        checkpointDeleteFail.Add(float64(1))
     }
     return stats, nil
 }
@@ -20,6 +20,7 @@ import (
     "path/filepath"
     "testing"

+    "github.com/prometheus/client_golang/prometheus"
     "github.com/prometheus/tsdb/fileutil"
     "github.com/prometheus/tsdb/labels"
     "github.com/prometheus/tsdb/testutil"
@@ -139,7 +140,7 @@ func TestCheckpoint(t *testing.T) {

     _, err = Checkpoint(nil, w, 100, 106, func(x uint64) bool {
         return x%2 == 0
-    }, last/2)
+    }, last/2, prometheus.NewCounter(prometheus.CounterOpts{}))
     testutil.Ok(t, err)

     // Only the new checkpoint should be left.
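The test hands Checkpoint a fresh, unregistered counter purely to satisfy the new parameter; its value is never checked. If one also wanted to assert that no deletion failure was counted, a hedged sketch (not part of this commit) could read the counter back through its protobuf form, with dto assumed to be imported from github.com/prometheus/client_model/go:

deleteFail := prometheus.NewCounter(prometheus.CounterOpts{})

_, err = Checkpoint(nil, w, 100, 106, func(x uint64) bool {
    return x%2 == 0
}, last/2, deleteFail)
testutil.Ok(t, err)

// Read the counter back; no old-checkpoint deletion failure is expected here.
var metric dto.Metric
testutil.Ok(t, deleteFail.Write(&metric))
testutil.Equals(t, 0.0, metric.GetCounter().GetValue())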
head.go
@@ -76,19 +76,20 @@ type Head struct {
 }

 type headMetrics struct {
-    activeAppenders     prometheus.Gauge
-    series              prometheus.Gauge
-    seriesCreated       prometheus.Counter
-    seriesRemoved       prometheus.Counter
-    seriesNotFound      prometheus.Counter
-    chunks              prometheus.Gauge
-    chunksCreated       prometheus.Counter
-    chunksRemoved       prometheus.Counter
-    gcDuration          prometheus.Summary
-    minTime             prometheus.GaugeFunc
-    maxTime             prometheus.GaugeFunc
-    samplesAppended     prometheus.Counter
-    walTruncateDuration prometheus.Summary
+    activeAppenders      prometheus.Gauge
+    series               prometheus.Gauge
+    seriesCreated        prometheus.Counter
+    seriesRemoved        prometheus.Counter
+    seriesNotFound       prometheus.Counter
+    chunks               prometheus.Gauge
+    chunksCreated        prometheus.Counter
+    chunksRemoved        prometheus.Counter
+    gcDuration           prometheus.Summary
+    minTime              prometheus.GaugeFunc
+    maxTime              prometheus.GaugeFunc
+    samplesAppended      prometheus.Counter
+    walTruncateDuration  prometheus.Summary
+    checkpointDeleteFail prometheus.Counter
 }

 func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
@@ -150,6 +151,10 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
         Name: "prometheus_tsdb_head_samples_appended_total",
         Help: "Total number of appended samples.",
     })
+    m.checkpointDeleteFail = prometheus.NewCounter(prometheus.CounterOpts{
+        Name: "prometheus_tsdb_checkpoint_delete_fail",
+        Help: "Number of times deletion of old checkpoint failed.",
+    })

     if r != nil {
         r.MustRegister(
@@ -166,6 +171,7 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
             m.gcDuration,
             m.walTruncateDuration,
             m.samplesAppended,
+            m.checkpointDeleteFail,
         )
     }
     return m
@@ -469,7 +475,7 @@ func (h *Head) Truncate(mint int64) error {
     keep := func(id uint64) bool {
         return h.series.getByID(id) != nil
     }
-    if _, err = Checkpoint(h.logger, h.wal, m, n, keep, mint); err != nil {
+    if _, err = Checkpoint(h.logger, h.wal, m, n, keep, mint, h.metrics.checkpointDeleteFail); err != nil {
         return errors.Wrap(err, "create checkpoint")
     }
     h.metrics.walTruncateDuration.Observe(time.Since(start).Seconds())
@@ -162,6 +162,7 @@ type WAL struct {
     fsyncDuration   prometheus.Summary
     pageFlushes     prometheus.Counter
     pageCompletions prometheus.Counter
+    truncateFail    prometheus.Counter
 }

 // New returns a new WAL over the given directory.
@@ -201,8 +202,12 @@ func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSi
         Name: "prometheus_tsdb_wal_completed_pages_total",
         Help: "Total number of completed pages.",
     })
+    w.truncateFail = prometheus.NewCounter(prometheus.CounterOpts{
+        Name: "prometheus_tsdb_wal_truncate_fail",
+        Help: "Number of times WAL truncation failed.",
+    })
     if reg != nil {
-        reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions)
+        reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions, w.truncateFail)
     }

     _, j, err := w.Segments()
@@ -530,6 +535,7 @@ func (w *WAL) Segments() (m, n int, err error) {
 func (w *WAL) Truncate(i int) error {
     refs, err := listSegments(w.dir)
     if err != nil {
+        w.truncateFail.Add(float64(1))
         return err
     }
     for _, r := range refs {
@@ -537,6 +543,7 @@ func (w *WAL) Truncate(i int) error {
             break
         }
         if err := os.Remove(filepath.Join(w.dir, r.s)); err != nil {
+            w.truncateFail.Add(float64(1))
             return err
         }
     }
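Once registered, both new series show up on the normal /metrics endpoint alongside the existing TSDB metrics. A minimal sketch of how they would be exposed, using a standalone registry and promhttp handler purely for illustration; in Prometheus itself the counters are created and registered via newHeadMetrics and the wal package's NewSize, not as below:

package main

import (
    "log"
    "net/http"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
    reg := prometheus.NewRegistry()

    // Stand-ins for the counters added in this commit.
    truncateFail := prometheus.NewCounter(prometheus.CounterOpts{
        Name: "prometheus_tsdb_wal_truncate_fail",
        Help: "Number of times WAL truncation failed.",
    })
    checkpointDeleteFail := prometheus.NewCounter(prometheus.CounterOpts{
        Name: "prometheus_tsdb_checkpoint_delete_fail",
        Help: "Number of times deletion of old checkpoint failed.",
    })
    reg.MustRegister(truncateFail, checkpointDeleteFail)

    // curl localhost:9091/metrics would now show both counters at 0.
    http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
    log.Fatal(http.ListenAndServe(":9091", nil))
}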