Merge pull request #6337 from cstyan/rw-log-replay

Log the start and end of the WAL replay within the WAL watcher.
Callum Styan authored on 2019-12-03 13:24:26 -08:00, committed by GitHub
commit 5830e03691
3 changed files with 24 additions and 6 deletions
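The three hunks below touch, in order, the remote-write queue manager's benchmark, the WAL watcher itself, and the watcher's tests. In short, the watcher now logs "replaying WAL" when Run starts and "done replaying WAL" with the elapsed time once it reaches the first sample written after the watcher started; the exported StartTime field is replaced by a SetStartTime setter. The following is a minimal, self-contained sketch of that logging pattern, not the watcher code itself; every name in it is made up for illustration.

// Standalone sketch of the start/end replay logging added by this change.
// Names and structure are illustrative, not the real Watcher.
package main

import (
	"log"
	"time"
)

type replaySketch struct {
	startTime      time.Time // wall-clock start, used for the duration in the final log line
	startTimestamp int64     // same instant in milliseconds, compared against sample timestamps
	sendSamples    bool      // set once the first sample newer than the start is seen
}

func (r *replaySketch) setStartTime(t time.Time) {
	r.startTime = t
	r.startTimestamp = t.UnixNano() / int64(time.Millisecond)
}

// replay walks sample timestamps (milliseconds) the way the watcher walks WAL records.
func (r *replaySketch) replay(sampleTimestamps []int64) {
	log.Println("replaying WAL")
	for _, ts := range sampleTimestamps {
		if ts > r.startTimestamp {
			if !r.sendSamples {
				r.sendSamples = true
				log.Println("done replaying WAL, duration:", time.Since(r.startTime))
			}
			// ...forward the sample onwards...
		}
	}
}

func main() {
	r := &replaySketch{}
	r.setStartTime(time.Now())
	old := r.startTimestamp - 1000 // a sample from before the watcher started
	live := r.startTimestamp + 1   // the first "live" sample marks the end of the replay
	r.replay([]int64{old, live})
}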


@@ -35,6 +35,7 @@ import (
 "github.com/prometheus/common/model"
 "github.com/prometheus/prometheus/config"
 "github.com/prometheus/prometheus/pkg/labels"
+"github.com/prometheus/prometheus/pkg/timestamp"
 "github.com/prometheus/prometheus/prompb"
 "github.com/prometheus/prometheus/tsdb/record"
 "github.com/prometheus/prometheus/util/testutil"
@@ -531,7 +532,7 @@ func BenchmarkStartup(b *testing.B) {
 m := NewQueueManager(nil, logger, dir,
 newEWMARate(ewmaWeight, shardUpdateDuration),
 config.DefaultQueueConfig, nil, nil, c, 1*time.Minute)
-m.watcher.StartTime = math.MaxInt64
+m.watcher.SetStartTime(timestamp.Time(math.MaxInt64))
 m.watcher.MaxSegment = segments[len(segments)-2]
 err := m.watcher.Run()
 testutil.Ok(b, err)
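The benchmark now sets the start time through the new setter, converting the maximum millisecond timestamp back to a time.Time. As I read it, the intent is unchanged: push the cutoff so far into the future that no replayed sample counts as "new", so the benchmark measures only reading and decoding. A tiny standalone illustration of that effect (standard library only; the sample value is made up):

package main

import (
	"fmt"
	"math"
)

func main() {
	// With the cutoff at the far future, the "is this sample newer than the
	// start time?" check never passes, so nothing is forwarded during the benchmark.
	cutoff := int64(math.MaxInt64)
	sampleTs := int64(1575000000000)             // an arbitrary millisecond timestamp
	fmt.Println("forwarded:", sampleTs > cutoff) // prints: forwarded: false
}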


@@ -68,7 +68,9 @@ type Watcher struct {
 metrics *WatcherMetrics
 readerMetrics *liveReaderMetrics
-StartTime int64
+startTime time.Time
+startTimestamp int64 // the start time as a Prometheus timestamp
+sendSamples bool
 recordsReadMetric *prometheus.CounterVec
 recordDecodeFailsMetric prometheus.Counter
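The single exported StartTime field (a Prometheus timestamp, i.e. milliseconds) becomes two unexported fields kept in sync: startTime as a time.Time, so the replay duration can be computed with time.Since, and startTimestamp in milliseconds, so each sample's int64 timestamp can be compared without a per-sample conversion. A minimal standalone illustration of keeping both representations (standard library only; not the watcher's code):

package main

import (
	"fmt"
	"time"
)

func main() {
	start := time.Now()
	// Millisecond view of the same instant, comparable to WAL sample timestamps.
	startTimestamp := start.UnixNano() / int64(time.Millisecond)
	fmt.Println("cutoff (ms):", startTimestamp)
	// Wall-clock view, used to report how long the replay took.
	fmt.Println("elapsed so far:", time.Since(start))
}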
@@ -191,7 +193,7 @@ func (w *Watcher) loop() {
 // We may encounter failures processing the WAL; we should wait and retry.
 for !isClosed(w.quit) {
-w.StartTime = timestamp.FromTime(time.Now())
+w.SetStartTime(time.Now())
 if err := w.Run(); err != nil {
 level.Error(w.logger).Log("msg", "error tailing WAL", "err", err)
 }
@@ -212,6 +214,12 @@ func (w *Watcher) Run() error {
 return errors.Wrap(err, "wal.Segments")
 }
+// We want to ensure this is false across iterations since
+// Run will be called again if there was a failure to read the WAL.
+w.sendSamples = false
+level.Info(w.logger).Log("msg", "replaying WAL", "queue", w.name)
 // Backfill from the checkpoint first if it exists.
 lastCheckpoint, checkpointIndex, err := LastCheckpoint(w.walDir)
 if err != nil && err != record.ErrNotFound {
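Two details in Run are worth noting: sendSamples is cleared at the top of every call, because loop retries Run after a failure (and, per the hunk above, re-stamps the start time via SetStartTime on each attempt), and the "replaying WAL" message is logged per queue before the checkpoint and older segments are read. So after a retry the start/completion messages can be emitted again rather than staying silent. A small sketch of that reset-then-log-once behaviour; the names here are illustrative, not the watcher's:

package main

import (
	"fmt"
	"time"
)

// attempt mimics one call to Run: the flag starts false, the start of the
// replay is logged, and the completion message fires at most once per attempt.
func attempt(n int) {
	start := time.Now() // loop calls SetStartTime(time.Now()) before each Run
	sent := false       // Run clears w.sendSamples at the start of each call
	fmt.Printf("attempt %d: replaying WAL\n", n)
	for i := 0; i < 3; i++ { // stand-in for reading the checkpoint and segments
		if !sent {
			sent = true
			fmt.Printf("attempt %d: done replaying WAL, duration: %s\n", n, time.Since(start))
		}
	}
}

func main() {
	for n := 1; n <= 2; n++ { // stand-in for loop retrying Run after an error
		attempt(n)
	}
}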
@@ -456,7 +464,6 @@ func (w *Watcher) readSegment(r *LiveReader, segmentNum int, tail bool) error {
 samples []record.RefSample
 send []record.RefSample
 )
-
 for r.Next() && !isClosed(w.quit) {
 rec := r.Record()
 w.recordsReadMetric.WithLabelValues(recordType(dec.Type(rec))).Inc()
@@ -482,7 +489,12 @@ func (w *Watcher) readSegment(r *LiveReader, segmentNum int, tail bool) error {
 return err
 }
 for _, s := range samples {
-if s.T > w.StartTime {
+if s.T > w.startTimestamp {
+if !w.sendSamples {
+w.sendSamples = true
+duration := time.Since(w.startTime)
+level.Info(w.logger).Log("msg", "done replaying WAL", "duration", duration)
+}
 send = append(send, s)
 }
 }
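This is where the end of the replay is detected: samples with timestamps at or before the start are skipped, and the first sample that is newer both flips sendSamples (so the message is logged exactly once per Run) and joins the batch collected for forwarding. A compact standalone illustration of that filter; the timestamps are made-up milliseconds and the variable names only mirror the diff:

package main

import "fmt"

func main() {
	startTimestamp := int64(1575000000000) // pretend cutoff, in milliseconds
	samples := []int64{
		startTimestamp - 20, // written before the watcher started: skipped
		startTimestamp - 1,  // still old: skipped
		startTimestamp + 5,  // first "live" sample: logs completion, then kept
		startTimestamp + 9,  // kept without further logging
	}

	sendSamples := false
	var send []int64
	for _, ts := range samples {
		if ts > startTimestamp {
			if !sendSamples {
				sendSamples = true
				fmt.Println("done replaying WAL")
			}
			send = append(send, ts)
		}
	}
	fmt.Println("forwarded:", send) // forwarded: [1575000000005 1575000000009]
}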
@@ -505,6 +517,11 @@ func (w *Watcher) readSegment(r *LiveReader, segmentNum int, tail bool) error {
 return r.Err()
 }
+func (w *Watcher) SetStartTime(t time.Time) {
+w.startTime = t
+w.startTimestamp = timestamp.FromTime(t)
+}
 func recordType(rt record.Type) string {
 switch rt {
 case record.Invalid:


@@ -139,7 +139,7 @@ func TestTailSamples(t *testing.T) {
 wt := newWriteToMock()
 watcher := NewWatcher(nil, wMetrics, nil, "", wt, dir)
-watcher.StartTime = now.UnixNano()
+watcher.SetStartTime(now)
 // Set the Watcher's metrics so they're not nil pointers.
 watcher.setMetrics()
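The test now seeds the watcher through the same setter instead of writing the field directly. Besides matching the new API, this appears to keep the units consistent: the removed line stored now.UnixNano() (nanoseconds) in a field that production code filled via timestamp.FromTime, i.e. milliseconds, and compared against sample timestamps. For reference, the two scales differ by a factor of a million (standard library only):

package main

import (
	"fmt"
	"time"
)

func main() {
	now := time.Now()
	// The removed test line used nanoseconds; the watcher's cutoff is milliseconds.
	fmt.Println("UnixNano    :", now.UnixNano())
	fmt.Println("milliseconds:", now.UnixNano()/int64(time.Millisecond))
}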