mirror of
https://github.com/prometheus/prometheus.git
synced 2025-03-05 20:59:13 -08:00
If reading the WAL fails, try again. Also, read from the segment containing the index for the last checkpoint, not the first segment.
Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>
This commit is contained in:
parent
d6f911b511
commit
bdc6b764b0
|
@ -60,7 +60,7 @@ func NewStorage(l log.Logger, reg prometheus.Registerer, stCallback startTimeCal
|
||||||
}
|
}
|
||||||
shardUpdateDuration := 10 * time.Second
|
shardUpdateDuration := 10 * time.Second
|
||||||
s := &Storage{
|
s := &Storage{
|
||||||
logger: logging.RateLimit(logging.Dedupe(l, 1*time.Minute), 1),
|
logger: logging.Dedupe(l, 1*time.Minute),
|
||||||
localStartTimeCallback: stCallback,
|
localStartTimeCallback: stCallback,
|
||||||
flushDeadline: flushDeadline,
|
flushDeadline: flushDeadline,
|
||||||
walDir: walDir,
|
walDir: walDir,
|
||||||
|
|
|
@ -19,6 +19,7 @@ import (
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
@ -182,7 +183,7 @@ func NewWALWatcher(logger log.Logger, name string, writer writeTo, walDir string
|
||||||
|
|
||||||
func (w *WALWatcher) Start() {
|
func (w *WALWatcher) Start() {
|
||||||
level.Info(w.logger).Log("msg", "starting WAL watcher", "queue", w.name)
|
level.Info(w.logger).Log("msg", "starting WAL watcher", "queue", w.name)
|
||||||
go w.runWatcher()
|
go w.loop()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (w *WALWatcher) Stop() {
|
func (w *WALWatcher) Stop() {
|
||||||
|
@ -190,71 +191,95 @@ func (w *WALWatcher) Stop() {
|
||||||
close(w.quit)
|
close(w.quit)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (w *WALWatcher) runWatcher() {
|
func (w *WALWatcher) loop() {
|
||||||
// The WAL dir may not exist when Prometheus first starts up.
|
// We may encounter failures processing the WAL; we should wait and retry.
|
||||||
|
|
||||||
for {
|
for {
|
||||||
if _, err := os.Stat(w.walDir); os.IsNotExist(err) {
|
if err := w.run(); err != nil {
|
||||||
time.Sleep(time.Second)
|
level.Error(w.logger).Log("msg", "error tailing WAL", "err", err)
|
||||||
} else {
|
}
|
||||||
break
|
|
||||||
|
select {
|
||||||
|
case <-w.quit:
|
||||||
|
return
|
||||||
|
case <-time.After(5 * time.Second):
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *WALWatcher) run() error {
|
||||||
nw, err := wal.New(nil, nil, w.walDir)
|
nw, err := wal.New(nil, nil, w.walDir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
level.Error(w.logger).Log("err", err)
|
return errors.Wrap(err, "wal.New")
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
first, last, err := nw.Segments()
|
|
||||||
if err != nil {
|
|
||||||
level.Error(w.logger).Log("err", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
if last == -1 {
|
|
||||||
level.Error(w.logger).Log("err", err)
|
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Backfill from the checkpoint first if it exists.
|
// Backfill from the checkpoint first if it exists.
|
||||||
dir, _, err := tsdb.LastCheckpoint(w.walDir)
|
var nextIndex int
|
||||||
|
w.lastCheckpoint, nextIndex, err = tsdb.LastCheckpoint(w.walDir)
|
||||||
if err != nil && err != tsdb.ErrNotFound {
|
if err != nil && err != tsdb.ErrNotFound {
|
||||||
level.Error(w.logger).Log("msg", "error looking for existing checkpoint, some samples may be dropped", "err", errors.Wrap(err, "find last checkpoint"))
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
level.Debug(w.logger).Log("msg", "reading checkpoint", "dir", dir)
|
level.Debug(w.logger).Log("msg", "reading checkpoint", "dir", w.lastCheckpoint, "startFrom", nextIndex)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
w.lastCheckpoint = dir
|
if err = w.readCheckpoint(w.lastCheckpoint); err != nil {
|
||||||
err = w.readCheckpoint(dir)
|
return err
|
||||||
if err != nil {
|
|
||||||
level.Error(w.logger).Log("msg", "error reading existing checkpoint, some samples may be dropped", "err", err)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
w.currentSegment = first
|
w.currentSegment, err = w.findSegmentForIndex(nextIndex)
|
||||||
tail := false
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
level.Debug(w.logger).Log("msg", "starting from", "currentSegment", w.currentSegment)
|
||||||
|
|
||||||
for {
|
for {
|
||||||
if w.currentSegment == last {
|
|
||||||
tail = true
|
|
||||||
}
|
|
||||||
|
|
||||||
w.currentSegmentMetric.Set(float64(w.currentSegment))
|
w.currentSegmentMetric.Set(float64(w.currentSegment))
|
||||||
level.Info(w.logger).Log("msg", "process segment", "segment", w.currentSegment, "tail", tail)
|
level.Info(w.logger).Log("msg", "process segment", "segment", w.currentSegment)
|
||||||
|
|
||||||
// On start, after reading the existing WAL for series records, we have a pointer to what is the latest segment.
|
// On start, after reading the existing WAL for series records, we have a pointer to what is the latest segment.
|
||||||
// On subsequent calls to this function, currentSegment will have been incremented and we should open that segment.
|
// On subsequent calls to this function, currentSegment will have been incremented and we should open that segment.
|
||||||
if err := w.watch(nw, w.currentSegment, tail); err != nil {
|
if err := w.watch(nw, w.currentSegment, true); err != nil {
|
||||||
level.Error(w.logger).Log("msg", "runWatcher is ending", "err", err)
|
level.Error(w.logger).Log("msg", "runWatcher is ending", "err", err)
|
||||||
return
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
w.currentSegment++
|
w.currentSegment++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use tail true to indicate that the reader is currently on a segment that is
|
func (w *WALWatcher) findSegmentForIndex(index int) (int, error) {
|
||||||
|
files, err := fileutil.ReadDir(w.walDir)
|
||||||
|
if err != nil {
|
||||||
|
return -1, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var refs []int
|
||||||
|
var last int
|
||||||
|
for _, fn := range files {
|
||||||
|
k, err := strconv.Atoi(fn)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if len(refs) > 0 && k > last+1 {
|
||||||
|
return -1, errors.New("segments are not sequential")
|
||||||
|
}
|
||||||
|
refs = append(refs, k)
|
||||||
|
last = k
|
||||||
|
}
|
||||||
|
sort.Sort(sort.IntSlice(refs))
|
||||||
|
|
||||||
|
for _, r := range refs {
|
||||||
|
if r >= index {
|
||||||
|
return r, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return -1, errors.New("failed to find segment for index")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use tail true to indicate that the reader is currently on a segment that is
|
||||||
// actively being written to. If false, assume it's a full segment and we're
|
// actively being written to. If false, assume it's a full segment and we're
|
||||||
// replaying it on start to cache the series records.
|
// replaying it on start to cache the series records.
|
||||||
func (w *WALWatcher) watch(wl *wal.WAL, segmentNum int, tail bool) error {
|
func (w *WALWatcher) watch(wl *wal.WAL, segmentNum int, tail bool) error {
|
||||||
|
@ -444,11 +469,15 @@ func (w *WALWatcher) readCheckpoint(checkpointDir string) error {
|
||||||
|
|
||||||
// w.readSeriesRecords(wal.NewLiveReader(sr), i, size)
|
// w.readSeriesRecords(wal.NewLiveReader(sr), i, size)
|
||||||
r := wal.NewLiveReader(sr)
|
r := wal.NewLiveReader(sr)
|
||||||
w.readSegment(r)
|
if err := w.readSegment(r); err != nil {
|
||||||
|
return errors.Wrap(err, "readSegment")
|
||||||
|
}
|
||||||
|
|
||||||
if r.TotalRead() != size {
|
if r.TotalRead() != size {
|
||||||
level.Warn(w.logger).Log("msg", "may not have read all data from checkpoint")
|
level.Warn(w.logger).Log("msg", "may not have read all data from checkpoint")
|
||||||
}
|
}
|
||||||
level.Debug(w.logger).Log("msg", "read series references from checkpoint", "checkpoint", checkpointDir)
|
level.Debug(w.logger).Log("msg", "read series references from checkpoint", "checkpoint", checkpointDir)
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue