mirror of
https://github.com/prometheus/prometheus.git
synced 2024-11-09 23:24:05 -08:00
repair wal when the record cannot be decoded (#453)
* Repair the WAL when a record cannot be decoded. Currently, repair is run only when the error happens in the reader. However, a corruption can also surface after the record has been read, at the point where it is decoded. This change wraps decoding errors in a CorruptionErr, as this error type is expected to trigger a repair. Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
This commit is contained in:
parent
24520727a4
commit
0493efb7c5
24
head.go
24
head.go
|
@ -349,7 +349,11 @@ func (h *Head) loadWAL(r *wal.Reader) error {
|
||||||
case RecordSeries:
|
case RecordSeries:
|
||||||
series, err = dec.Series(rec, series)
|
series, err = dec.Series(rec, series)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return errors.Wrap(err, "decode series")
|
return &wal.CorruptionErr{
|
||||||
|
Err: errors.Wrap(err, "decode series"),
|
||||||
|
Segment: r.Segment(),
|
||||||
|
Offset: r.Offset(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for _, s := range series {
|
for _, s := range series {
|
||||||
h.getOrCreateWithID(s.Ref, s.Labels.Hash(), s.Labels)
|
h.getOrCreateWithID(s.Ref, s.Labels.Hash(), s.Labels)
|
||||||
|
@ -362,7 +366,11 @@ func (h *Head) loadWAL(r *wal.Reader) error {
|
||||||
samples, err = dec.Samples(rec, samples)
|
samples, err = dec.Samples(rec, samples)
|
||||||
s := samples
|
s := samples
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return errors.Wrap(err, "decode samples")
|
return &wal.CorruptionErr{
|
||||||
|
Err: errors.Wrap(err, "decode samples"),
|
||||||
|
Segment: r.Segment(),
|
||||||
|
Offset: r.Offset(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// We split up the samples into chunks of 5000 samples or less.
|
// We split up the samples into chunks of 5000 samples or less.
|
||||||
// With O(300 * #cores) in-flight sample batches, large scrapes could otherwise
|
// With O(300 * #cores) in-flight sample batches, large scrapes could otherwise
|
||||||
|
@ -395,7 +403,11 @@ func (h *Head) loadWAL(r *wal.Reader) error {
|
||||||
case RecordTombstones:
|
case RecordTombstones:
|
||||||
tstones, err = dec.Tombstones(rec, tstones)
|
tstones, err = dec.Tombstones(rec, tstones)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return errors.Wrap(err, "decode tombstones")
|
return &wal.CorruptionErr{
|
||||||
|
Err: errors.Wrap(err, "decode tombstones"),
|
||||||
|
Segment: r.Segment(),
|
||||||
|
Offset: r.Offset(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for _, s := range tstones {
|
for _, s := range tstones {
|
||||||
for _, itv := range s.intervals {
|
for _, itv := range s.intervals {
|
||||||
|
@ -406,7 +418,11 @@ func (h *Head) loadWAL(r *wal.Reader) error {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
return errors.Errorf("invalid record type %v", dec.Type(rec))
|
return &wal.CorruptionErr{
|
||||||
|
Err: errors.Errorf("invalid record type %v", dec.Type(rec)),
|
||||||
|
Segment: r.Segment(),
|
||||||
|
Offset: r.Offset(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if r.Err() != nil {
|
if r.Err() != nil {
|
||||||
|
|
79
head_test.go
79
head_test.go
|
@ -861,3 +861,82 @@ func TestHead_LogRollback(t *testing.T) {
|
||||||
testutil.Assert(t, ok, "expected series record but got %+v", recs[0])
|
testutil.Assert(t, ok, "expected series record but got %+v", recs[0])
|
||||||
testutil.Equals(t, []RefSeries{{Ref: 1, Labels: labels.FromStrings("a", "b")}}, series)
|
testutil.Equals(t, []RefSeries{{Ref: 1, Labels: labels.FromStrings("a", "b")}}, series)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestWalRepair(t *testing.T) {
|
||||||
|
var enc RecordEncoder
|
||||||
|
for name, test := range map[string]struct {
|
||||||
|
corrFunc func(rec []byte) []byte // Func that applies the corruption to a record.
|
||||||
|
rec []byte
|
||||||
|
totalRecs int
|
||||||
|
expRecs int
|
||||||
|
}{
|
||||||
|
"invalid_record": {
|
||||||
|
func(rec []byte) []byte {
|
||||||
|
rec[0] = byte(RecordInvalid)
|
||||||
|
return rec
|
||||||
|
},
|
||||||
|
enc.Series([]RefSeries{{Ref: 1, Labels: labels.FromStrings("a", "b")}}, []byte{}),
|
||||||
|
9,
|
||||||
|
5,
|
||||||
|
},
|
||||||
|
"decode_series": {
|
||||||
|
func(rec []byte) []byte {
|
||||||
|
return rec[:3]
|
||||||
|
},
|
||||||
|
enc.Series([]RefSeries{{Ref: 1, Labels: labels.FromStrings("a", "b")}}, []byte{}),
|
||||||
|
9,
|
||||||
|
5,
|
||||||
|
},
|
||||||
|
"decode_samples": {
|
||||||
|
func(rec []byte) []byte {
|
||||||
|
return rec[:3]
|
||||||
|
},
|
||||||
|
enc.Samples([]RefSample{{Ref: 0, T: 99, V: 1}}, []byte{}),
|
||||||
|
9,
|
||||||
|
5,
|
||||||
|
},
|
||||||
|
"decode_tombstone": {
|
||||||
|
func(rec []byte) []byte {
|
||||||
|
return rec[:3]
|
||||||
|
},
|
||||||
|
enc.Tombstones([]Stone{{ref: 1, intervals: Intervals{}}}, []byte{}),
|
||||||
|
9,
|
||||||
|
5,
|
||||||
|
},
|
||||||
|
} {
|
||||||
|
t.Run(name, func(t *testing.T) {
|
||||||
|
dir, err := ioutil.TempDir("", "wal_head_repair")
|
||||||
|
testutil.Ok(t, err)
|
||||||
|
defer os.RemoveAll(dir)
|
||||||
|
|
||||||
|
w, err := wal.New(nil, nil, dir)
|
||||||
|
testutil.Ok(t, err)
|
||||||
|
|
||||||
|
for i := 1; i <= test.totalRecs; i++ {
|
||||||
|
// At this point insert a corrupted record.
|
||||||
|
if i-1 == test.expRecs {
|
||||||
|
testutil.Ok(t, w.Log(test.corrFunc(test.rec)))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
testutil.Ok(t, w.Log(test.rec))
|
||||||
|
}
|
||||||
|
|
||||||
|
h, err := NewHead(nil, nil, w, 1)
|
||||||
|
testutil.Ok(t, err)
|
||||||
|
testutil.Ok(t, h.Init())
|
||||||
|
|
||||||
|
sr, err := wal.NewSegmentsReader(dir)
|
||||||
|
testutil.Ok(t, err)
|
||||||
|
defer sr.Close()
|
||||||
|
r := wal.NewReader(sr)
|
||||||
|
|
||||||
|
var actRec int
|
||||||
|
for r.Next() {
|
||||||
|
actRec++
|
||||||
|
}
|
||||||
|
testutil.Ok(t, r.Err())
|
||||||
|
testutil.Equals(t, test.expRecs, actRec, "Wrong number of intact records")
|
||||||
|
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
20
wal/wal.go
20
wal/wal.go
|
@ -341,6 +341,10 @@ func (w *WAL) Repair(origErr error) error {
|
||||||
r := NewReader(bufio.NewReader(f))
|
r := NewReader(bufio.NewReader(f))
|
||||||
|
|
||||||
for r.Next() {
|
for r.Next() {
|
||||||
|
// Add records only up to where the error was.
|
||||||
|
if r.Offset() >= cerr.Offset {
|
||||||
|
break
|
||||||
|
}
|
||||||
if err := w.Log(r.Record()); err != nil {
|
if err := w.Log(r.Record()); err != nil {
|
||||||
return errors.Wrap(err, "insert record")
|
return errors.Wrap(err, "insert record")
|
||||||
}
|
}
|
||||||
|
@ -869,6 +873,22 @@ func (r *Reader) Record() []byte {
|
||||||
return r.rec
|
return r.rec
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Segment returns the current segment being read.
|
||||||
|
func (r *Reader) Segment() int {
|
||||||
|
if b, ok := r.rdr.(*segmentBufReader); ok {
|
||||||
|
return b.segs[b.cur].Index()
|
||||||
|
}
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
// Offset returns the current position of the segment being read.
|
||||||
|
func (r *Reader) Offset() int64 {
|
||||||
|
if b, ok := r.rdr.(*segmentBufReader); ok {
|
||||||
|
return int64(b.off)
|
||||||
|
}
|
||||||
|
return r.total
|
||||||
|
}
|
||||||
|
|
||||||
func min(i, j int) int {
|
func min(i, j int) int {
|
||||||
if i < j {
|
if i < j {
|
||||||
return i
|
return i
|
||||||
|
|
Loading…
Reference in a new issue