prometheus/tsdb/wal.go

1304 lines
31 KiB
Go
Raw Normal View History

2017-04-10 11:59:45 -07:00
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
2016-12-22 03:05:24 -08:00
package tsdb
import (
2017-01-16 05:18:25 -08:00
"bufio"
2016-12-22 03:05:24 -08:00
"encoding/binary"
"errors"
"fmt"
"hash"
2016-12-22 03:05:24 -08:00
"hash/crc32"
"io"
"math"
"os"
"path/filepath"
"sync"
"time"
2016-12-22 03:05:24 -08:00
"github.com/go-kit/log"
"github.com/go-kit/log/level"
2017-10-04 12:51:34 -07:00
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/storage"
"github.com/prometheus/prometheus/tsdb/chunks"
"github.com/prometheus/prometheus/tsdb/encoding"
"github.com/prometheus/prometheus/tsdb/fileutil"
"github.com/prometheus/prometheus/tsdb/record"
"github.com/prometheus/prometheus/tsdb/tombstones"
"github.com/prometheus/prometheus/tsdb/wlog"
Use zeropool.Pool to workaround SA6002 (#12189) * Use zeropool.Pool to workaround SA6002 I built a tiny library called https://github.com/colega/zeropool to workaround the SA6002 staticheck issue. While searching for the references of that SA6002 staticheck issues on Github first results was Prometheus itself, with quite a lot of ignores of it. This changes the usages of `sync.Pool` to `zeropool.Pool[T]` where a pointer is not available. Also added a benchmark for HeadAppender Append/Commit when series already exist, which is one of the most usual cases IMO, as I didn't find any. Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Improve BenchmarkHeadAppender with more cases Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * A little copying is better than a little dependency https://www.youtube.com/watch?v=PAAkCSZUG1c&t=9m28s Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Fix imports order Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Add license header Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Copyright should be on one of the first 3 lines Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Use require.Equal for testing I don't depend on testify in my lib, but here we have it available. Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Avoid flaky test Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Also use zeropool for pointsPool in engine.go Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> --------- Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com>
2023-03-29 12:34:34 -07:00
"github.com/prometheus/prometheus/util/zeropool"
2016-12-22 03:05:24 -08:00
)
// WALEntryType indicates what data a WAL entry contains.
type WALEntryType uint8

const (
	// WALMagic is a 4 byte number every WAL segment file starts with.
	WALMagic = uint32(0x43AF00EF)

	// WALFormatDefault is the version flag for the default outer segment file format.
	WALFormatDefault = byte(1)
)

// Entry types in a segment file. The value is written as the first byte of
// every entry header (see writeTo).
const (
	WALEntrySymbols WALEntryType = 1
	WALEntrySeries  WALEntryType = 2
	WALEntrySamples WALEntryType = 3
	WALEntryDeletes WALEntryType = 4
)
2017-10-04 12:51:34 -07:00
type walMetrics struct {
fsyncDuration prometheus.Summary
2017-10-06 04:50:20 -07:00
corruptions prometheus.Counter
2017-10-04 12:51:34 -07:00
}
func newWalMetrics(r prometheus.Registerer) *walMetrics {
2017-10-04 12:51:34 -07:00
m := &walMetrics{}
m.fsyncDuration = prometheus.NewSummary(prometheus.SummaryOpts{
Name: "prometheus_tsdb_wal_fsync_duration_seconds",
Help: "Duration of WAL fsync.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
2017-10-04 12:51:34 -07:00
})
2017-10-06 04:50:20 -07:00
m.corruptions = prometheus.NewCounter(prometheus.CounterOpts{
Name: "prometheus_tsdb_wal_corruptions_total",
2017-10-06 04:50:20 -07:00
Help: "Total number of WAL corruptions.",
})
2017-10-04 12:51:34 -07:00
if r != nil {
r.MustRegister(
m.fsyncDuration,
2017-10-06 04:50:20 -07:00
m.corruptions,
2017-10-04 12:51:34 -07:00
)
}
return m
}
2017-05-13 08:09:26 -07:00
// WAL is a write ahead log that can log new series labels and samples.
// It must be completely read before new entries are logged.
//
// Deprecated: use wlog pkg combined with the record codex instead.
2017-05-13 08:09:26 -07:00
type WAL interface {
Reader() WALReader
LogSeries([]record.RefSeries) error
LogSamples([]record.RefSample) error
LogDeletes([]tombstones.Stone) error
Truncate(mint int64, keep func(uint64) bool) error
2017-05-13 08:09:26 -07:00
Close() error
}
// WALReader reads entries from a WAL.
type WALReader interface {
Read(
seriesf func([]record.RefSeries),
samplesf func([]record.RefSample),
deletesf func([]tombstones.Stone),
) error
2017-05-13 08:09:26 -07:00
}
// segmentFile wraps a file object of a segment and tracks the highest timestamp
// it contains. During WAL truncating, all segments with no higher timestamp than
// the truncation threshold can be compacted.
type segmentFile struct {
	*os.File
	maxTime   int64                // highest tombstone or sample timestamp in segment
	minSeries chunks.HeadSeriesRef // lowest series ID in segment
}
// newSegmentFile wraps f in a segmentFile whose bounds start at their
// sentinel extremes (maxTime at the smallest int64, minSeries at the largest
// uint64), so the first timestamp or series ref compared against them wins.
func newSegmentFile(f *os.File) *segmentFile {
	sf := new(segmentFile)
	sf.File = f
	sf.maxTime = math.MinInt64
	sf.minSeries = math.MaxUint64
	return sf
}
2017-02-13 23:53:19 -08:00
const (
	// walSegmentSizeBytes is the default size of a segment file; it seeds
	// SegmentWAL.segmentSize in OpenSegmentWAL.
	walSegmentSizeBytes = 256 * 1024 * 1024 // 256 MB
)
// castagnoliTable is the shared CRC32 table using the Castagnoli polynomial.
// The crc32 package builds it lazily behind a sync.Once, which may still race
// with other uses of the package, so it is built eagerly while package-level
// variables are initialized.
var castagnoliTable = crc32.MakeTable(crc32.Castagnoli)

// newCRC32 returns a fresh CRC32 hash over the preconfigured polynomial.
// Keeping construction in one place allows the polynomial to be changed in a
// single location later, if necessary.
func newCRC32() hash.Hash32 {
	h := crc32.New(castagnoliTable)
	return h
}
// SegmentWAL is a write ahead log for series data.
//
// Deprecated: use wlog pkg combined with the record coders instead.
type SegmentWAL struct {
mtx sync.Mutex
2017-10-04 12:51:34 -07:00
metrics *walMetrics
dirFile *os.File
files []*segmentFile
logger log.Logger
flushInterval time.Duration
segmentSize int64
crc32 hash.Hash32
cur *bufio.Writer
curN int64
stopc chan struct{}
donec chan struct{}
2017-11-01 10:11:09 -07:00
actorc chan func() error // sequentialized background operations
buffers sync.Pool
}
2017-05-13 08:09:26 -07:00
// OpenSegmentWAL opens or creates a write ahead log in the given directory.
2016-12-22 06:18:33 -08:00
// The WAL must be read completely before new data is written.
2017-10-04 12:51:34 -07:00
func OpenSegmentWAL(dir string, logger log.Logger, flushInterval time.Duration, r prometheus.Registerer) (*SegmentWAL, error) {
if err := os.MkdirAll(dir, 0o777); err != nil {
2016-12-22 03:05:24 -08:00
return nil, err
}
2017-02-13 23:53:19 -08:00
df, err := fileutil.OpenDir(dir)
2016-12-22 07:14:34 -08:00
if err != nil {
return nil, err
}
if logger == nil {
logger = log.NewNopLogger()
}
2016-12-22 03:05:24 -08:00
2017-05-13 08:09:26 -07:00
w := &SegmentWAL{
2017-02-13 23:53:19 -08:00
dirFile: df,
logger: logger,
flushInterval: flushInterval,
donec: make(chan struct{}),
stopc: make(chan struct{}),
actorc: make(chan func() error, 2),
2017-02-14 15:54:52 -08:00
segmentSize: walSegmentSizeBytes,
crc32: newCRC32(),
2016-12-22 03:05:24 -08:00
}
w.metrics = newWalMetrics(r)
fns, err := sequenceFiles(w.dirFile.Name())
if err != nil {
2017-02-13 23:53:19 -08:00
return nil, err
}
2017-10-20 03:12:49 -07:00
for i, fn := range fns {
f, err := w.openSegmentFile(fn)
2017-10-20 03:12:49 -07:00
if err == nil {
w.files = append(w.files, newSegmentFile(f))
continue
}
level.Warn(logger).Log("msg", "Invalid segment file detected, truncating WAL", "err", err, "file", fn)
2017-10-20 03:12:49 -07:00
for _, fn := range fns[i:] {
if err := os.Remove(fn); err != nil {
return w, fmt.Errorf("removing segment failed: %w", err)
2017-10-20 03:12:49 -07:00
}
}
2017-10-20 03:12:49 -07:00
break
}
2017-02-13 23:53:19 -08:00
go w.run(flushInterval)
2016-12-22 03:05:24 -08:00
return w, nil
}
// repairingWALReader wraps a WAL reader and truncates its underlying SegmentWAL after the last
// valid entry if it encounters corruption.
type repairingWALReader struct {
	wal *SegmentWAL // WAL that gets truncated when r reports a corruption
	r   WALReader   // wrapped reader that performs the actual decoding
}
func (r *repairingWALReader) Read(
seriesf func([]record.RefSeries),
samplesf func([]record.RefSample),
deletesf func([]tombstones.Stone),
) error {
err := r.r.Read(seriesf, samplesf, deletesf)
if err == nil {
return nil
}
var cerr *walCorruptionErr
if !errors.As(err, &cerr) {
return err
}
2017-10-06 04:50:20 -07:00
r.wal.metrics.corruptions.Inc()
return r.wal.truncate(cerr.err, cerr.file, cerr.lastOffset)
}
// truncate cuts the WAL off after the last valid entry. err is the corruption
// that triggered the truncation (only logged), file is the index of the
// affected segment in w.files, and lastOffset is the position right after the
// last good entry in that segment.
func (w *SegmentWAL) truncate(err error, file int, lastOffset int64) error {
	corrupted := w.files[file]
	level.Error(w.logger).Log(
		"msg", "WAL corruption detected; truncating",
		"err", err, "file", corrupted.Name(), "pos", lastOffset,
	)

	// Every segment following the corrupted one is closed and deleted.
	for _, sf := range w.files[file+1:] {
		if closeErr := sf.Close(); closeErr != nil {
			return closeErr
		}
		if removeErr := os.Remove(sf.Name()); removeErr != nil {
			return removeErr
		}
	}

	w.mtx.Lock()
	defer w.mtx.Unlock()

	w.files = w.files[:file+1]

	// Position the remaining last file at the last valid offset; writing
	// continues from there.
	_, seekErr := w.files[file].Seek(lastOffset, io.SeekStart)
	return seekErr
}
// Reader returns a new reader over the write ahead log data.
2017-02-14 21:54:59 -08:00
// It must be completely consumed before writing to the WAL.
func (w *SegmentWAL) Reader() WALReader {
return &repairingWALReader{
wal: w,
r: newWALReader(w.files, w.logger),
}
}
// getBuffer hands out an encoding buffer from the pool, allocating a new one
// with 64 KiB of capacity when the pool has nothing to offer.
func (w *SegmentWAL) getBuffer() *encoding.Encbuf {
	if pooled := w.buffers.Get(); pooled != nil {
		return pooled.(*encoding.Encbuf)
	}
	return &encoding.Encbuf{B: make([]byte, 0, 64*1024)}
}
func (w *SegmentWAL) putBuffer(b *encoding.Encbuf) {
b.Reset()
w.buffers.Put(b)
2016-12-22 06:18:33 -08:00
}
// Truncate deletes the values prior to mint and the series which the keep function
// does not indicate to preserve.
func (w *SegmentWAL) Truncate(mint int64, keep func(chunks.HeadSeriesRef) bool) error {
// The last segment is always active.
if len(w.files) < 2 {
return nil
}
var candidates []*segmentFile
// All files have to be traversed as there could be two segments for a block
// with first block having times (10000, 20000) and SECOND one having (0, 10000).
for _, sf := range w.files[:len(w.files)-1] {
if sf.maxTime >= mint {
break
}
// Past WAL files are closed. We have to reopen them for another read.
f, err := w.openSegmentFile(sf.Name())
if err != nil {
return fmt.Errorf("open old WAL segment for read: %w", err)
}
candidates = append(candidates, &segmentFile{
File: f,
minSeries: sf.minSeries,
maxTime: sf.maxTime,
})
}
if len(candidates) == 0 {
return nil
}
r := newWALReader(candidates, w.logger)
// Create a new tmp file.
f, err := w.createSegmentFile(filepath.Join(w.dirFile.Name(), "compact.tmp"))
if err != nil {
return fmt.Errorf("create compaction segment: %w", err)
}
defer func() {
if err := os.RemoveAll(f.Name()); err != nil {
level.Error(w.logger).Log("msg", "remove tmp file", "err", err.Error())
}
}()
var (
csf = newSegmentFile(f)
crc32 = newCRC32()
decSeries = []record.RefSeries{}
activeSeries = []record.RefSeries{}
)
for r.next() {
rt, flag, byt := r.at()
if rt != WALEntrySeries {
continue
}
decSeries = decSeries[:0]
activeSeries = activeSeries[:0]
err := r.decodeSeries(flag, byt, &decSeries)
if err != nil {
return fmt.Errorf("decode samples while truncating: %w", err)
}
for _, s := range decSeries {
if keep(s.Ref) {
activeSeries = append(activeSeries, s)
}
}
buf := w.getBuffer()
flag = w.encodeSeries(buf, activeSeries)
_, err = w.writeTo(csf, crc32, WALEntrySeries, flag, buf.Get())
w.putBuffer(buf)
if err != nil {
return fmt.Errorf("write to compaction segment: %w", err)
}
}
if err := r.Err(); err != nil {
return fmt.Errorf("read candidate WAL files: %w", err)
}
off, err := csf.Seek(0, io.SeekCurrent)
2017-09-06 07:20:37 -07:00
if err != nil {
return err
}
if err := csf.Truncate(off); err != nil {
return err
}
if err := csf.Sync(); err != nil {
return nil
}
if err := csf.Close(); err != nil {
return nil
}
_ = candidates[0].Close() // need close before remove on platform windows
if err := fileutil.Replace(csf.Name(), candidates[0].Name()); err != nil {
return fmt.Errorf("rename compaction segment: %w", err)
}
for _, f := range candidates[1:] {
f.Close() // need close before remove on platform windows
if err := os.RemoveAll(f.Name()); err != nil {
return fmt.Errorf("delete WAL segment file: %w", err)
}
}
2017-09-06 07:20:37 -07:00
if err := w.dirFile.Sync(); err != nil {
return err
}
2017-09-07 01:58:34 -07:00
// The file object of csf still holds the name before rename. Recreate it so
2018-04-08 02:28:30 -07:00
// subsequent truncations do not look at a non-existent file name.
2017-09-07 01:58:34 -07:00
csf.File, err = w.openSegmentFile(candidates[0].Name())
if err != nil {
return err
}
// We don't need it to be open.
if err := csf.Close(); err != nil {
return err
}
2017-09-07 01:58:34 -07:00
2017-09-06 07:20:37 -07:00
w.mtx.Lock()
w.files = append([]*segmentFile{csf}, w.files[len(candidates):]...)
w.mtx.Unlock()
return nil
}
// LogSeries writes a batch of new series labels to the log.
// The series have to be ordered.
func (w *SegmentWAL) LogSeries(series []record.RefSeries) error {
buf := w.getBuffer()
flag := w.encodeSeries(buf, series)
2017-09-07 23:48:19 -07:00
w.mtx.Lock()
defer w.mtx.Unlock()
err := w.write(WALEntrySeries, flag, buf.Get())
w.putBuffer(buf)
if err != nil {
return fmt.Errorf("log series: %w", err)
}
tf := w.head()
for _, s := range series {
if tf.minSeries > s.Ref {
tf.minSeries = s.Ref
}
2016-12-22 03:05:24 -08:00
}
return nil
}
// LogSamples writes a batch of new samples to the log.
func (w *SegmentWAL) LogSamples(samples []record.RefSample) error {
buf := w.getBuffer()
flag := w.encodeSamples(buf, samples)
2017-09-07 23:48:19 -07:00
w.mtx.Lock()
defer w.mtx.Unlock()
err := w.write(WALEntrySamples, flag, buf.Get())
w.putBuffer(buf)
if err != nil {
return fmt.Errorf("log series: %w", err)
}
tf := w.head()
for _, s := range samples {
if tf.maxTime < s.T {
tf.maxTime = s.T
}
2016-12-22 03:05:24 -08:00
}
return nil
}
// LogDeletes write a batch of new deletes to the log.
func (w *SegmentWAL) LogDeletes(stones []tombstones.Stone) error {
buf := w.getBuffer()
flag := w.encodeDeletes(buf, stones)
2017-09-07 23:48:19 -07:00
w.mtx.Lock()
defer w.mtx.Unlock()
err := w.write(WALEntryDeletes, flag, buf.Get())
w.putBuffer(buf)
if err != nil {
return fmt.Errorf("log series: %w", err)
}
tf := w.head()
for _, s := range stones {
for _, iv := range s.Intervals {
if tf.maxTime < iv.Maxt {
tf.maxTime = iv.Maxt
}
}
}
2016-12-22 03:05:24 -08:00
return nil
}
// openSegmentFile opens the given segment file and consumes and validates header.
func (w *SegmentWAL) openSegmentFile(name string) (*os.File, error) {
2017-04-28 06:41:42 -07:00
// We must open all files in read/write mode as we may have to truncate along
// the way and any file may become the head.
f, err := os.OpenFile(name, os.O_RDWR, 0o666)
if err != nil {
return nil, err
2017-02-13 23:53:19 -08:00
}
metab := make([]byte, 8)
2017-02-13 23:53:19 -08:00
// If there is an error, we need close f for platform windows before gc.
// Otherwise, file op may fail.
hasError := true
defer func() {
if hasError {
f.Close()
}
}()
switch n, err := f.Read(metab); {
case err != nil:
return nil, fmt.Errorf("validate meta %q: %w", f.Name(), err)
case n != 8:
return nil, fmt.Errorf("invalid header size %d in %q", n, f.Name())
}
if m := binary.BigEndian.Uint32(metab[:4]); m != WALMagic {
return nil, fmt.Errorf("invalid magic header %x in %q", m, f.Name())
}
if metab[4] != WALFormatDefault {
return nil, fmt.Errorf("unknown WAL segment format %d in %q", metab[4], f.Name())
}
hasError = false
return f, nil
}
// createSegmentFile creates a new segment file with the given name. It preallocates
// the standard segment size if possible and writes the header.
func (w *SegmentWAL) createSegmentFile(name string) (*os.File, error) {
f, err := os.Create(name)
if err != nil {
return nil, err
}
if err = fileutil.Preallocate(f, w.segmentSize, true); err != nil {
return nil, err
}
// Write header metadata for new file.
metab := make([]byte, 8)
binary.BigEndian.PutUint32(metab[:4], WALMagic)
metab[4] = WALFormatDefault
if _, err := f.Write(metab); err != nil {
return nil, err
}
return f, err
2017-02-13 23:53:19 -08:00
}
2017-04-28 06:41:42 -07:00
// cut finishes the currently active segments and opens the next one.
2017-02-13 23:53:19 -08:00
// The encoder is reset to point to the new segment.
2017-05-13 08:09:26 -07:00
func (w *SegmentWAL) cut() error {
// Sync current head to disk and close.
if hf := w.head(); hf != nil {
if err := w.flush(); err != nil {
2017-02-13 23:53:19 -08:00
return err
}
// Finish last segment asynchronously to not block the WAL moving along
// in the new segment.
go func() {
w.actorc <- func() error {
off, err := hf.Seek(0, io.SeekCurrent)
if err != nil {
return fmt.Errorf("finish old segment %s: %w", hf.Name(), err)
}
if err := hf.Truncate(off); err != nil {
return fmt.Errorf("finish old segment %s: %w", hf.Name(), err)
}
if err := hf.Sync(); err != nil {
return fmt.Errorf("finish old segment %s: %w", hf.Name(), err)
}
if err := hf.Close(); err != nil {
return fmt.Errorf("finish old segment %s: %w", hf.Name(), err)
}
return nil
}
}()
2017-02-13 23:53:19 -08:00
}
p, _, err := nextSequenceFile(w.dirFile.Name())
2017-02-13 23:53:19 -08:00
if err != nil {
return err
}
f, err := w.createSegmentFile(p)
2017-02-13 23:53:19 -08:00
if err != nil {
return err
}
go func() {
w.actorc <- func() error {
if err := w.dirFile.Sync(); err != nil {
return fmt.Errorf("sync WAL directory: %w", err)
}
return nil
}
}()
w.files = append(w.files, newSegmentFile(f))
// TODO(gouthamve): make the buffer size a constant.
w.cur = bufio.NewWriterSize(f, 8*1024*1024)
2017-02-14 15:54:52 -08:00
w.curN = 8
2017-02-13 23:53:19 -08:00
return nil
}
func (w *SegmentWAL) head() *segmentFile {
2017-02-13 23:53:19 -08:00
if len(w.files) == 0 {
return nil
}
return w.files[len(w.files)-1]
2017-02-13 23:53:19 -08:00
}
2017-03-19 09:05:01 -07:00
// Sync flushes the changes to disk.
2017-05-13 08:09:26 -07:00
func (w *SegmentWAL) Sync() error {
var head *segmentFile
var err error
// Flush the writer and retrieve the reference to the head segment under mutex lock.
func() {
w.mtx.Lock()
defer w.mtx.Unlock()
if err = w.flush(); err != nil {
return
}
head = w.head()
2017-09-01 05:38:49 -07:00
}()
if err != nil {
return fmt.Errorf("flush buffer: %w", err)
}
if head != nil {
// But only fsync the head segment after releasing the mutex as it will block on disk I/O.
2017-10-04 12:51:34 -07:00
start := time.Now()
err := fileutil.Fdatasync(head.File)
w.metrics.fsyncDuration.Observe(time.Since(start).Seconds())
return err
}
return nil
2017-02-13 23:53:19 -08:00
}
2017-05-13 08:09:26 -07:00
func (w *SegmentWAL) sync() error {
if err := w.flush(); err != nil {
2016-12-22 07:14:34 -08:00
return err
}
if w.head() == nil {
return nil
}
2017-10-04 12:51:34 -07:00
start := time.Now()
err := fileutil.Fdatasync(w.head().File)
w.metrics.fsyncDuration.Observe(time.Since(start).Seconds())
return err
2016-12-22 03:05:24 -08:00
}
// flush pushes buffered data of the current writer down to the head segment
// file. It does nothing when no writer has been set up yet.
func (w *SegmentWAL) flush() error {
	if w.cur != nil {
		return w.cur.Flush()
	}
	return nil
}
2017-05-13 08:09:26 -07:00
func (w *SegmentWAL) run(interval time.Duration) {
var tick <-chan time.Time
if interval > 0 {
ticker := time.NewTicker(interval)
defer ticker.Stop()
tick = ticker.C
}
defer close(w.donec)
for {
// Processing all enqueued operations has precedence over shutdown and
// background syncs.
select {
case f := <-w.actorc:
if err := f(); err != nil {
level.Error(w.logger).Log("msg", "operation failed", "err", err)
}
continue
default:
}
select {
case <-w.stopc:
return
case f := <-w.actorc:
if err := f(); err != nil {
level.Error(w.logger).Log("msg", "operation failed", "err", err)
}
case <-tick:
2017-02-13 23:53:19 -08:00
if err := w.Sync(); err != nil {
level.Error(w.logger).Log("msg", "sync failed", "err", err)
}
}
}
}
2017-04-28 06:41:42 -07:00
// Close syncs all data and closes the underlying resources.
2017-05-13 08:09:26 -07:00
func (w *SegmentWAL) Close() error {
// Make sure you can call Close() multiple times.
select {
case <-w.stopc:
return nil // Already closed.
default:
}
close(w.stopc)
<-w.donec
2017-02-13 23:53:19 -08:00
w.mtx.Lock()
defer w.mtx.Unlock()
2017-02-13 23:53:19 -08:00
2016-12-22 03:05:24 -08:00
if err := w.sync(); err != nil {
return err
}
2017-02-14 15:54:52 -08:00
// On opening, a WAL must be fully consumed once. Afterwards
// only the current segment will still be open.
if hf := w.head(); hf != nil {
if err := hf.Close(); err != nil {
return fmt.Errorf("closing WAL head %s: %w", hf.Name(), err)
}
2017-02-14 21:54:59 -08:00
}
if err := w.dirFile.Close(); err != nil {
return fmt.Errorf("closing WAL dir %s: %w", w.dirFile.Name(), err)
}
return nil
2016-12-22 03:05:24 -08:00
}
func (w *SegmentWAL) write(t WALEntryType, flag uint8, buf []byte) error {
2017-04-28 06:41:42 -07:00
// Cut to the next segment if the entry exceeds the file size unless it would also
2017-02-14 21:54:59 -08:00
// exceed the size of a new segment.
// TODO(gouthamve): Add a test for this case where the commit is greater than segmentSize.
2017-02-14 21:54:59 -08:00
var (
sz = int64(len(buf)) + 6
2017-02-14 21:54:59 -08:00
newsz = w.curN + sz
)
// XXX(fabxc): this currently cuts a new file whenever the WAL was newly opened.
// Probably fine in general but may yield a lot of short files in some cases.
2017-02-14 21:54:59 -08:00
if w.cur == nil || w.curN > w.segmentSize || newsz > w.segmentSize && sz <= w.segmentSize {
2017-02-13 23:53:19 -08:00
if err := w.cut(); err != nil {
return err
}
}
n, err := w.writeTo(w.cur, w.crc32, t, flag, buf)
w.curN += int64(n)
2017-01-06 09:36:42 -08:00
return err
}
func (w *SegmentWAL) writeTo(wr io.Writer, crc32 hash.Hash, t WALEntryType, flag uint8, buf []byte) (int, error) {
if len(buf) == 0 {
return 0, nil
}
crc32.Reset()
wr = io.MultiWriter(crc32, wr)
2016-12-22 03:05:24 -08:00
var b [6]byte
b[0] = byte(t)
2016-12-22 03:05:24 -08:00
b[1] = flag
2017-01-06 09:36:42 -08:00
binary.BigEndian.PutUint32(b[2:], uint32(len(buf)))
2016-12-22 03:05:24 -08:00
n1, err := wr.Write(b[:])
if err != nil {
return n1, err
2016-12-22 03:05:24 -08:00
}
n2, err := wr.Write(buf)
if err != nil {
return n1 + n2, err
2016-12-22 03:05:24 -08:00
}
n3, err := wr.Write(crc32.Sum(b[:0]))
2016-12-22 03:05:24 -08:00
return n1 + n2 + n3, err
2016-12-22 03:05:24 -08:00
}
// Encoding flags returned by the encode* methods and stored as the second
// byte of an entry header.
const (
	walSeriesSimple  = 1
	walSamplesSimple = 1
	walDeletesSimple = 1
)
func (w *SegmentWAL) encodeSeries(buf *encoding.Encbuf, series []record.RefSeries) uint8 {
for _, s := range series {
buf.PutBE64(uint64(s.Ref))
record.EncodeLabels(buf, s.Labels)
2016-12-22 03:05:24 -08:00
}
return walSeriesSimple
2016-12-22 03:05:24 -08:00
}
func (w *SegmentWAL) encodeSamples(buf *encoding.Encbuf, samples []record.RefSample) uint8 {
2016-12-22 03:05:24 -08:00
if len(samples) == 0 {
return walSamplesSimple
2016-12-22 03:05:24 -08:00
}
// Store base timestamp and base reference number of first sample.
// All samples encode their timestamp and ref as delta to those.
//
// TODO(fabxc): optimize for all samples having the same timestamp.
first := samples[0]
buf.PutBE64(uint64(first.Ref))
buf.PutBE64int64(first.T)
2016-12-22 03:05:24 -08:00
for _, s := range samples {
buf.PutVarint64(int64(s.Ref) - int64(first.Ref))
buf.PutVarint64(s.T - first.T)
buf.PutBE64(math.Float64bits(s.V))
2016-12-22 03:05:24 -08:00
}
return walSamplesSimple
2016-12-22 03:05:24 -08:00
}
// encodeDeletes appends one record per tombstone interval to buf: the series
// reference (big-endian 64 bit) followed by the interval's min and max time as
// varints. It returns the encoding flag to store in the entry header.
func (w *SegmentWAL) encodeDeletes(buf *encoding.Encbuf, stones []tombstones.Stone) uint8 {
	for i := range stones {
		ref := uint64(stones[i].Ref)
		for _, interval := range stones[i].Intervals {
			buf.PutBE64(ref)
			buf.PutVarint64(interval.Mint)
			buf.PutVarint64(interval.Maxt)
		}
	}
	return walDeletesSimple
}
// walReader decodes and emits write ahead log entries.
type walReader struct {
logger log.Logger
files []*segmentFile
cur int
buf []byte
crc32 hash.Hash32
dec record.Decoder
2017-02-14 21:54:59 -08:00
curType WALEntryType
curFlag byte
curBuf []byte
lastOffset int64 // offset after last successfully read entry
err error
2016-12-22 06:18:33 -08:00
}
func newWALReader(files []*segmentFile, l log.Logger) *walReader {
if l == nil {
l = log.NewNopLogger()
}
return &walReader{
logger: l,
files: files,
buf: make([]byte, 0, 128*4096),
crc32: newCRC32(),
dec: record.NewDecoder(labels.NewSymbolTable()),
2016-12-22 06:18:33 -08:00
}
}
2017-02-14 21:54:59 -08:00
// Err returns the last error the reader encountered.
func (r *walReader) Err() error {
2017-02-14 21:54:59 -08:00
return r.err
}
func (r *walReader) Read(
seriesf func([]record.RefSeries),
samplesf func([]record.RefSample),
deletesf func([]tombstones.Stone),
) error {
// Concurrency for replaying the WAL is very limited. We at least split out decoding and
// processing into separate threads.
// Historically, the processing is the bottleneck with reading and decoding using only
// 15% of the CPU.
var (
Use zeropool.Pool to workaround SA6002 (#12189) * Use zeropool.Pool to workaround SA6002 I built a tiny library called https://github.com/colega/zeropool to workaround the SA6002 staticheck issue. While searching for the references of that SA6002 staticheck issues on Github first results was Prometheus itself, with quite a lot of ignores of it. This changes the usages of `sync.Pool` to `zeropool.Pool[T]` where a pointer is not available. Also added a benchmark for HeadAppender Append/Commit when series already exist, which is one of the most usual cases IMO, as I didn't find any. Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Improve BenchmarkHeadAppender with more cases Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * A little copying is better than a little dependency https://www.youtube.com/watch?v=PAAkCSZUG1c&t=9m28s Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Fix imports order Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Add license header Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Copyright should be on one of the first 3 lines Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Use require.Equal for testing I don't depend on testify in my lib, but here we have it available. Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Avoid flaky test Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Also use zeropool for pointsPool in engine.go Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> --------- Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com>
2023-03-29 12:34:34 -07:00
seriesPool zeropool.Pool[[]record.RefSeries]
samplePool zeropool.Pool[[]record.RefSample]
deletePool zeropool.Pool[[]tombstones.Stone]
)
donec := make(chan struct{})
2017-10-07 06:55:11 -07:00
datac := make(chan interface{}, 100)
go func() {
defer close(donec)
for x := range datac {
switch v := x.(type) {
case []record.RefSeries:
if seriesf != nil {
seriesf(v)
}
seriesPool.Put(v[:0])
case []record.RefSample:
if samplesf != nil {
samplesf(v)
}
samplePool.Put(v[:0])
case []tombstones.Stone:
if deletesf != nil {
deletesf(v)
}
deletePool.Put(v[:0])
default:
level.Error(r.logger).Log("msg", "unexpected data type")
}
}
}()
var err error
for r.next() {
et, flag, b := r.at()
// In decoding below we never return a walCorruptionErr for now.
// Those should generally be caught by entry decoding before.
switch et {
case WALEntrySeries:
Use zeropool.Pool to workaround SA6002 (#12189) * Use zeropool.Pool to workaround SA6002 I built a tiny library called https://github.com/colega/zeropool to workaround the SA6002 staticheck issue. While searching for the references of that SA6002 staticheck issues on Github first results was Prometheus itself, with quite a lot of ignores of it. This changes the usages of `sync.Pool` to `zeropool.Pool[T]` where a pointer is not available. Also added a benchmark for HeadAppender Append/Commit when series already exist, which is one of the most usual cases IMO, as I didn't find any. Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Improve BenchmarkHeadAppender with more cases Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * A little copying is better than a little dependency https://www.youtube.com/watch?v=PAAkCSZUG1c&t=9m28s Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Fix imports order Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Add license header Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Copyright should be on one of the first 3 lines Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Use require.Equal for testing I don't depend on testify in my lib, but here we have it available. Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Avoid flaky test Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Also use zeropool for pointsPool in engine.go Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> --------- Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com>
2023-03-29 12:34:34 -07:00
series := seriesPool.Get()
if series == nil {
series = make([]record.RefSeries, 0, 512)
}
err = r.decodeSeries(flag, b, &series)
if err != nil {
err = fmt.Errorf("decode series entry: %w", err)
break
}
datac <- series
cf := r.current()
for _, s := range series {
if cf.minSeries > s.Ref {
cf.minSeries = s.Ref
}
}
case WALEntrySamples:
Use zeropool.Pool to workaround SA6002 (#12189) * Use zeropool.Pool to workaround SA6002 I built a tiny library called https://github.com/colega/zeropool to workaround the SA6002 staticheck issue. While searching for the references of that SA6002 staticheck issues on Github first results was Prometheus itself, with quite a lot of ignores of it. This changes the usages of `sync.Pool` to `zeropool.Pool[T]` where a pointer is not available. Also added a benchmark for HeadAppender Append/Commit when series already exist, which is one of the most usual cases IMO, as I didn't find any. Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Improve BenchmarkHeadAppender with more cases Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * A little copying is better than a little dependency https://www.youtube.com/watch?v=PAAkCSZUG1c&t=9m28s Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Fix imports order Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Add license header Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Copyright should be on one of the first 3 lines Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Use require.Equal for testing I don't depend on testify in my lib, but here we have it available. Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Avoid flaky test Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Also use zeropool for pointsPool in engine.go Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> --------- Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com>
2023-03-29 12:34:34 -07:00
samples := samplePool.Get()
if samples == nil {
samples = make([]record.RefSample, 0, 512)
}
err = r.decodeSamples(flag, b, &samples)
if err != nil {
err = fmt.Errorf("decode samples entry: %w", err)
break
}
datac <- samples
// Update the times for the WAL segment file.
cf := r.current()
for _, s := range samples {
if cf.maxTime < s.T {
cf.maxTime = s.T
}
}
case WALEntryDeletes:
Use zeropool.Pool to workaround SA6002 (#12189) * Use zeropool.Pool to workaround SA6002 I built a tiny library called https://github.com/colega/zeropool to workaround the SA6002 staticheck issue. While searching for the references of that SA6002 staticheck issues on Github first results was Prometheus itself, with quite a lot of ignores of it. This changes the usages of `sync.Pool` to `zeropool.Pool[T]` where a pointer is not available. Also added a benchmark for HeadAppender Append/Commit when series already exist, which is one of the most usual cases IMO, as I didn't find any. Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Improve BenchmarkHeadAppender with more cases Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * A little copying is better than a little dependency https://www.youtube.com/watch?v=PAAkCSZUG1c&t=9m28s Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Fix imports order Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Add license header Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Copyright should be on one of the first 3 lines Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Use require.Equal for testing I don't depend on testify in my lib, but here we have it available. Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Avoid flaky test Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> * Also use zeropool for pointsPool in engine.go Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com> --------- Signed-off-by: Oleg Zaytsev <mail@olegzaytsev.com>
2023-03-29 12:34:34 -07:00
deletes := deletePool.Get()
if deletes == nil {
deletes = make([]tombstones.Stone, 0, 512)
}
err = r.decodeDeletes(flag, b, &deletes)
if err != nil {
err = fmt.Errorf("decode delete entry: %w", err)
break
}
datac <- deletes
// Update the times for the WAL segment file.
cf := r.current()
for _, s := range deletes {
for _, iv := range s.Intervals {
if cf.maxTime < iv.Maxt {
cf.maxTime = iv.Maxt
}
}
}
}
}
close(datac)
<-donec
if err != nil {
return err
2017-02-14 21:54:59 -08:00
}
if err := r.Err(); err != nil {
return fmt.Errorf("read entry: %w", err)
2017-02-14 21:54:59 -08:00
}
return nil
2017-02-14 21:54:59 -08:00
}
// at returns the type, flag and payload that the reader currently holds in
// curType, curFlag and curBuf, i.e. the values stored by the last successful
// call to next.
func (r *walReader) at() (WALEntryType, byte, []byte) {
	typ, flag, payload := r.curType, r.curFlag, r.curBuf
	return typ, flag, payload
}
// next returns decodes the next entry pair and returns true
2018-04-08 02:28:30 -07:00
// if it was successful.
func (r *walReader) next() bool {
if r.cur >= len(r.files) {
return false
}
cf := r.files[r.cur]
// Remember the offset after the last correctly read entry. If the next one
// is corrupted, this is where we can safely truncate.
r.lastOffset, r.err = cf.Seek(0, io.SeekCurrent)
if r.err != nil {
return false
}
et, flag, b, err := r.entry(cf)
// If we reached the end of the reader, advance to the next one
// and close.
// Do not close on the last one as it will still be appended to.
if errors.Is(err, io.EOF) {
if r.cur == len(r.files)-1 {
return false
}
// Current reader completed, close and move to the next one.
if err := cf.Close(); err != nil {
2017-02-14 21:54:59 -08:00
r.err = err
return false
}
r.cur++
return r.next()
}
if err != nil {
r.err = err
2017-02-14 21:54:59 -08:00
return false
2017-02-14 15:54:52 -08:00
}
2017-02-14 21:54:59 -08:00
r.curType = et
r.curFlag = flag
r.curBuf = b
2017-02-14 21:54:59 -08:00
return r.err == nil
}
// current returns the segment file at the reader's current file index.
func (r *walReader) current() *segmentFile {
	idx := r.cur
	return r.files[idx]
}
// walCorruptionErr wraps an error that indicates WAL corruption and should
// trigger a truncation. It records which file the problem was found in and
// the offset captured when the error was created.
type walCorruptionErr struct {
	err        error
	file       int
	lastOffset int64
}

// Error renders the wrapped error followed by the file index and offset.
func (ce *walCorruptionErr) Error() string {
	const layout = "%s <file: %d, lastOffset: %d>"
	return fmt.Sprintf(layout, ce.err, ce.file, ce.lastOffset)
}

// Unwrap exposes the wrapped error so errors.Is and errors.As can inspect it.
func (ce *walCorruptionErr) Unwrap() error {
	wrapped := ce.err
	return wrapped
}
// corruptionErr formats s with args into an error and wraps it in a
// walCorruptionErr carrying the reader's current file index and lastOffset.
func (r *walReader) corruptionErr(s string, args ...interface{}) error {
	cerr := new(walCorruptionErr)
	cerr.err = fmt.Errorf(s, args...)
	cerr.file = r.cur
	cerr.lastOffset = r.lastOffset
	return cerr
}
func (r *walReader) entry(cr io.Reader) (WALEntryType, byte, []byte, error) {
r.crc32.Reset()
tr := io.TeeReader(cr, r.crc32)
2017-02-14 21:54:59 -08:00
b := make([]byte, 6)
switch n, err := tr.Read(b); {
case err != nil:
2017-02-14 21:54:59 -08:00
return 0, 0, nil, err
case n != 6:
return 0, 0, nil, r.corruptionErr("invalid entry header size %d", n)
2017-02-14 21:54:59 -08:00
}
var (
etype = WALEntryType(b[0])
flag = b[1]
length = int(binary.BigEndian.Uint32(b[2:]))
)
// Exit if we reached pre-allocated space.
if etype == 0 {
return 0, 0, nil, io.EOF
2017-02-14 15:54:52 -08:00
}
if etype != WALEntrySeries && etype != WALEntrySamples && etype != WALEntryDeletes {
return 0, 0, nil, r.corruptionErr("invalid entry type %d", etype)
}
2017-02-14 21:54:59 -08:00
if length > len(r.buf) {
r.buf = make([]byte, length)
}
buf := r.buf[:length]
switch n, err := tr.Read(buf); {
case err != nil:
2017-02-14 21:54:59 -08:00
return 0, 0, nil, err
case n != length:
return 0, 0, nil, r.corruptionErr("invalid entry body size %d", n)
2017-02-14 21:54:59 -08:00
}
switch n, err := cr.Read(b[:4]); {
case err != nil:
2017-02-14 21:54:59 -08:00
return 0, 0, nil, err
case n != 4:
return 0, 0, nil, r.corruptionErr("invalid checksum length %d", n)
2017-02-14 21:54:59 -08:00
}
if exp, has := binary.BigEndian.Uint32(b[:4]), r.crc32.Sum32(); has != exp {
return 0, 0, nil, r.corruptionErr("unexpected CRC32 checksum %x, want %x", has, exp)
2017-02-14 21:54:59 -08:00
}
return etype, flag, buf, nil
2017-02-14 15:54:52 -08:00
}
func (r *walReader) decodeSeries(flag byte, b []byte, res *[]record.RefSeries) error {
dec := encoding.Decbuf{B: b}
for len(dec.B) > 0 && dec.Err() == nil {
ref := chunks.HeadSeriesRef(dec.Be64())
lset := r.dec.DecodeLabels(&dec)
2016-12-22 06:18:33 -08:00
*res = append(*res, record.RefSeries{
Ref: ref,
Labels: lset,
})
}
if dec.Err() != nil {
return dec.Err()
}
if len(dec.B) > 0 {
return fmt.Errorf("unexpected %d bytes left in entry", len(dec.B))
2016-12-22 06:18:33 -08:00
}
return nil
2016-12-22 06:18:33 -08:00
}
func (r *walReader) decodeSamples(flag byte, b []byte, res *[]record.RefSample) error {
if len(b) == 0 {
return nil
}
dec := encoding.Decbuf{B: b}
2016-12-22 06:18:33 -08:00
var (
baseRef = dec.Be64()
baseTime = dec.Be64int64()
2016-12-22 06:18:33 -08:00
)
for len(dec.B) > 0 && dec.Err() == nil {
dref := dec.Varint64()
dtime := dec.Varint64()
val := dec.Be64()
2016-12-22 06:18:33 -08:00
*res = append(*res, record.RefSample{
Ref: chunks.HeadSeriesRef(int64(baseRef) + dref),
T: baseTime + dtime,
V: math.Float64frombits(val),
})
}
2016-12-22 06:18:33 -08:00
if err := dec.Err(); err != nil {
return fmt.Errorf("decode error after %d samples: %w", len(*res), err)
}
if len(dec.B) > 0 {
return fmt.Errorf("unexpected %d bytes left in entry", len(dec.B))
2016-12-22 06:18:33 -08:00
}
return nil
}
func (r *walReader) decodeDeletes(flag byte, b []byte, res *[]tombstones.Stone) error {
dec := &encoding.Decbuf{B: b}
for dec.Len() > 0 && dec.Err() == nil {
*res = append(*res, tombstones.Stone{
Ref: storage.SeriesRef(dec.Be64()),
Intervals: tombstones.Intervals{
{Mint: dec.Varint64(), Maxt: dec.Varint64()},
},
})
}
if dec.Err() != nil {
return dec.Err()
}
if len(dec.B) > 0 {
return fmt.Errorf("unexpected %d bytes left in entry", len(dec.B))
}
return nil
2016-12-22 06:18:33 -08:00
}
// deprecatedWALExists reports whether dir still contains a WAL in the old
// format. It looks at the first sequence file only and checks whether its
// first four bytes, read as a big-endian uint32, equal WALMagic.
func deprecatedWALExists(logger log.Logger, dir string) (bool, error) {
	// Detect whether we still have the old WAL.
	segments, err := sequenceFiles(dir)
	if err != nil && !os.IsNotExist(err) {
		return false, fmt.Errorf("list sequence files: %w", err)
	}
	if len(segments) == 0 {
		// No WAL at all yet.
		return false, nil
	}

	// Check header of first segment to see whether we are still dealing with
	// an old WAL.
	first, err := os.Open(segments[0])
	if err != nil {
		return false, fmt.Errorf("check first existing segment: %w", err)
	}
	defer first.Close()

	magic := make([]byte, 4)
	_, err = first.Read(magic)
	if err != nil && !errors.Is(err, io.EOF) {
		return false, fmt.Errorf("read header from first segment: %w", err)
	}

	// If we cannot read the magic header for segments of the old WAL, abort.
	// Either it's migrated already or there's a corruption issue with which
	// we cannot deal here anyway. Subsequent attempts to open the WAL will
	// error in that case.
	return binary.BigEndian.Uint32(magic) == WALMagic, nil
}
// MigrateWAL rewrites the deprecated write ahead log into the new format.
//
// It does nothing and returns nil when no deprecated WAL is found in dir.
// Otherwise every series, sample and tombstone entry of the old WAL is
// re-encoded and logged into a fresh WAL created in dir+".tmp", which then
// replaces dir via fileutil.Replace. A nil logger is replaced by a no-op
// logger.
//
// The named result err is shared with the callbacks passed to the reader and
// with the deferred cleanup. The first write error stops further writes, and
// any non-nil result causes the replacement WAL to be closed.
func MigrateWAL(logger log.Logger, dir string) (err error) {
	if logger == nil {
		logger = log.NewNopLogger()
	}
	if exists, err := deprecatedWALExists(logger, dir); err != nil || !exists {
		return err
	}
	level.Info(logger).Log("msg", "Migrating WAL format")

	tmpdir := dir + ".tmp"
	if err := os.RemoveAll(tmpdir); err != nil {
		return fmt.Errorf("cleanup replacement dir: %w", err)
	}
	repl, err := wlog.New(logger, nil, tmpdir, wlog.CompressionNone)
	if err != nil {
		return fmt.Errorf("open new WAL: %w", err)
	}

	// It should've already been closed as part of the previous finalization.
	// Do it once again in case of prior errors.
	defer func() {
		if err != nil {
			repl.Close()
		}
	}()

	w, err := OpenSegmentWAL(dir, logger, time.Minute, nil)
	if err != nil {
		return fmt.Errorf("open old WAL: %w", err)
	}
	defer w.Close()

	rdr := w.Reader()

	var (
		enc record.Encoder
		b   []byte
	)
	// Each callback becomes a no-op once a write has failed, so err keeps the
	// first write error.
	decErr := rdr.Read(
		func(s []record.RefSeries) {
			if err != nil {
				return
			}
			err = repl.Log(enc.Series(s, b[:0]))
		},
		func(s []record.RefSample) {
			if err != nil {
				return
			}
			err = repl.Log(enc.Samples(s, b[:0]))
		},
		func(s []tombstones.Stone) {
			if err != nil {
				return
			}
			err = repl.Log(enc.Tombstones(s, b[:0]))
		},
	)
	if decErr != nil {
		// Wrap the reader's own error. err only tracks write failures and is
		// typically nil here.
		return fmt.Errorf("decode old entries: %w", decErr)
	}
	if err != nil {
		return fmt.Errorf("write new entries: %w", err)
	}

	// We explicitly close even when there is a defer for Windows to be
	// able to delete it. The defer is in place to close it in-case there
	// are errors above.
	if err := w.Close(); err != nil {
		return fmt.Errorf("close old WAL: %w", err)
	}
	if err := repl.Close(); err != nil {
		return fmt.Errorf("close new WAL: %w", err)
	}
	if err := fileutil.Replace(tmpdir, dir); err != nil {
		return fmt.Errorf("replace old WAL: %w", err)
	}
	return nil
}