// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
	"bufio"
	"encoding/binary"
	"hash"
	"hash/crc32"
	"io"
	"math"
	"os"
	"sync"
	"time"

	"github.com/coreos/etcd/pkg/fileutil"
	"github.com/go-kit/kit/log"
	"github.com/pkg/errors"
	"github.com/prometheus/tsdb/labels"
)

// WALEntryType indicates what data a WAL entry contains.
// It is stored as the first byte of every entry header.
type WALEntryType byte

const (
	// WALMagic is a 4 byte number every WAL segment file starts with.
	// It is written and compared in big-endian byte order.
	WALMagic = uint32(0x43AF00EF)

	// WALFormatDefault is the version flag for the default outer segment file format.
	// It occupies the fifth byte of the 8-byte segment header.
	WALFormatDefault = byte(1)
)

// Entry types in a segment file.
//
// A type byte of 0 is not a valid entry type: the reader treats it as the
// start of preallocated (unwritten) space. The reader in this file accepts
// only WALEntrySeries and WALEntrySamples and reports any other type,
// including WALEntrySymbols, as corruption.
const (
	WALEntrySymbols WALEntryType = 1
	WALEntrySeries  WALEntryType = 2
	WALEntrySamples WALEntryType = 3
)

// SegmentWAL is a write ahead log for series data.
// The log is split into sequentially numbered segment files that live in a
// single directory.
type SegmentWAL struct {
	// mtx is taken by Sync, by entry writes and by Close.
	mtx sync.Mutex

	// dirFile is the open handle of the WAL directory; it is fsynced after a
	// new segment file has been created.
	dirFile *os.File
	// files holds the segment files in sequence order. The last element is
	// the tail that new entries are appended to.
	files []*os.File

	logger log.Logger
	// flushInterval is the period of the background sync loop. If it is not
	// positive, every call to Log syncs before returning.
	flushInterval time.Duration
	// segmentSize is the size a new segment is preallocated to and the
	// threshold at which the writer cuts over to a new segment.
	segmentSize int64

	// crc32 computes the checksum appended to every written entry.
	crc32 hash.Hash32
	// cur buffers writes to the tail segment. It is nil until the first
	// segment has been cut after opening.
	cur *bufio.Writer
	// curN is the number of bytes written to the tail segment so far,
	// including the 8-byte segment header.
	curN int64

	// stopc is closed by Close to stop the background sync loop.
	stopc chan struct{}
	// donec is closed by the background sync loop when it exits.
	donec chan struct{}
}

// WAL is a write ahead log that can log new series labels and samples.
// It must be completely read before new entries are logged.
type WAL interface {
	// Reader returns a reader over the entries already in the log.
	Reader() WALReader
	// Log appends a batch of series label sets and samples.
	Log([]labels.Labels, []RefSample) error
	// Close releases the resources held by the log.
	Close() error
}

// WALReader reads entries from a WAL.
type WALReader interface {
	// At returns the series and samples decoded by the last call to Next.
	At() ([]labels.Labels, []RefSample)
	// Next advances to the next entry and reports whether one was decoded.
	Next() bool
	// Err returns the last error the reader encountered.
	Err() error
}

// RefSample is a timestamp/value pair associated with a reference to a series.
type RefSample struct { Ref uint64 T int64 V float64 } const ( walSegmentSizeBytes = 256 * 1024 * 1024 // 256 MB ) // The table gets initialized with sync.Once but may still cause a race // with any other use of the crc32 package anywhere. Thus we initialize it // before. var castagnoliTable *crc32.Table func init() { castagnoliTable = crc32.MakeTable(crc32.Castagnoli) } // OpenSegmentWAL opens or creates a write ahead log in the given directory. // The WAL must be read completely before new data is written. func OpenSegmentWAL(dir string, logger log.Logger, flushInterval time.Duration) (*SegmentWAL, error) { if err := os.MkdirAll(dir, 0777); err != nil { return nil, err } df, err := fileutil.OpenDir(dir) if err != nil { return nil, err } if logger == nil { logger = log.NewNopLogger() } w := &SegmentWAL{ dirFile: df, logger: logger, flushInterval: flushInterval, donec: make(chan struct{}), stopc: make(chan struct{}), segmentSize: walSegmentSizeBytes, crc32: crc32.New(castagnoliTable), } if err := w.initSegments(); err != nil { return nil, err } go w.run(flushInterval) return w, nil } // Reader returns a new reader over the the write ahead log data. // It must be completely consumed before writing to the WAL. func (w *SegmentWAL) Reader() WALReader { return newWALReader(w, w.logger) } // Log writes a batch of new series labels and samples to the log. func (w *SegmentWAL) Log(series []labels.Labels, samples []RefSample) error { if err := w.encodeSeries(series); err != nil { return err } if err := w.encodeSamples(samples); err != nil { return err } if w.flushInterval <= 0 { return w.Sync() } return nil } // initSegments finds all existing segment files and opens them in the // appropriate file modes. 
func (w *SegmentWAL) initSegments() error { fns, err := sequenceFiles(w.dirFile.Name(), "") if err != nil { return err } if len(fns) == 0 { return nil } // We must open all files in read/write mode as we may have to truncate along // the way and any file may become the tail. for _, fn := range fns { f, err := os.OpenFile(fn, os.O_RDWR, 0666) if err != nil { return err } w.files = append(w.files, f) } // Consume and validate meta headers. for _, f := range w.files { metab := make([]byte, 8) if n, err := f.Read(metab); err != nil { return errors.Wrapf(err, "validate meta %q", f.Name()) } else if n != 8 { return errors.Errorf("invalid header size %d in %q", n, f.Name()) } if m := binary.BigEndian.Uint32(metab[:4]); m != WALMagic { return errors.Errorf("invalid magic header %x in %q", m, f.Name()) } if metab[4] != WALFormatDefault { return errors.Errorf("unknown WAL segment format %d in %q", metab[4], f.Name()) } } return nil } // cut finishes the currently active segments and opens the next one. // The encoder is reset to point to the new segment. func (w *SegmentWAL) cut() error { // Sync current tail to disk and close. if tf := w.tail(); tf != nil { if err := w.sync(); err != nil { return err } off, err := tf.Seek(0, os.SEEK_CUR) if err != nil { return err } if err := tf.Truncate(off); err != nil { return err } if err := tf.Close(); err != nil { return err } } p, _, err := nextSequenceFile(w.dirFile.Name(), "") if err != nil { return err } f, err := os.Create(p) if err != nil { return err } if err = fileutil.Preallocate(f, w.segmentSize, true); err != nil { return err } if err = w.dirFile.Sync(); err != nil { return err } // Write header metadata for new file. 
metab := make([]byte, 8) binary.BigEndian.PutUint32(metab[:4], WALMagic) metab[4] = WALFormatDefault if _, err := f.Write(metab); err != nil { return err } w.files = append(w.files, f) w.cur = bufio.NewWriterSize(f, 4*1024*1024) w.curN = 8 return nil } func (w *SegmentWAL) tail() *os.File { if len(w.files) == 0 { return nil } return w.files[len(w.files)-1] } // Sync flushes the changes to disk. func (w *SegmentWAL) Sync() error { w.mtx.Lock() defer w.mtx.Unlock() return w.sync() } func (w *SegmentWAL) sync() error { if w.cur == nil { return nil } if err := w.cur.Flush(); err != nil { return err } return fileutil.Fdatasync(w.tail()) } func (w *SegmentWAL) run(interval time.Duration) { var tick <-chan time.Time if interval > 0 { ticker := time.NewTicker(interval) defer ticker.Stop() tick = ticker.C } defer close(w.donec) for { select { case <-w.stopc: return case <-tick: if err := w.Sync(); err != nil { w.logger.Log("msg", "sync failed", "err", err) } } } } // Close syncs all data and closes the underlying resources. func (w *SegmentWAL) Close() error { close(w.stopc) <-w.donec // Lock mutex and leave it locked so we panic if there's a bug causing // the block to be used afterwards. w.mtx.Lock() if err := w.sync(); err != nil { return err } // On opening, a WAL must be fully consumed once. Afterwards // only the current segment will still be open. if tf := w.tail(); tf != nil { return errors.Wrapf(tf.Close(), "closing WAL tail %s", tf.Name()) } return nil } const ( minSectorSize = 512 // walPageBytes is the alignment for flushing records to the backing Writer. // It should be a multiple of the minimum sector size so that WAL can safely // distinguish between torn writes and ordinary data corruption. walPageBytes = 16 * minSectorSize ) func (w *SegmentWAL) entry(et WALEntryType, flag byte, buf []byte) error { w.mtx.Lock() defer w.mtx.Unlock() // Cut to the next segment if the entry exceeds the file size unless it would also // exceed the size of a new segment. 
var ( // 6-byte header + 4-byte CRC32 + buf. sz = int64(6 + 4 + len(buf)) newsz = w.curN + sz ) // XXX(fabxc): this currently cuts a new file whenever the WAL was newly opened. // Probably fine in general but may yield a lot of short files in some cases. if w.cur == nil || w.curN > w.segmentSize || newsz > w.segmentSize && sz <= w.segmentSize { if err := w.cut(); err != nil { return err } } w.crc32.Reset() wr := io.MultiWriter(w.crc32, w.cur) b := make([]byte, 6) b[0] = byte(et) b[1] = flag binary.BigEndian.PutUint32(b[2:], uint32(len(buf))) if _, err := wr.Write(b); err != nil { return err } if _, err := wr.Write(buf); err != nil { return err } if _, err := w.cur.Write(w.crc32.Sum(nil)); err != nil { return err } w.curN += sz putWALBuffer(buf) return nil } const ( walSeriesSimple = 1 walSamplesSimple = 1 ) var walBuffers = sync.Pool{} func getWALBuffer() []byte { b := walBuffers.Get() if b == nil { return make([]byte, 0, 64*1024) } return b.([]byte) } func putWALBuffer(b []byte) { b = b[:0] walBuffers.Put(b) } func (w *SegmentWAL) encodeSeries(series []labels.Labels) error { if len(series) == 0 { return nil } b := make([]byte, binary.MaxVarintLen32) buf := getWALBuffer() for _, lset := range series { n := binary.PutUvarint(b, uint64(len(lset))) buf = append(buf, b[:n]...) for _, l := range lset { n = binary.PutUvarint(b, uint64(len(l.Name))) buf = append(buf, b[:n]...) buf = append(buf, l.Name...) n = binary.PutUvarint(b, uint64(len(l.Value))) buf = append(buf, b[:n]...) buf = append(buf, l.Value...) } } return w.entry(WALEntrySeries, walSeriesSimple, buf) } func (w *SegmentWAL) encodeSamples(samples []RefSample) error { if len(samples) == 0 { return nil } b := make([]byte, binary.MaxVarintLen64) buf := getWALBuffer() // Store base timestamp and base reference number of first sample. // All samples encode their timestamp and ref as delta to those. // // TODO(fabxc): optimize for all samples having the same timestamp. 
first := samples[0] binary.BigEndian.PutUint64(b, first.Ref) buf = append(buf, b[:8]...) binary.BigEndian.PutUint64(b, uint64(first.T)) buf = append(buf, b[:8]...) for _, s := range samples { n := binary.PutVarint(b, int64(s.Ref)-int64(first.Ref)) buf = append(buf, b[:n]...) n = binary.PutVarint(b, s.T-first.T) buf = append(buf, b[:n]...) binary.BigEndian.PutUint64(b, math.Float64bits(s.V)) buf = append(buf, b[:8]...) } return w.entry(WALEntrySamples, walSamplesSimple, buf) } // walReader decodes and emits write ahead log entries. type walReader struct { logger log.Logger wal *SegmentWAL cur int buf []byte crc32 hash.Hash32 err error labels []labels.Labels samples []RefSample } func newWALReader(w *SegmentWAL, l log.Logger) *walReader { if l == nil { l = log.NewNopLogger() } return &walReader{ logger: l, wal: w, buf: make([]byte, 0, 128*4096), crc32: crc32.New(crc32.MakeTable(crc32.Castagnoli)), } } // At returns the last decoded entry of labels or samples. // The returned slices are only valid until the next call to Next(). Their elements // have to be copied to preserve them. func (r *walReader) At() ([]labels.Labels, []RefSample) { return r.labels, r.samples } // Err returns the last error the reader encountered. func (r *walReader) Err() error { return r.err } // nextEntry retrieves the next entry. It is also used as a testing hook. func (r *walReader) nextEntry() (WALEntryType, byte, []byte, error) { if r.cur >= len(r.wal.files) { return 0, 0, nil, io.EOF } cf := r.wal.files[r.cur] et, flag, b, err := r.entry(cf) // If we reached the end of the reader, advance to the next one // and close. // Do not close on the last one as it will still be appended to. if err == io.EOF && r.cur < len(r.wal.files)-1 { // Current reader completed, close and move to the next one. if err := cf.Close(); err != nil { return 0, 0, nil, err } r.cur++ return r.nextEntry() } return et, flag, b, err } // Next returns decodes the next entry pair and returns true // if it was succesful. 
func (r *walReader) Next() bool { r.labels = r.labels[:0] r.samples = r.samples[:0] if r.cur >= len(r.wal.files) { return false } cf := r.wal.files[r.cur] // Save position after last valid entry if we have to truncate the WAL. lastOffset, err := cf.Seek(0, os.SEEK_CUR) if err != nil { r.err = err return false } et, flag, b, err := r.entry(cf) // If we reached the end of the reader, advance to the next one // and close. // Do not close on the last one as it will still be appended to. if err == io.EOF { if r.cur == len(r.wal.files)-1 { return false } // Current reader completed, close and move to the next one. if err := cf.Close(); err != nil { r.err = err return false } r.cur++ return r.Next() } if err != nil { r.err = err if _, ok := err.(walCorruptionErr); ok { r.err = r.truncate(lastOffset) } return false } // In decoding below we never return a walCorruptionErr for now. // Those should generally be catched by entry decoding before. switch et { case WALEntrySamples: if err := r.decodeSamples(flag, b); err != nil { r.err = err } case WALEntrySeries: if err := r.decodeSeries(flag, b); err != nil { r.err = err } } return r.err == nil } func (r *walReader) current() *os.File { return r.wal.files[r.cur] } // truncate the WAL after the last valid entry. func (r *walReader) truncate(lastOffset int64) error { r.logger.Log("msg", "WAL corruption detected; truncating", "err", r.err, "file", r.current().Name(), "pos", lastOffset) // Close and delete all files after the current one. for _, f := range r.wal.files[r.cur+1:] { if err := f.Close(); err != nil { return err } if err := os.Remove(f.Name()); err != nil { return err } } r.wal.files = r.wal.files[:r.cur+1] // Seek the current file to the last valid offset where we continue writing from. _, err := r.current().Seek(lastOffset, os.SEEK_SET) return err } // walCorruptionErr is a type wrapper for errors that indicate WAL corruption // and trigger a truncation. 
type walCorruptionErr error func walCorruptionErrf(s string, args ...interface{}) error { return walCorruptionErr(errors.Errorf(s, args...)) } func (r *walReader) entry(cr io.Reader) (WALEntryType, byte, []byte, error) { r.crc32.Reset() tr := io.TeeReader(cr, r.crc32) b := make([]byte, 6) if n, err := tr.Read(b); err != nil { return 0, 0, nil, err } else if n != 6 { return 0, 0, nil, walCorruptionErrf("invalid entry header size %d", n) } var ( etype = WALEntryType(b[0]) flag = b[1] length = int(binary.BigEndian.Uint32(b[2:])) ) // Exit if we reached pre-allocated space. if etype == 0 { return 0, 0, nil, io.EOF } if etype != WALEntrySeries && etype != WALEntrySamples { return 0, 0, nil, walCorruptionErrf("invalid entry type %d", etype) } if length > len(r.buf) { r.buf = make([]byte, length) } buf := r.buf[:length] if n, err := tr.Read(buf); err != nil { return 0, 0, nil, err } else if n != length { return 0, 0, nil, walCorruptionErrf("invalid entry body size %d", n) } if n, err := cr.Read(b[:4]); err != nil { return 0, 0, nil, err } else if n != 4 { return 0, 0, nil, walCorruptionErrf("invalid checksum length %d", n) } if exp, has := binary.BigEndian.Uint32(b[:4]), r.crc32.Sum32(); has != exp { return 0, 0, nil, walCorruptionErrf("unexpected CRC32 checksum %x, want %x", has, exp) } return etype, flag, buf, nil } func (r *walReader) decodeSeries(flag byte, b []byte) error { for len(b) > 0 { l, n := binary.Uvarint(b) if n < 1 { return errors.Wrap(errInvalidSize, "number of labels") } b = b[n:] lset := make(labels.Labels, l) for i := 0; i < int(l); i++ { nl, n := binary.Uvarint(b) if n < 1 || len(b) < n+int(nl) { return errors.Wrap(errInvalidSize, "label name") } lset[i].Name = string(b[n : n+int(nl)]) b = b[n+int(nl):] vl, n := binary.Uvarint(b) if n < 1 || len(b) < n+int(vl) { return errors.Wrap(errInvalidSize, "label value") } lset[i].Value = string(b[n : n+int(vl)]) b = b[n+int(vl):] } r.labels = append(r.labels, lset) } return nil } func (r *walReader) 
decodeSamples(flag byte, b []byte) error { if len(b) < 16 { return errors.Wrap(errInvalidSize, "header length") } var ( baseRef = binary.BigEndian.Uint64(b) baseTime = int64(binary.BigEndian.Uint64(b[8:])) ) b = b[16:] for len(b) > 0 { var smpl RefSample dref, n := binary.Varint(b) if n < 1 { return errors.Wrap(errInvalidSize, "sample ref delta") } b = b[n:] smpl.Ref = uint64(int64(baseRef) + dref) dtime, n := binary.Varint(b) if n < 1 { return errors.Wrap(errInvalidSize, "sample timestamp delta") } b = b[n:] smpl.T = baseTime + dtime if len(b) < 8 { return errors.Wrapf(errInvalidSize, "sample value bits %d", len(b)) } smpl.V = float64(math.Float64frombits(binary.BigEndian.Uint64(b))) b = b[8:] r.samples = append(r.samples, smpl) } return nil }