// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
	"encoding/binary"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"sort"

	"github.com/oklog/ulid"
	"github.com/pkg/errors"
	"github.com/prometheus/tsdb/labels"
)

// DiskBlock handles reads against a Block of time series data.
type DiskBlock interface {
	// Dir returns the directory where block data is stored.
	Dir() string

	// Meta returns meta information about the block.
	Meta() BlockMeta

	// Index returns an IndexReader over the block's data.
	Index() IndexReader

	// Chunks returns a ChunkReader over the block's data.
	Chunks() ChunkReader

	// Delete deletes data from the block. It takes a time range
	// (mint, maxt) and a set of label matchers.
	Delete(mint, maxt int64, ms ...labels.Matcher) error

	// Close releases all underlying resources of the block.
	Close() error
}

// Block is an interface to a DiskBlock that can also be queried.
type Block interface {
	DiskBlock
	Queryable
}

// headBlock is a regular block that can still be appended to.
type headBlock interface {
	Block
	Appendable
}

// Appendable defines an entity to which data can be appended.
type Appendable interface {
	// Appender returns a new Appender against an underlying store.
	Appender() Appender

	// Busy returns whether there are any currently active appenders.
	Busy() bool
}

// Queryable defines an entity which provides a Querier.
type Queryable interface {
	// Querier returns a Querier for the given mint and maxt.
	Querier(mint, maxt int64) Querier
}

// BlockMeta provides meta information about a block.
type BlockMeta struct { // Unique identifier for the block and its contents. Changes on compaction. ULID ulid.ULID `json:"ulid"` // Sequence number of the block. Sequence int `json:"sequence"` // MinTime and MaxTime specify the time range all samples // in the block are in. MinTime int64 `json:"minTime"` MaxTime int64 `json:"maxTime"` // Stats about the contents of the block. Stats struct { NumSamples uint64 `json:"numSamples,omitempty"` NumSeries uint64 `json:"numSeries,omitempty"` NumChunks uint64 `json:"numChunks,omitempty"` } `json:"stats,omitempty"` // Information on compactions the block was created from. Compaction struct { Generation int `json:"generation"` } `json:"compaction"` } const ( flagNone = 0 flagStd = 1 ) type blockMeta struct { Version int `json:"version"` *BlockMeta } const metaFilename = "meta.json" const tombstoneFilename = "tombstones" func readMetaFile(dir string) (*BlockMeta, error) { b, err := ioutil.ReadFile(filepath.Join(dir, metaFilename)) if err != nil { return nil, err } var m blockMeta if err := json.Unmarshal(b, &m); err != nil { return nil, err } if m.Version != 1 { return nil, errors.Errorf("unexpected meta file version %d", m.Version) } return m.BlockMeta, nil } func writeMetaFile(dir string, meta *BlockMeta) error { // Make any changes to the file appear atomic. 
path := filepath.Join(dir, metaFilename) tmp := path + ".tmp" f, err := os.Create(tmp) if err != nil { return err } enc := json.NewEncoder(f) enc.SetIndent("", "\t") var merr MultiError if merr.Add(enc.Encode(&blockMeta{Version: 1, BlockMeta: meta})); merr.Err() != nil { merr.Add(f.Close()) return merr } if err := f.Close(); err != nil { return err } return renameFile(tmp, path) } func readTombstoneFile(dir string) (TombstoneReader, error) { return newTombStoneReader(dir) } func writeTombstoneFile(dir string, tr TombstoneReader) error { path := filepath.Join(dir, tombstoneFilename) tmp := path + ".tmp" f, err := os.Create(tmp) if err != nil { return err } stoneOff := make(map[uint32]int64) // The map that holds the ref to offset vals. refs := []uint32{} // Sorted refs. pos := int64(0) buf := encbuf{b: make([]byte, 2*binary.MaxVarintLen64)} for tr.Next() { s := tr.At() refs = append(refs, s.ref) stoneOff[s.ref] = pos // Write the ranges. buf.reset() buf.putVarint64(int64(len(s.ranges))) n, err := f.Write(buf.get()) if err != nil { return err } pos += int64(n) for _, r := range s.ranges { buf.reset() buf.putVarint64(r.mint) buf.putVarint64(r.maxt) n, err = f.Write(buf.get()) if err != nil { return err } pos += int64(n) } } if err := tr.Err(); err != nil { return err } // Write the offset table. buf.reset() buf.putBE32int(len(refs)) if _, err := f.Write(buf.get()); err != nil { return err } for _, ref := range refs { buf.reset() buf.putBE32(ref) buf.putBE64int64(stoneOff[ref]) _, err = f.Write(buf.get()) if err != nil { return err } } // Write the offset to the offset table. 
buf.reset() buf.putBE64int64(pos) _, err = f.Write(buf.get()) if err != nil { return err } if err := f.Close(); err != nil { return err } return renameFile(tmp, path) } type persistedBlock struct { dir string meta BlockMeta chunkr *chunkReader indexr *indexReader } func newPersistedBlock(dir string) (*persistedBlock, error) { meta, err := readMetaFile(dir) if err != nil { return nil, err } cr, err := newChunkReader(chunkDir(dir)) if err != nil { return nil, err } ir, err := newIndexReader(dir) if err != nil { return nil, err } pb := &persistedBlock{ dir: dir, meta: *meta, chunkr: cr, indexr: ir, } return pb, nil } func (pb *persistedBlock) Close() error { var merr MultiError merr.Add(pb.chunkr.Close()) merr.Add(pb.indexr.Close()) return merr.Err() } func (pb *persistedBlock) String() string { return fmt.Sprintf("(%d, %s)", pb.meta.Sequence, pb.meta.ULID) } func (pb *persistedBlock) Querier(mint, maxt int64) Querier { return &blockQuerier{ mint: mint, maxt: maxt, index: pb.Index(), chunks: pb.Chunks(), } } func (pb *persistedBlock) Dir() string { return pb.dir } func (pb *persistedBlock) Index() IndexReader { return pb.indexr } func (pb *persistedBlock) Chunks() ChunkReader { return pb.chunkr } func (pb *persistedBlock) Meta() BlockMeta { return pb.meta } func (pb *persistedBlock) Delete(mint, maxt int64, ms ...labels.Matcher) error { pr := newPostingsReader(pb.indexr) p, absent := pr.Select(ms...) ir := pb.indexr // Choose only valid postings which have chunks in the time-range. vPostings := []uint32{} Outer: for p.Next() { lset, chunks, err := ir.Series(p.At()) if err != nil { return err } for _, abs := range absent { if lset.Get(abs) != "" { continue Outer } } // XXX(gouthamve): Adjust mint and maxt to match the time-range in the chunks? 
for _, chk := range chunks { if (mint <= chk.MinTime && maxt >= chk.MinTime) || (mint > chk.MinTime && mint <= chk.MaxTime) { vPostings = append(vPostings, p.At()) continue Outer } } } if p.Err() != nil { return p.Err() } // Merge the current and new tombstones. tr := newMapTombstoneReader(ir.tombstones) str := newSimpleTombstoneReader(vPostings, []trange{{mint, maxt}}) tombreader := newMergedTombstoneReader(tr, str) return writeTombstoneFile(pb.dir, tombreader) } // stone holds the information on the posting and time-range // that is deleted. type stone struct { ref uint32 ranges []trange } // TombstoneReader is the iterator over tombstones. type TombstoneReader interface { Next() bool At() stone Err() error } var emptyTombstoneReader = newMapTombstoneReader(make(map[uint32][]trange)) type tombstoneReader struct { stones []byte idx int len int b []byte err error } func newTombStoneReader(dir string) (*tombstoneReader, error) { // TODO(gouthamve): MMAP? b, err := ioutil.ReadFile(filepath.Join(dir, tombstoneFilename)) if err != nil { return nil, err } offsetBytes := b[len(b)-8:] d := &decbuf{b: offsetBytes} off := d.be64int64() if err := d.err(); err != nil { return nil, err } d = &decbuf{b: b[off:]} numStones := d.be32int() if err := d.err(); err != nil { return nil, err } return &tombstoneReader{ stones: b[off+4:], idx: -1, len: int(numStones), b: b, }, nil } func (t *tombstoneReader) Next() bool { if t.err != nil { return false } t.idx++ return t.idx < t.len } func (t *tombstoneReader) At() stone { bytIdx := t.idx * (4 + 8) dat := t.stones[bytIdx : bytIdx+12] d := &decbuf{b: dat} ref := d.be32() off := d.be64int64() d = &decbuf{b: t.b[off:]} numRanges := d.varint64() if err := d.err(); err != nil { t.err = err return stone{ref: ref} } dranges := make([]trange, 0, numRanges) for i := 0; i < int(numRanges); i++ { mint := d.varint64() maxt := d.varint64() if err := d.err(); err != nil { t.err = err return stone{ref: ref, ranges: dranges} } dranges = append(dranges, 
trange{mint, maxt}) } return stone{ref: ref, ranges: dranges} } func (t *tombstoneReader) Err() error { return t.err } type mapTombstoneReader struct { refs []uint32 cur uint32 stones map[uint32][]trange } func newMapTombstoneReader(ts map[uint32][]trange) *mapTombstoneReader { refs := make([]uint32, 0, len(ts)) for k := range ts { refs = append(refs, k) } sort.Sort(uint32slice(refs)) return &mapTombstoneReader{stones: ts, refs: refs} } func (t *mapTombstoneReader) Next() bool { if len(t.refs) > 0 { t.cur = t.refs[0] t.refs = t.refs[1:] return true } t.cur = 0 return false } func (t *mapTombstoneReader) At() stone { return stone{ref: t.cur, ranges: t.stones[t.cur]} } func (t *mapTombstoneReader) Err() error { return nil } type simpleTombstoneReader struct { refs []uint32 cur uint32 ranges []trange } func newSimpleTombstoneReader(refs []uint32, drange []trange) *simpleTombstoneReader { return &simpleTombstoneReader{refs: refs, ranges: drange} } func (t *simpleTombstoneReader) Next() bool { if len(t.refs) > 0 { t.cur = t.refs[0] return true } t.cur = 0 return false } func (t *simpleTombstoneReader) At() stone { return stone{ref: t.cur, ranges: t.ranges} } func (t *simpleTombstoneReader) Err() error { return nil } type mergedTombstoneReader struct { a, b TombstoneReader cur stone initialized bool aok, bok bool } func newMergedTombstoneReader(a, b TombstoneReader) *mergedTombstoneReader { return &mergedTombstoneReader{a: a, b: b} } func (t *mergedTombstoneReader) Next() bool { if !t.initialized { t.aok = t.a.Next() t.bok = t.b.Next() t.initialized = true } if !t.aok && !t.bok { return false } if !t.aok { t.cur = t.b.At() t.bok = t.b.Next() return true } if !t.bok { t.cur = t.a.At() t.aok = t.a.Next() return true } acur, bcur := t.a.At(), t.b.At() if acur.ref < bcur.ref { t.cur = acur t.aok = t.a.Next() } else if acur.ref > bcur.ref { t.cur = bcur t.bok = t.b.Next() } else { t.cur = acur // Merge time ranges. 
for _, r := range bcur.ranges { acur.ranges = addNewInterval(acur.ranges, r) } t.aok = t.a.Next() t.bok = t.b.Next() } return true } func (t *mergedTombstoneReader) At() stone { return t.cur } func (t *mergedTombstoneReader) Err() error { if t.a.Err() != nil { return t.a.Err() } return t.b.Err() } func chunkDir(dir string) string { return filepath.Join(dir, "chunks") } func walDir(dir string) string { return filepath.Join(dir, "wal") } type mmapFile struct { f *os.File b []byte } func openMmapFile(path string) (*mmapFile, error) { f, err := os.Open(path) if err != nil { return nil, errors.Wrap(err, "try lock file") } info, err := f.Stat() if err != nil { return nil, errors.Wrap(err, "stat") } b, err := mmap(f, int(info.Size())) if err != nil { return nil, errors.Wrap(err, "mmap") } return &mmapFile{f: f, b: b}, nil } func (f *mmapFile) Close() error { err0 := munmap(f.b) err1 := f.f.Close() if err0 != nil { return err0 } return err1 }