Move CRC back to chunks file, alignment for fixed-sized ints

This commit is contained in:
Fabian Reinartz 2017-04-28 14:17:53 +02:00
parent a54f46d5e7
commit 34ba92eeeb
4 changed files with 137 additions and 117 deletions

View file

@ -1,13 +1,15 @@
# Chunks Disk Format
The following describes the format of a single chunks file, which is created in the `chunks/` directory of a block.
The following describes the format of a single chunks file, which is created in the `chunks/` directory of a block. The maximum size per segment file is 512MiB.
Chunks in the files are referenced from the index by the in-file offset in the 4 least significant bytes and the segment sequence number in the 4 most significant bytes.
```
┌─────────────────────────────┬─────────────────────┐
│ magic(0x85BD40DD) <4 byte> │ version(1) <1 byte>
├─────────────────────────────┴─────────────────────┤
│ ┌──────────────┬───────────────────┬────────┐
│ │ len <varint> │ encoding <1 byte> data │ ...
│ └──────────────┴───────────────────┴────────┘
└───────────────────────────────────────────────────┘
┌────────────────────────────────────────┬─────────────────────┐
│ magic(0x85BD40DD) <4 byte> │ version(1) <1 byte>
├────────────────────────────────────────┴─────────────────────┤
│ ┌──────────────┬───────────────────┬──────┬────────────────┐ │
│ │ len <varint> │ encoding <1 byte>data │ CRC32 <4 byte>
│ └──────────────┴───────────────────┴──────┴────────────────┘ │
└──────────────────────────────────────────────────────────────
```

View file

@ -1,10 +1,11 @@
# Index Disk Format
The following describes the format of the `index` file found in each block directory.
It is terminated by a table of contents which serves as an entry point into the index.
```
┌────────────────────────────┬─────────────────────┐
│ magic(0xBAAAD700) <4 byte> │ version(1) <1 byte>
│ magic(0xBAAAD700) <4b> │ version(1) <1 byte>
├────────────────────────────┴─────────────────────┤
│ ┌──────────────────────────────────────────────┐ │
│ │ Symbol Table │ │
@ -35,36 +36,38 @@ The following describes the format of the `index` file found in each block direc
### Symbol Table
The symbol table holds all strings encountered in our index. All other index sections just reference strings in the table as they are highly repetitive.
The symbol table holds a sorted list of deduplicated strings that occurred in label pairs of the stored series. They can be referenced from subsequent sections and significantly reduce the total index size.
The section contains a sequence of the raw string data, each prefixed with the string's length.
The section contains a sequence of the string entries, each prefixed with the string's length in raw bytes.
Strings are referenced by pointing to the beginning of their length field. The strings are sorted in lexicographically ascending order.
The full list of strings is validated with a CRC32 checksum.
```
┌─────────────────────────┬───────────────┐
count(symbols) <4 byte> │ len <4 byte>
├────────────────────────────────────────┤
│ ┌─────────────────────┬───────────────┐ │
│ │ len(str_1) <varint> │ str_1 <bytes> │ │
│ ├─────────────────────┴───────────────┤ │
│ │ . . . │ │
│ ├─────────────────────┬───────────────┤ │
│ │ len(str_n) <varint> │ str_n <bytes> │ │
│ └─────────────────────┴───────────────┘ │
├─────────────────────────────────────────┤
CRC32 <4 byte>
└─────────────────────────────────────────┘
┌────────────────────┬─────────────────────┐
len <4b>#symbols <4b>
├────────────────────┴─────────────────────┤
│ ┌─────────────────────┬───────────────┐ │
│ │ len(str_1) <uvarint> │ str_1 <bytes> │ │
│ ├─────────────────────┴───────────────┤ │
│ │ . . . │ │
│ ├─────────────────────┬───────────────┤ │
│ │ len(str_n) <uvarint> │ str_n <bytes> │ │
│ └─────────────────────┴───────────────┘ │
├─────────────────────────────────────────
│ CRC32 <4b>
└─────────────────────────────────────────
```
### Series
The section contains a sequence of series that hold the label set of the series as well as the chunks within the block. The series are sorted lexicographically by their label sets.
The section contains a sequence of series that hold the label set of the series as well as its chunks within the block. The series are sorted lexicographically by their label sets.
The file offset to the beginning of a series serves as the series' ID in all subsequent references. Thereby, a sorted list of series IDs implies a lexicographically sorted list of series label sets.
```
┌───────────────────────────────────────┐
count(series) <4 byte>
#series <4b>
├───────────────────────────────────────┤
│ ┌───────────────────────────────────┐ │
│ │ series_1 │ │
@ -76,7 +79,9 @@ The file offset to the beginning of a series serves as the series' ID in all sub
└───────────────────────────────────────┘
```
Every series holds a list of label pairs and chunks. The label pairs reference the symbol table and the chunks an address in one of the block's chunk files.
Every series entry first holds its number of labels, followed by tuples of symbol table references that represent label name and value. The label pairs are lexicographically sorted.
After the labels, the number of indexed chunks is encoded, followed by a sequence of metadata entries containing the chunk's minimum and maximum timestamp and a reference to its position in the chunk file. Holding the time range data in the index allows dropping chunks irrelevant to queried time ranges without accessing them directly.
The series entry is prefixed with its length and terminated by a CRC32 checksum over its contents.
```
┌─────────────────────────────────────────────────────────┐
@ -84,117 +89,116 @@ Every series holds a list of label pairs and chunks. The label pairs reference t
├─────────────────────────────────────────────────────────┤
│ ┌──────────────────┬──────────────────────────────────┐ │
│ │ │ ┌──────────────────────────┐ │ │
│ │ │ │ ref(l_i.name) <varint> │ │ │
│ │ #labels <varint> │ ├──────────────────────────┤ ... │ │
│ │ │ │ ref(l_i.value) <varint> │ │ │
│ │ │ │ ref(l_i.name) <uvarint> │ │ │
│ │ #labels │ ├──────────────────────────┤ ... │ │
│ │ <uvarint> │ │ ref(l_i.value) <uvarint> │ │ │
│ │ │ └──────────────────────────┘ │ │
│ ├──────────────────┼──────────────────────────────────┤ │
│ │ │ ┌──────────────────────────┐ │ │
│ │ │ │ c_i.mint <varint> │ │ │
│ │ │ ├──────────────────────────┤ │ │
│ │ │ │ c_i.maxt <varint> │ │ │
│ │ #chunks <varint> │ ├──────────────────────────┤ ... │ │
│ │ │ │ ref(c_i.data) <varint> │ │ │
│ │ │ ├──────────────────────────┤ │ │
│ │ │ │ crc32(c_i.data) <varint> │ │ │
│ │ #chunks │ │ c_i.maxt <varint> │ │ │
│ │ <uvarint> │ ├──────────────────────────┤ ... │ │
│ │ │ │ ref(c_i.data) <uvarint> │ │ │
│ │ │ └──────────────────────────┘ │ │
│ └──────────────────┴──────────────────────────────────┘ │
├─────────────────────────────────────────────────────────┤
│ CRC32 <4 byte>
│ CRC32 <4b>
└─────────────────────────────────────────────────────────┘
```
The CRC checksum is calculated over the series contents of the index concatenated with the data of its chunks (with encoding byte, without length).
### Label Index
The label index holds lists of possible values for label names. A sequence of label index blocks follows the series entries.
The label index holds lists of possible values for label names. Each label index can be a composite index over more than a single label name, which is tracked by `#names`, followed by the total number of entries.
The body holds `#entries` entries of possible values pointing back into the symbol table.
```
┌─────────────────────────────────────────────────────────┐
│ len <varint>
├─────────────────────────────────────────────────────────┤
│ ┌──────────────────┬──────────────────────────────────┐ │
│ │ │ ┌──────────────────────────┐ │ │
│ │ │ │ ref(value[0]) <4 byte> │ │ │
│ │ │ ├──────────────────────────┤ │ │
│ │ n = len(names) │ │ ... │ ... │ │
│ │ <varint> │ ├──────────────────────────┤ │ │
│ │ │ │ ref(value[n]) <4 byte> │ │ │
│ │ │ └──────────────────────────┘ │ │
│ └──────────────────┴──────────────────────────────────┘ │
├─────────────────────────────────────────────────────────┤
│ CRC32 <4 byte>
└─────────────────────────────────────────────────────────┘
```
The sequence of label index blocks is finalized by a lookup table pointing to the beginning of each label index block. It is simply a list of entries that are read into an in-memory hashmap when the index is loaded.
### Postings
Postings are postings lists that map label pairs to series they occur in.
```
┌─────────────────────────────────────────────────┐
│ len <varint>
├─────────────────────────────────────────────────┤
┌───────────────┬────────────────┬────────────────┐
│ len <4b>#names <4b>#entries <4b>
├───────────────┴────────────────┴────────────────┤
│ ┌─────────────────────────────────────────────┐ │
│ │ ref(series[0]) <4 byte> │ │
│ │ ref(value_0) <4b> │ │
│ ├─────────────────────────────────────────────┤ │
│ │ ... │ │
│ ├─────────────────────────────────────────────┤ │
│ │ ref(series[n]) <4 byte> │ │
│ │ ref(value_n) <4b> │ │
│ └─────────────────────────────────────────────┘ │
│ . . . │
├─────────────────────────────────────────────────┤
│ CRC32 <4 byte>
│ CRC32 <4b>
└─────────────────────────────────────────────────┘
```
### Offset Table
The sequence of label index sections is finalized by an offset table pointing to the beginning of each label index section for a given set of label names.
### Postings
Postings sections store monotonically increasing lists of series references that contain a given label pair associated with the list.
```
┌─────────────────────────┬───────────────┐
count(symbols) <4 byte> │ len <4 byte>
├────────────────────────────────────────┤
┌────────────────────┬────────────────────┐
│ len <4b>#entries <4b>
├────────────────────┴────────────────────┤
│ ┌─────────────────────────────────────┐ │
│ │ n = len(strs) <varint> │ │
│ │ ref(series_1) <4b> │ │
│ ├─────────────────────────────────────┤ │
│ │ len(strs[0]) │ │
│ │ ... │ │
│ ├─────────────────────────────────────┤ │
│ │ ... │ │
│ ├─────────────────────────────────────┤ │
│ │ strs[n] │ │
│ ├─────────────────────────────────────┤ │
│ │ offset <varint> │ │
│ │ ref(series_n) <4b> │ │
│ └─────────────────────────────────────┘ │
│ . . . │
├─────────────────────────────────────────┤
CRC32 <4 byte>
│ CRC32 <4b>
└─────────────────────────────────────────┘
```
The sequence of postings sections is finalized by an offset table pointing to the beginning of each postings section for a given set of label names.
### Offset Table
An offset table stores a sequence of entries that maps a list of strings to an offset. They are used to track label index and postings sections. They are read into memory when an index file is loaded.
```
┌─────────────────────┬────────────────────┐
│ len <4b>#entries <4b>
├─────────────────────┴────────────────────┤
│ ┌──────────────────────────────────────┐ │
│ │ n = #strs <uvarint> │ │
│ ├──────────────────────┬───────────────┤ │
│ │ len(str_1) <uvarint> │ str_1 <bytes> │ │
│ ├──────────────────────┴───────────────┤ │
│ │ ... │ │
│ ├──────────────────────┬───────────────┤ │
│ │ len(str_n) <uvarint> │ str_n <bytes> │ │
│ ├──────────────────────┴───────────────┤ │
│ │ offset <uvarint> │ │
│ └──────────────────────────────────────┘ │
│ . . . │
├──────────────────────────────────────────┤
│ CRC32 <4b>
└──────────────────────────────────────────┘
```
### TOC
The table of contents serves as an entry point to the entire index. Its size is fixed.
The table of contents serves as an entry point to the entire index and points to various sections in the file.
```
┌─────────────────────────────────────────────┐
│ ref(symbols) <8 byte>
├─────────────────────────────────────────────┤
│ ref(series) <8 byte>
├─────────────────────────────────────────────┤
│ ref(label indices) <8 byte>
├─────────────────────────────────────────────┤
│ ref(label indices table) <8 byte>
├─────────────────────────────────────────────┤
│ ref(postings) <8 byte>
├─────────────────────────────────────────────┤
│ ref(postings table) <8 byte>
├─────────────────────────────────────────────┤
│ CRC32 <4 byte>
└─────────────────────────────────────────────┘
┌─────────────────────────────────────────┐
│ ref(symbols) <8b>
├─────────────────────────────────────────┤
│ ref(series) <8b>
├─────────────────────────────────────────┤
│ ref(label indices start) <8b>
├─────────────────────────────────────────┤
│ ref(label indices table) <8b>
├─────────────────────────────────────────┤
│ ref(postings start) <8b>
├─────────────────────────────────────────┤
│ ref(postings table) <8b>
├─────────────────────────────────────────┤
│ CRC32 <4b>
└─────────────────────────────────────────┘
```

View file

@ -18,6 +18,7 @@ import (
"encoding/binary"
"fmt"
"hash"
"hash/crc32"
"io"
"os"
@ -72,6 +73,7 @@ type chunkWriter struct {
files []*os.File
wbuf *bufio.Writer
n int64
crc32 hash.Hash
segmentSize int64
}
@ -93,6 +95,7 @@ func newChunkWriter(dir string) (*chunkWriter, error) {
cw := &chunkWriter{
dirFile: dirFile,
n: 0,
crc32: crc32.New(crc32.MakeTable(crc32.Castagnoli)),
segmentSize: defaultChunkSegmentSize,
}
return cw, nil
@ -216,6 +219,13 @@ func (w *chunkWriter) WriteChunks(chks ...*ChunkMeta) error {
if err := w.write(chk.Chunk.Bytes()); err != nil {
return err
}
w.crc32.Reset()
w.crc32.Write([]byte{byte(chk.Chunk.Encoding())})
w.crc32.Write(chk.Chunk.Bytes())
if err := w.write(w.crc32.Sum(nil)); err != nil {
return err
}
}
return nil

View file

@ -267,11 +267,13 @@ func (w *indexWriter) writeSymbols() error {
}
sort.Strings(symbols)
const headerSize = 8
const headerSize = 4
w.buf1.reset()
w.buf2.reset()
w.buf2.putBE32int(len(symbols))
for _, s := range symbols {
w.symbols[s] = uint32(w.pos) + headerSize + uint32(w.buf2.len())
@ -281,9 +283,7 @@ func (w *indexWriter) writeSymbols() error {
w.buf2.putUvarintStr(s)
}
w.buf1.putBE32int(len(symbols))
w.buf1.putBE32int(w.buf2.len())
w.buf2.putHash(w.crc32)
err := w.write(w.buf1.get(), w.buf2.get())
@ -302,11 +302,14 @@ func (w *indexWriter) writeSeries() error {
// Header holds number of series.
w.buf1.reset()
w.buf1.putBE32int(len(series))
if err := w.write(w.buf1.get()); err != nil {
return errors.Wrap(err, "write series count")
}
for _, s := range series {
s.offset = uint32(w.pos)
w.buf2.reset()
w.buf2.putUvarint(len(s.labels))
@ -321,14 +324,8 @@ func (w *indexWriter) writeSeries() error {
w.buf2.putVarint64(c.MinTime)
w.buf2.putVarint64(c.MaxTime)
w.buf2.putUvarint64(c.Ref)
w.crc32.Reset()
c.hash(w.crc32)
w.buf2.putBytes(w.crc32.Sum(nil))
}
s.offset = uint32(w.pos)
w.buf1.reset()
w.buf1.putUvarint(w.buf2.len())
@ -343,6 +340,9 @@ func (w *indexWriter) writeSeries() error {
}
func (w *indexWriter) WriteLabelIndex(names []string, values []string) error {
if len(values)%len(names) != 0 {
return errors.Errorf("invalid value list length %d for %d names", len(values), len(names))
}
if err := w.ensureStage(idxStageLabelIndex); err != nil {
return errors.Wrap(err, "ensure stage")
}
@ -359,14 +359,15 @@ func (w *indexWriter) WriteLabelIndex(names []string, values []string) error {
})
w.buf2.reset()
w.buf2.putUvarint(len(names))
w.buf2.putBE32int(len(names))
w.buf2.putBE32int(valt.Len())
for _, v := range valt.s {
w.buf2.putBE32(w.symbols[v])
}
w.buf1.reset()
w.buf1.putUvarint(w.buf2.len())
w.buf1.putBE32int(w.buf2.len())
w.buf2.putHash(w.crc32)
@ -437,16 +438,17 @@ func (w *indexWriter) WritePostings(name, value string, it Postings) error {
if err := it.Err(); err != nil {
return err
}
sort.Sort(uint32slice(refs))
w.buf2.reset()
w.buf2.putBE32int(len(refs))
for _, r := range refs {
w.buf2.putBE32(r)
}
w.buf1.reset()
w.buf1.putUvarint(w.buf2.len())
w.buf1.putBE32int(w.buf2.len())
w.buf2.putHash(w.crc32)
@ -658,9 +660,10 @@ func (r *indexReader) LabelValues(names ...string) (StringTuples, error) {
}
d1 := r.decbufAt(int(off))
d2 := d1.decbuf(int(d1.uvarint()))
d2 := d1.decbuf(d1.be32int())
c := d2.uvarint()
nc := d2.be32int()
d2.be32() // consume unused value entry count.
if d2.err() != nil {
return nil, errors.Wrap(d2.err(), "read label value index")
@ -669,7 +672,7 @@ func (r *indexReader) LabelValues(names ...string) (StringTuples, error) {
// TODO(fabxc): verify checksum in 4 remaining bytes of d1.
st := &serializedStringTuples{
l: int(c),
l: nc,
b: d2.get(),
lookup: r.lookupSymbol,
}
@ -727,9 +730,6 @@ func (r *indexReader) Series(ref uint32) (labels.Labels, []*ChunkMeta, error) {
mint := d2.varint64()
maxt := d2.varint64()
off := d2.uvarint64()
_ = d2.be32()
// TODO(fabxc): verify CRC32
if d2.err() != nil {
return nil, nil, errors.Wrapf(d2.err(), "read meta for chunk %d", i)
@ -742,6 +742,8 @@ func (r *indexReader) Series(ref uint32) (labels.Labels, []*ChunkMeta, error) {
})
}
// TODO(fabxc): verify CRC32.
return lbls, chunks, nil
}
@ -755,7 +757,9 @@ func (r *indexReader) Postings(name, value string) (Postings, error) {
}
d1 := r.decbufAt(int(off))
d2 := d1.decbuf(d1.uvarint())
d2 := d1.decbuf(d1.be32int())
d2.be32() // consume unused postings list length.
if d2.err() != nil {
return nil, errors.Wrap(d2.err(), "get postings bytes")