Merge pull request #262 from cstyan/callum-249

Reduce index file size caused by symbol references by using indicies rather than offsets
This commit is contained in:
Goutham Veeramachaneni 2018-01-18 14:02:54 +05:30 committed by GitHub
commit 467948f3c3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 21 additions and 10 deletions

View file

@ -43,7 +43,7 @@ Most of the sections described below start with a `len` field. It always specifi
The symbol table holds a sorted list of deduplicated strings that occurred in label pairs of the stored series. They can be referenced from subsequent sections and significantly reduce the total index size.
The section contains a sequence of the string entries, each prefixed with the string's length in raw bytes. All strings are utf-8 encoded.
Strings are referenced by pointing to the beginning of their length field. The strings are sorted in lexicographically ascending order.
Strings are referenced by sequential indexing. The strings are sorted in lexicographically ascending order.
```
┌────────────────────┬─────────────────────┐

View file

@ -273,17 +273,18 @@ func (w *Writer) AddSeries(ref uint64, lset labels.Labels, chunks ...chunks.Meta
w.buf2.putUvarint(len(lset))
for _, l := range lset {
offset, ok := w.symbols[l.Name]
// here we have an index for the symbol file if v2, otherwise it's an offset
index, ok := w.symbols[l.Name]
if !ok {
return errors.Errorf("symbol entry for %q does not exist", l.Name)
}
w.buf2.putUvarint32(offset)
w.buf2.putUvarint32(index)
offset, ok = w.symbols[l.Value]
index, ok = w.symbols[l.Value]
if !ok {
return errors.Errorf("symbol entry for %q does not exist", l.Value)
}
w.buf2.putUvarint32(offset)
w.buf2.putUvarint32(index)
}
w.buf2.putUvarint(len(chunks))
@ -341,8 +342,8 @@ func (w *Writer) AddSymbols(sym map[string]struct{}) error {
w.symbols = make(map[string]uint32, len(symbols))
for _, s := range symbols {
w.symbols[s] = uint32(w.pos) + headerSize + uint32(w.buf2.len())
for index, s := range symbols {
w.symbols[s] = uint32(index)
w.buf2.putUvarintStr(s)
}
@ -381,12 +382,13 @@ func (w *Writer) WriteLabelIndex(names []string, values []string) error {
w.buf2.putBE32int(len(names))
w.buf2.putBE32int(valt.Len())
// here we have an index for the symbol file if v2, otherwise it's an offset
for _, v := range valt.s {
offset, ok := w.symbols[v]
index, ok := w.symbols[v]
if !ok {
return errors.Errorf("symbol entry for %q does not exist", v)
}
w.buf2.putBE32(offset)
w.buf2.putBE32(index)
}
w.buf1.reset()
@ -756,11 +758,20 @@ func (r *Reader) readSymbols(off int) error {
basePos = uint32(off) + 4
nextPos = basePos + uint32(origLen-d.len())
)
if r.version == 2 {
nextPos = 0
}
for d.err() == nil && d.len() > 0 && cnt > 0 {
s := d.uvarintStr()
r.symbols[uint32(nextPos)] = s
nextPos = basePos + uint32(origLen-d.len())
if r.version == 2 {
nextPos++
} else {
nextPos = basePos + uint32(origLen-d.len())
}
cnt--
}
return errors.Wrap(d.err(), "read symbols")