Fix precision loss.

Large delta values often imply a difference between a large base value
and the large delta value, potentially resulting in small numbers with
a huge precision error. Since large delta values need 8 bytes anyway,
we are not even saving memory.

As a solution, always save the absolute value rather than a delta once
8 bytes would be needed for the delta. Timestamps are then saved as 8
byte integers, while values are always saved as float64 in that case.

Change-Id: I01100d600515e16df58ce508b50982ffd762cc49
This commit is contained in:
Bjoern Rabenstein 2014-08-19 18:14:44 +02:00
parent dc2e463a97
commit d742edfe0d
4 changed files with 88 additions and 77 deletions

View file

@@ -60,7 +60,7 @@ func chunkType(c chunk) byte {
func chunkForType(chunkType byte) chunk { func chunkForType(chunkType byte) chunk {
switch chunkType { switch chunkType {
case 0: case 0:
return newDeltaEncodedChunk(1, 1, false) return newDeltaEncodedChunk(d1, d0, true)
default: default:
panic("unknown chunk type") panic("unknown chunk type")
} }

View file

@@ -41,6 +41,11 @@ const (
deltaHeaderBufLenOffset = 19 deltaHeaderBufLenOffset = 19
) )
// A deltaEncodedChunk adaptively stores sample timestamps and values with a
// delta encoding of various types (int, float) and bit width. However, once 8
// bytes would be needed to encode a delta value, a fall-back to the absolute
// numbers happens (so that timestamps are saved directly as int64 and values as
// float64).
type deltaEncodedChunk struct { type deltaEncodedChunk struct {
buf []byte buf []byte
} }
@@ -51,7 +56,7 @@ func newDeltaEncodedChunk(tb, vb deltaBytes, isInt bool) *deltaEncodedChunk {
buf[deltaHeaderTimeBytesOffset] = byte(tb) buf[deltaHeaderTimeBytesOffset] = byte(tb)
buf[deltaHeaderValueBytesOffset] = byte(vb) buf[deltaHeaderValueBytesOffset] = byte(vb)
if isInt { if vb < d8 && isInt { // Only use int for fewer than 8 value delta bytes.
buf[deltaHeaderIsIntOffset] = 1 buf[deltaHeaderIsIntOffset] = 1
} else { } else {
buf[deltaHeaderIsIntOffset] = 0 buf[deltaHeaderIsIntOffset] = 0
@@ -77,35 +82,35 @@ func (c *deltaEncodedChunk) clone() chunk {
} }
func neededDeltaBytes(deltaT clientmodel.Timestamp, deltaV clientmodel.SampleValue, isInt bool) (dtb, dvb deltaBytes) { func neededDeltaBytes(deltaT clientmodel.Timestamp, deltaV clientmodel.SampleValue, isInt bool) (dtb, dvb deltaBytes) {
dtb = 1 dtb = d1
if deltaT >= 256 { if deltaT >= 256 {
dtb = 2 dtb = d2
} }
if deltaT >= 256*256 { if deltaT >= 256*256 {
dtb = 4 dtb = d4
} }
if deltaT >= 256*256*256*256 { if deltaT >= 256*256*256*256 {
dtb = 8 dtb = d8
} }
if isInt { if isInt {
dvb = 0 dvb = d0
if deltaV != 0 { if deltaV != 0 {
dvb = 1 dvb = d1
} }
if deltaV < -(256/2) || deltaV > (256/2)-1 { if deltaV < -(256/2) || deltaV > (256/2)-1 {
dvb = 2 dvb = d2
} }
if deltaV < -(256*256/2) || deltaV > (256*256/2)-1 { if deltaV < -(256*256/2) || deltaV > (256*256/2)-1 {
dvb = 4 dvb = d4
} }
if deltaV < -(256*256*256*256/2) || deltaV > (256*256*256*256/2)-1 { if deltaV < -(256*256*256*256/2) || deltaV > (256*256*256*256/2)-1 {
dvb = 8 dvb = d8
} }
} else { } else {
dvb = 4 dvb = d4
if clientmodel.SampleValue(float32(deltaV)) != deltaV { if clientmodel.SampleValue(float32(deltaV)) != deltaV {
dvb = 8 dvb = d8
} }
} }
return dtb, dvb return dtb, dvb
@@ -149,8 +154,7 @@ func (c *deltaEncodedChunk) add(s *metric.SamplePair) chunks {
sampleSize := c.sampleSize() sampleSize := c.sampleSize()
// Do we generally have space for another sample in this chunk? If not, // Do we generally have space for another sample in this chunk? If not,
// overflow into a new one. We assume that if we have seen floating point // overflow into a new one.
// values once, the series will most likely contain floats in the future.
if remainingBytes < sampleSize { if remainingBytes < sampleSize {
//fmt.Println("overflow") //fmt.Println("overflow")
overflowChunks := c.newFollowupChunk().add(s) overflowChunks := c.newFollowupChunk().add(s)
@@ -159,6 +163,8 @@ func (c *deltaEncodedChunk) add(s *metric.SamplePair) chunks {
dt := s.Timestamp - c.baseTime() dt := s.Timestamp - c.baseTime()
dv := s.Value - c.baseValue() dv := s.Value - c.baseValue()
tb := c.timeBytes()
vb := c.valueBytes()
// If the new sample is incompatible with the current encoding, reencode the // If the new sample is incompatible with the current encoding, reencode the
// existing chunk data into new chunk(s). // existing chunk data into new chunk(s).
@@ -166,61 +172,65 @@ func (c *deltaEncodedChunk) add(s *metric.SamplePair) chunks {
// int->float. // int->float.
// TODO: compare speed with Math.Modf. // TODO: compare speed with Math.Modf.
if c.isInt() && clientmodel.SampleValue(int64(dv)) != dv { if c.isInt() && clientmodel.SampleValue(int64(dv)) != dv {
//fmt.Println("int->float", len(c.buf), cap(c.buf)) //fmt.Println("int->float", len(c.buf), cap(c.buf), dv)
return transcodeAndAdd(newDeltaEncodedChunk(c.timeBytes(), d4, false), c, s) return transcodeAndAdd(newDeltaEncodedChunk(tb, d4, false), c, s)
} }
// float32->float64. // float32->float64.
if !c.isInt() && c.valueBytes() == d4 && clientmodel.SampleValue(float32(dv)) != dv { if !c.isInt() && vb == d4 && clientmodel.SampleValue(float32(dv)) != dv {
//fmt.Println("float32->float64", float32(dv), dv, len(c.buf), cap(c.buf)) //fmt.Println("float32->float64", float32(dv), dv, len(c.buf), cap(c.buf))
return transcodeAndAdd(newDeltaEncodedChunk(c.timeBytes(), d8, false), c, s) return transcodeAndAdd(newDeltaEncodedChunk(tb, d8, false), c, s)
}
if tb < d8 || vb < d8 {
// Maybe more bytes per sample.
if ntb, nvb := neededDeltaBytes(dt, dv, c.isInt()); ntb > tb || nvb > vb {
//fmt.Printf("transcoding T: %v->%v, V: %v->%v, I: %v; len %v, cap %v\n", tb, ntb, vb, nvb, c.isInt(), len(c.buf), cap(c.buf))
ntb = max(ntb, tb)
nvb = max(nvb, vb)
return transcodeAndAdd(newDeltaEncodedChunk(ntb, nvb, c.isInt()), c, s)
} }
// More bytes per sample.
if dtb, dvb := neededDeltaBytes(dt, dv, c.isInt()); dtb > c.timeBytes() || dvb > c.valueBytes() {
//fmt.Printf("transcoding T: %v->%v, V: %v->%v, I: %v; len %v, cap %v\n", c.timeBytes(), dtb, c.valueBytes(), dvb, c.isInt(), len(c.buf), cap(c.buf))
dtb = max(dtb, c.timeBytes())
dvb = max(dvb, c.valueBytes())
return transcodeAndAdd(newDeltaEncodedChunk(dtb, dvb, c.isInt()), c, s)
} }
offset := len(c.buf) offset := len(c.buf)
c.buf = c.buf[:offset+sampleSize] c.buf = c.buf[:offset+sampleSize]
switch c.timeBytes() { switch tb {
case 1: case d1:
c.buf[offset] = byte(dt) c.buf[offset] = byte(dt)
case 2: case d2:
binary.LittleEndian.PutUint16(c.buf[offset:], uint16(dt)) binary.LittleEndian.PutUint16(c.buf[offset:], uint16(dt))
case 4: case d4:
binary.LittleEndian.PutUint32(c.buf[offset:], uint32(dt)) binary.LittleEndian.PutUint32(c.buf[offset:], uint32(dt))
case 8: case d8:
binary.LittleEndian.PutUint64(c.buf[offset:], uint64(dt)) // Store the absolute value (no delta) in case of d8.
binary.LittleEndian.PutUint64(c.buf[offset:], uint64(s.Timestamp))
default:
panic("invalid number of bytes for time delta")
} }
offset += int(c.timeBytes()) offset += int(tb)
if c.isInt() { if c.isInt() {
switch c.valueBytes() { switch vb {
case 0: case d0:
// No-op. Constant value is stored as base value. // No-op. Constant value is stored as base value.
case 1: case d1:
c.buf[offset] = byte(dv) c.buf[offset] = byte(dv)
case 2: case d2:
binary.LittleEndian.PutUint16(c.buf[offset:], uint16(dv)) binary.LittleEndian.PutUint16(c.buf[offset:], uint16(dv))
case 4: case d4:
binary.LittleEndian.PutUint32(c.buf[offset:], uint32(dv)) binary.LittleEndian.PutUint32(c.buf[offset:], uint32(dv))
case 8: // d8 must not happen. Those samples are encoded as float64.
binary.LittleEndian.PutUint64(c.buf[offset:], uint64(dv))
default: default:
panic("Invalid number of bytes for integer delta") panic("invalid number of bytes for integer delta")
} }
} else { } else {
switch c.valueBytes() { switch vb {
case 4: case d4:
binary.LittleEndian.PutUint32(c.buf[offset:], math.Float32bits(float32(dv))) binary.LittleEndian.PutUint32(c.buf[offset:], math.Float32bits(float32(dv)))
case 8: case d8:
binary.LittleEndian.PutUint64(c.buf[offset:], math.Float64bits(float64(dv))) // Store the absolute value (no delta) in case of d8.
binary.LittleEndian.PutUint64(c.buf[offset:], math.Float64bits(float64(s.Value)))
default: default:
panic("Invalid number of bytes for floating point delta") panic("invalid number of bytes for floating point delta")
} }
} }
return chunks{c} return chunks{c}
@@ -258,49 +268,52 @@ func (c *deltaEncodedChunk) values() <-chan *metric.SamplePair {
func (c *deltaEncodedChunk) valueAtIndex(idx int) *metric.SamplePair { func (c *deltaEncodedChunk) valueAtIndex(idx int) *metric.SamplePair {
offset := deltaHeaderBytes + idx*c.sampleSize() offset := deltaHeaderBytes + idx*c.sampleSize()
var dt uint64 var ts clientmodel.Timestamp
switch c.timeBytes() { switch c.timeBytes() {
case 1: case d1:
dt = uint64(uint8(c.buf[offset])) ts = c.baseTime() + clientmodel.Timestamp(uint8(c.buf[offset]))
case 2: case d2:
dt = uint64(binary.LittleEndian.Uint16(c.buf[offset:])) ts = c.baseTime() + clientmodel.Timestamp(binary.LittleEndian.Uint16(c.buf[offset:]))
case 4: case d4:
dt = uint64(binary.LittleEndian.Uint32(c.buf[offset:])) ts = c.baseTime() + clientmodel.Timestamp(binary.LittleEndian.Uint32(c.buf[offset:]))
case 8: case d8:
dt = uint64(binary.LittleEndian.Uint64(c.buf[offset:])) // Take absolute value for d8.
ts = clientmodel.Timestamp(binary.LittleEndian.Uint64(c.buf[offset:]))
default:
panic("Invalid number of bytes for time delta")
} }
offset += int(c.timeBytes()) offset += int(c.timeBytes())
var dv clientmodel.SampleValue var v clientmodel.SampleValue
if c.isInt() { if c.isInt() {
switch c.valueBytes() { switch c.valueBytes() {
case 0: case d0:
dv = clientmodel.SampleValue(0) v = c.baseValue()
case 1: case d1:
dv = clientmodel.SampleValue(int8(c.buf[offset])) v = c.baseValue() + clientmodel.SampleValue(int8(c.buf[offset]))
case 2: case d2:
dv = clientmodel.SampleValue(int16(binary.LittleEndian.Uint16(c.buf[offset:]))) v = c.baseValue() + clientmodel.SampleValue(int16(binary.LittleEndian.Uint16(c.buf[offset:])))
case 4: case d4:
dv = clientmodel.SampleValue(int32(binary.LittleEndian.Uint32(c.buf[offset:]))) v = c.baseValue() + clientmodel.SampleValue(int32(binary.LittleEndian.Uint32(c.buf[offset:])))
case 8: // No d8 for ints.
dv = clientmodel.SampleValue(int64(binary.LittleEndian.Uint64(c.buf[offset:])))
default: default:
panic("Invalid number of bytes for integer delta") panic("Invalid number of bytes for integer delta")
} }
} else { } else {
switch c.valueBytes() { switch c.valueBytes() {
case 4: case d4:
dv = clientmodel.SampleValue(math.Float32frombits(binary.LittleEndian.Uint32(c.buf[offset:]))) v = c.baseValue() + clientmodel.SampleValue(math.Float32frombits(binary.LittleEndian.Uint32(c.buf[offset:])))
case 8: case d8:
dv = clientmodel.SampleValue(math.Float64frombits(binary.LittleEndian.Uint64(c.buf[offset:]))) // Take absolute value for d8.
v = clientmodel.SampleValue(math.Float64frombits(binary.LittleEndian.Uint64(c.buf[offset:])))
default: default:
panic("Invalid number of bytes for floating point delta") panic("Invalid number of bytes for floating point delta")
} }
} }
return &metric.SamplePair{ return &metric.SamplePair{
Timestamp: c.baseTime() + clientmodel.Timestamp(dt), Timestamp: ts,
Value: c.baseValue() + dv, Value: v,
} }
} }

View file

@@ -244,7 +244,7 @@ func TestPersistChunk(t *testing.T) {
for fp, expectedChunks := range fpToChunks { for fp, expectedChunks := range fpToChunks {
indexes := make([]int, 0, len(expectedChunks)) indexes := make([]int, 0, len(expectedChunks))
for i, _ := range expectedChunks { for i := range expectedChunks {
indexes = append(indexes, i) indexes = append(indexes, i)
} }
actualChunks, err := p.LoadChunks(fp, indexes) actualChunks, err := p.LoadChunks(fp, indexes)

View file

@@ -324,9 +324,8 @@ func createRandomSamples(r *rand.Rand) clientmodel.Samples {
}, },
}, },
{ // Integer with int deltas of various byte length. { // Integer with int deltas of various byte length.
// TODO: Using larger ints yields even worse results. Improve!
createValue: func() clientmodel.SampleValue { createValue: func() clientmodel.SampleValue {
return clientmodel.SampleValue(r.Int31() - 1<<30) return clientmodel.SampleValue(r.Int63() - 1<<62)
}, },
applyDelta: []deltaApplier{ applyDelta: []deltaApplier{
func(v clientmodel.SampleValue) clientmodel.SampleValue { func(v clientmodel.SampleValue) clientmodel.SampleValue {
@@ -435,8 +434,7 @@ func verifyStorage(t *testing.T, s Storage, samples clientmodel.Samples, r *rand
} }
want := float64(sample.Value) want := float64(sample.Value)
got := float64(found[0].Value) got := float64(found[0].Value)
// TODO: 0.01 is a horribly large deviation. Improve! if want != got && (want == 0. || math.Abs(want-got)/want > 0.000001) {
if want != got && (want == 0. || math.Abs(want-got)/want > 0.01) {
t.Errorf("Value mismatch, want %f, got %f.", want, got) t.Errorf("Value mismatch, want %f, got %f.", want, got)
result = false result = false
} }