Fix panic during tsdb Commit (#13092)

* Fix panic during tsdb Commit

Fixes the following

panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x20 pc=0x19deb45]

goroutine 651118930 [running]:
github.com/prometheus/prometheus/tsdb.(*headAppender).Commit(0xc19100f7c0)
	/drone/src/vendor/github.com/prometheus/prometheus/tsdb/head_append.go:855 +0x245
github.com/prometheus/prometheus/tsdb.dbAppender.Commit({{0x35bd6f0?, 0xc19100f7c0?}, 0xc000fa4c00?})
	/drone/src/vendor/github.com/prometheus/prometheus/tsdb/db.go:1159 +0x2f

We theorize that the panic happened due to the series referenced by the
exemplar being removed between AppendExemplar and Commit due to being idle.

Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com>
This commit is contained in:
George Krajcsovits 2023-11-12 15:51:37 +01:00 committed by GitHub
parent 39a35d92bc
commit acc114fe55
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 34 additions and 0 deletions

View file

@ -751,6 +751,12 @@ func (a *headAppender) Commit() (err error) {
// No errors logging to WAL, so pass the exemplars along to the in memory storage.
for _, e := range a.exemplars {
s := a.head.series.getByID(chunks.HeadSeriesRef(e.ref))
if s == nil {
// This is very unlikely to happen, but we have seen it in the wild.
// It means that the series was truncated between AppendExemplar and Commit.
// See TestHeadCompactionWhileAppendAndCommitExemplar.
continue
}
// We don't instrument exemplar appends here, all is instrumented by storage.
if err := a.head.exemplars.AddExemplar(s.lset, e.exemplar); err != nil {
if err == storage.ErrOutOfOrderExemplar {

View file

@ -5514,3 +5514,31 @@ func TestWALSampleAndExemplarOrder(t *testing.T) {
})
}
}
// TestHeadCompactionWhileAppendAndCommitExemplar simulates a use case where
// a series is removed from the head while an exemplar is being appended to it.
// This can happen in theory by compacting the head at the right time due to
// a series being idle.
//
// The test cheats a little bit by not appending a sample with the exemplar.
// If you also add a sample and run Truncate in a concurrent goroutine and run
// the test around a million(!) times, you can get
// `unknown HeadSeriesRef when trying to add exemplar: 1` error on push.
// It is likely that running the test for much longer and with more time variations
// would trigger the
// `signal SIGSEGV: segmentation violation code=0x1 addr=0x20 pc=0xbb03d1`
// panic, that we have seen in the wild once.
//
// Every step's error is asserted so that the test fails loudly on any
// unexpected error instead of only guarding against the panic.
func TestHeadCompactionWhileAppendAndCommitExemplar(t *testing.T) {
	h, _ := newTestHead(t, DefaultBlockDuration, wlog.CompressionNone, false)
	// Close the head even if an assertion below aborts the test early.
	t.Cleanup(func() {
		require.NoError(t, h.Close())
	})

	// Create the series with a single sample at ts=0 and commit it.
	app := h.Appender(context.Background())
	lbls := labels.FromStrings("foo", "bar")
	ref, err := app.Append(0, lbls, 1, 1)
	require.NoError(t, err)
	require.NoError(t, app.Commit())

	// Not adding a sample here to trigger the fault.
	app = h.Appender(context.Background())
	_, err = app.AppendExemplar(ref, lbls, exemplar.Exemplar{Value: 1, Ts: 20})
	require.NoError(t, err)

	// Truncate between AppendExemplar and Commit; Commit must neither panic
	// nor return an error afterwards.
	require.NoError(t, h.Truncate(10))
	require.NoError(t, app.Commit())
}