tsdb/agent: Fix deadlock from simultaneous GC and write (#10166)

* tsdb/agent: Fix deadlock from simultaneous GC and write

This commit fixes a potential deadlock where storing in-memory series
references could deadlock with a WAL GC cycle.

Signed-off-by: Robert Fratto <robertfratto@gmail.com>

* add missing license header

Signed-off-by: Robert Fratto <robertfratto@gmail.com>

* order local imports

Signed-off-by: Robert Fratto <robertfratto@gmail.com>

* align deadlock testing with discovery/manager_test.go method

Also prevents GCs from running concurrently, which could also cause a
deadlock (even though it's currently impossible for two GCs to run
concurrently).

Signed-off-by: Robert Fratto <robertfratto@gmail.com>
This commit is contained in:
Robert Fratto 2022-01-19 09:53:06 -05:00 committed by GitHub
parent bf959b36cb
commit b71a6dbbd1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 96 additions and 9 deletions

View file

@ -93,6 +93,8 @@ type stripeSeries struct {
series []map[chunks.HeadSeriesRef]*memSeries
hashes []seriesHashmap
locks []stripeLock
gcMut sync.Mutex
}
type stripeLock struct {
@ -120,8 +122,14 @@ func newStripeSeries(stripeSize int) *stripeSeries {
// GC garbage collects old series that have not received a sample after mint
// and will fully delete them.
func (s *stripeSeries) GC(mint int64) map[chunks.HeadSeriesRef]struct{} {
deleted := map[chunks.HeadSeriesRef]struct{}{}
// NOTE(rfratto): GC will grab two locks, one for the hash and the other for
// series. It's not valid for any other function to grab both locks,
// otherwise a deadlock might occur when running GC in parallel with
// appending.
s.gcMut.Lock()
defer s.gcMut.Unlock()
deleted := map[chunks.HeadSeriesRef]struct{}{}
for hashLock := 0; hashLock < s.size; hashLock++ {
s.locks[hashLock].Lock()
@ -179,14 +187,17 @@ func (s *stripeSeries) Set(hash uint64, series *memSeries) {
hashLock = hash & uint64(s.size-1)
refLock = uint64(series.ref) & uint64(s.size-1)
)
s.locks[hashLock].Lock()
defer s.locks[hashLock].Unlock()
if hashLock != refLock {
s.locks[refLock].Lock()
defer s.locks[refLock].Unlock()
}
s.hashes[hashLock].Set(hash, series)
// We can't hold both locks at once otherwise we might deadlock with a
// simulatenous call to GC.
//
// We update s.series first because GC expects anything in s.hashes to
// already exist in s.series.
s.locks[refLock].Lock()
s.series[refLock][series.ref] = series
s.locks[refLock].Unlock()
s.locks[hashLock].Lock()
s.hashes[hashLock].Set(hash, series)
s.locks[hashLock].Unlock()
}

76
tsdb/agent/series_test.go Normal file
View file

@ -0,0 +1,76 @@
// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package agent
import (
"fmt"
"math"
"sync"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/tsdb/chunks"
)
func TestNoDeadlock(t *testing.T) {
const numWorkers = 1000
var (
wg sync.WaitGroup
started = make(chan struct{})
stripeSeries = newStripeSeries(3)
)
wg.Add(numWorkers)
for i := 0; i < numWorkers; i++ {
go func() {
defer wg.Done()
<-started
_ = stripeSeries.GC(math.MaxInt64)
}()
}
wg.Add(numWorkers)
for i := 0; i < numWorkers; i++ {
go func(i int) {
defer wg.Done()
<-started
series := &memSeries{
ref: chunks.HeadSeriesRef(i),
lset: labels.FromMap(map[string]string{
"id": fmt.Sprintf("%d", i),
}),
}
stripeSeries.Set(series.lset.Hash(), series)
}(i)
}
finished := make(chan struct{})
go func() {
wg.Wait()
close(finished)
}()
close(started)
select {
case <-finished:
return
case <-time.After(15 * time.Second):
require.FailNow(t, "deadlock detected")
}
}