Decouple ruler dependency controller from concurrency controller

Signed-off-by: Marco Pracucci <marco@pracucci.com>
This commit is contained in:
Marco Pracucci 2024-02-02 10:06:37 +01:00
parent 6feffeb92e
commit 5ee3fbe825
No known key found for this signature in database
GPG key ID: 74C1BD403D2DF9B5
10 changed files with 298 additions and 18 deletions

View file

@ -142,6 +142,9 @@ type AlertingRule struct {
active map[uint64]*Alert
logger log.Logger
noDependentRules *atomic.Bool
noDependencyRules *atomic.Bool
}
// NewAlertingRule constructs a new AlertingRule.
@ -168,6 +171,8 @@ func NewAlertingRule(
evaluationTimestamp: atomic.NewTime(time.Time{}),
evaluationDuration: atomic.NewDuration(0),
lastError: atomic.NewError(nil),
noDependentRules: atomic.NewBool(false),
noDependencyRules: atomic.NewBool(false),
}
}
@ -317,6 +322,22 @@ func (r *AlertingRule) Restored() bool {
return r.restored.Load()
}
func (r *AlertingRule) SetNoDependentRules(noDependentRules bool) {
r.noDependentRules.Store(noDependentRules)
}
func (r *AlertingRule) NoDependentRules() bool {
return r.noDependentRules.Load()
}
func (r *AlertingRule) SetNoDependencyRules(noDependencyRules bool) {
r.noDependencyRules.Store(noDependencyRules)
}
func (r *AlertingRule) NoDependencyRules() bool {
return r.noDependencyRules.Load()
}
// resolvedRetention is the duration for which a resolved alert instance
// is kept in memory state and consequently repeatedly sent to the AlertManager.
const resolvedRetention = 15 * time.Minute

View file

@ -920,3 +920,45 @@ func TestAlertingEvalWithOrigin(t *testing.T) {
require.NoError(t, err)
require.Equal(t, detail, NewRuleDetail(rule))
}
func TestAlertingRule_SetNoDependentRules(t *testing.T) {
rule := NewAlertingRule(
"test",
&parser.NumberLiteral{Val: 1},
time.Minute,
0,
labels.FromStrings("test", "test"),
labels.EmptyLabels(),
labels.EmptyLabels(),
"",
true, log.NewNopLogger(),
)
require.False(t, rule.NoDependentRules())
rule.SetNoDependentRules(false)
require.False(t, rule.NoDependentRules())
rule.SetNoDependentRules(true)
require.True(t, rule.NoDependentRules())
}
func TestAlertingRule_SetNoDependencyRules(t *testing.T) {
rule := NewAlertingRule(
"test",
&parser.NumberLiteral{Val: 1},
time.Minute,
0,
labels.FromStrings("test", "test"),
labels.EmptyLabels(),
labels.EmptyLabels(),
"",
true, log.NewNopLogger(),
)
require.False(t, rule.NoDependencyRules())
rule.SetNoDependencyRules(false)
require.False(t, rule.NoDependencyRules())
rule.SetNoDependencyRules(true)
require.True(t, rule.NoDependencyRules())
}

View file

@ -579,7 +579,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
// If the rule has no dependencies, it can run concurrently because no other rules in this group depend on its output.
// Try run concurrently if there are slots available.
if ctrl := g.concurrencyController; ctrl.RuleEligible(g, rule) && ctrl.Allow() {
if ctrl := g.concurrencyController; isRuleEligibleForConcurrentExecution(rule) && ctrl.Allow() {
wg.Add(1)
go eval(i, rule, func() {
@ -1008,3 +1008,7 @@ func buildDependencyMap(rules []Rule) dependencyMap {
return dependencies
}
func isRuleEligibleForConcurrentExecution(rule Rule) bool {
return rule.NoDependentRules() && rule.NoDependencyRules()
}

View file

@ -119,6 +119,7 @@ type ManagerOptions struct {
MaxConcurrentEvals int64
ConcurrentEvalsEnabled bool
RuleConcurrencyController RuleConcurrencyController
RuleDependencyController RuleDependencyController
Metrics *Metrics
}
@ -142,6 +143,10 @@ func NewManager(o *ManagerOptions) *Manager {
}
}
if o.RuleDependencyController == nil {
o.RuleDependencyController = ruleDependencyController{}
}
m := &Manager{
groups: map[string]*Group{},
opts: o,
@ -188,8 +193,6 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels
m.mtx.Lock()
defer m.mtx.Unlock()
m.opts.RuleConcurrencyController.Invalidate()
groups, errs := m.LoadGroups(interval, externalLabels, externalURL, groupEvalIterationFunc, files...)
if errs != nil {
@ -322,6 +325,9 @@ func (m *Manager) LoadGroups(
))
}
// Check dependencies between rules and store it on the Rule itself.
m.opts.RuleDependencyController.AnalyseRules(rules)
groups[GroupKey(fn, rg.Name)] = NewGroup(GroupOptions{
Name: rg.Name,
File: fn,
@ -418,24 +424,35 @@ func SendAlerts(s Sender, externalURL string) NotifyFunc {
}
}
// RuleConcurrencyController controls whether rules can be evaluated concurrently. Its purpose is to bound the amount
// of concurrency in rule evaluations to avoid overwhelming the Prometheus server with additional query load and ensure
// the correctness of rules running concurrently. Concurrency is controlled globally, not on a per-group basis.
type RuleConcurrencyController interface {
// RuleEligible determines if the rule can guarantee correct results while running concurrently.
RuleEligible(g *Group, r Rule) bool
// RuleDependencyController controls whether a set of rules have dependencies between each other.
type RuleDependencyController interface {
// AnalyseRules analyses dependencies between the input rules. For each rule that it's guaranteed
// not having any dependants and/or dependency, this function should call Rule.SetNoDependentRules(true)
// and/or Rule.SetNoDependencyRules(true).
AnalyseRules(rules []Rule)
}
type ruleDependencyController struct{}
// AnalyseRules implements RuleDependencyController.
func (c ruleDependencyController) AnalyseRules(rules []Rule) {
depMap := buildDependencyMap(rules)
for _, r := range rules {
r.SetNoDependentRules(depMap.dependents(r) == 0)
r.SetNoDependencyRules(depMap.dependencies(r) == 0)
}
}
// RuleConcurrencyController controls concurrency for rules that are safe to be evaluated concurrently.
// Its purpose is to bound the amount of concurrency in rule evaluations to avoid overwhelming the Prometheus
// server with additional query load. Concurrency is controlled globally, not on a per-group basis.
type RuleConcurrencyController interface {
// Allow determines whether any concurrent evaluation slots are available.
// If Allow() returns true, then Done() must be called to release the acquired slot.
Allow() bool
// Done releases a concurrent evaluation slot.
Done()
// Invalidate instructs the controller to invalidate its state.
// This should be called when groups are modified (during a reload, for instance), because the controller may
// store some state about each group in order to more efficiently determine rule eligibility.
Invalidate()
}
// concurrentRuleEvalController holds a weighted semaphore which controls the concurrent evaluation of rules.

View file

@ -1314,6 +1314,8 @@ func TestRuleGroupEvalIterationFunc(t *testing.T) {
evaluationTimestamp: atomic.NewTime(time.Time{}),
evaluationDuration: atomic.NewDuration(0),
lastError: atomic.NewError(nil),
noDependentRules: atomic.NewBool(false),
noDependencyRules: atomic.NewBool(false),
}
group := NewGroup(GroupOptions{
@ -1407,6 +1409,66 @@ func TestNativeHistogramsInRecordingRules(t *testing.T) {
require.Equal(t, chunkenc.ValNone, it.Next())
}
func TestManager_LoadGroups_ShouldCheckWhetherEachRuleHasDependentsAndDependencies(t *testing.T) {
storage := teststorage.New(t)
t.Cleanup(func() {
require.NoError(t, storage.Close())
})
ruleManager := NewManager(&ManagerOptions{
Context: context.Background(),
Logger: log.NewNopLogger(),
Appendable: storage,
QueryFunc: func(ctx context.Context, q string, ts time.Time) (promql.Vector, error) { return nil, nil },
})
t.Run("load a mix of dependent and independent rules", func(t *testing.T) {
groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, []string{"fixtures/rules_multiple.yaml"}...)
require.Empty(t, errs)
require.Len(t, groups, 1)
expected := map[string]struct {
noDependentRules bool
noDependencyRules bool
}{
"job:http_requests:rate1m": {
noDependentRules: true,
noDependencyRules: true,
},
"job:http_requests:rate5m": {
noDependentRules: true,
noDependencyRules: true,
},
"job:http_requests:rate15m": {
noDependentRules: true,
noDependencyRules: false,
},
"TooManyRequests": {
noDependentRules: false,
noDependencyRules: true,
},
}
for _, r := range ruleManager.Rules() {
exp, ok := expected[r.Name()]
require.Truef(t, ok, "rule: %s", r.String())
require.Equalf(t, exp.noDependentRules, r.NoDependentRules(), "rule: %s", r.String())
require.Equalf(t, exp.noDependencyRules, r.NoDependencyRules(), "rule: %s", r.String())
}
})
t.Run("load only independent rules", func(t *testing.T) {
groups, errs := ruleManager.LoadGroups(time.Second, labels.EmptyLabels(), "", nil, []string{"fixtures/rules_multiple_independent.yaml"}...)
require.Empty(t, errs)
require.Len(t, groups, 1)
for _, r := range ruleManager.Rules() {
require.Truef(t, r.NoDependentRules(), "rule: %s", r.String())
require.Truef(t, r.NoDependencyRules(), "rule: %s", r.String())
}
})
}
func TestDependencyMap(t *testing.T) {
ctx := context.Background()
opts := &ManagerOptions{

View file

@ -28,6 +28,14 @@ type RuleDetail struct {
Query string
Labels labels.Labels
Kind string
// NoDependentRules is set to true if it's guaranteed that in the rule group there's no other rule
// which depends on this one.
NoDependentRules bool
// NoDependencyRules is set to true if it's guaranteed that this rule doesn't depend on any other
// rule within the rule group.
NoDependencyRules bool
}
const (
@ -48,10 +56,12 @@ func NewRuleDetail(r Rule) RuleDetail {
}
return RuleDetail{
Name: r.Name(),
Query: r.Query().String(),
Labels: r.Labels(),
Kind: kind,
Name: r.Name(),
Query: r.Query().String(),
Labels: r.Labels(),
Kind: kind,
NoDependentRules: r.NoDependentRules(),
NoDependencyRules: r.NoDependencyRules(),
}
}

View file

@ -19,6 +19,7 @@ import (
"testing"
"time"
"github.com/go-kit/log"
"github.com/stretchr/testify/require"
"github.com/prometheus/prometheus/model/labels"
@ -43,9 +44,73 @@ func (u unknownRule) SetEvaluationDuration(time.Duration) {}
func (u unknownRule) GetEvaluationDuration() time.Duration { return 0 }
func (u unknownRule) SetEvaluationTimestamp(time.Time) {}
func (u unknownRule) GetEvaluationTimestamp() time.Time { return time.Time{} }
func (u unknownRule) SetNoDependentRules(bool) {}
func (u unknownRule) NoDependentRules() bool { return false }
func (u unknownRule) SetNoDependencyRules(bool) {}
func (u unknownRule) NoDependencyRules() bool { return false }
func TestNewRuleDetailPanics(t *testing.T) {
require.PanicsWithValue(t, `unknown rule type "rules.unknownRule"`, func() {
NewRuleDetail(unknownRule{})
})
}
func TestFromOriginContext(t *testing.T) {
t.Run("should return zero value if RuleDetail is missing in the context", func(t *testing.T) {
detail := FromOriginContext(context.Background())
require.Zero(t, detail)
// The zero value for NoDependentRules must be the most conservative option.
require.False(t, detail.NoDependentRules)
// The zero value for NoDependencyRules must be the most conservative option.
require.False(t, detail.NoDependencyRules)
})
}
func TestNewRuleDetail(t *testing.T) {
t.Run("should populate NoDependentRules and NoDependencyRules for a RecordingRule", func(t *testing.T) {
rule := NewRecordingRule("test", &parser.NumberLiteral{Val: 1}, labels.EmptyLabels())
detail := NewRuleDetail(rule)
require.False(t, detail.NoDependentRules)
require.False(t, detail.NoDependencyRules)
rule.SetNoDependentRules(true)
detail = NewRuleDetail(rule)
require.True(t, detail.NoDependentRules)
require.False(t, detail.NoDependencyRules)
rule.SetNoDependencyRules(true)
detail = NewRuleDetail(rule)
require.True(t, detail.NoDependentRules)
require.True(t, detail.NoDependencyRules)
})
t.Run("should populate NoDependentRules and NoDependencyRules for a AlertingRule", func(t *testing.T) {
rule := NewAlertingRule(
"test",
&parser.NumberLiteral{Val: 1},
time.Minute,
0,
labels.FromStrings("test", "test"),
labels.EmptyLabels(),
labels.EmptyLabels(),
"",
true, log.NewNopLogger(),
)
detail := NewRuleDetail(rule)
require.False(t, detail.NoDependentRules)
require.False(t, detail.NoDependencyRules)
rule.SetNoDependentRules(true)
detail = NewRuleDetail(rule)
require.True(t, detail.NoDependentRules)
require.False(t, detail.NoDependencyRules)
rule.SetNoDependencyRules(true)
detail = NewRuleDetail(rule)
require.True(t, detail.NoDependentRules)
require.True(t, detail.NoDependencyRules)
})
}

View file

@ -41,6 +41,9 @@ type RecordingRule struct {
lastError *atomic.Error
// Duration of how long it took to evaluate the recording rule.
evaluationDuration *atomic.Duration
noDependentRules *atomic.Bool
noDependencyRules *atomic.Bool
}
// NewRecordingRule returns a new recording rule.
@ -53,6 +56,8 @@ func NewRecordingRule(name string, vector parser.Expr, lset labels.Labels) *Reco
evaluationTimestamp: atomic.NewTime(time.Time{}),
evaluationDuration: atomic.NewDuration(0),
lastError: atomic.NewError(nil),
noDependentRules: atomic.NewBool(false),
noDependencyRules: atomic.NewBool(false),
}
}
@ -166,3 +171,19 @@ func (rule *RecordingRule) SetEvaluationTimestamp(ts time.Time) {
func (rule *RecordingRule) GetEvaluationTimestamp() time.Time {
return rule.evaluationTimestamp.Load()
}
func (rule *RecordingRule) SetNoDependentRules(noDependentRules bool) {
rule.noDependentRules.Store(noDependentRules)
}
func (rule *RecordingRule) NoDependentRules() bool {
return rule.noDependentRules.Load()
}
func (rule *RecordingRule) SetNoDependencyRules(noDependencyRules bool) {
rule.noDependencyRules.Store(noDependencyRules)
}
func (rule *RecordingRule) NoDependencyRules() bool {
return rule.noDependencyRules.Load()
}

View file

@ -249,3 +249,25 @@ func TestRecordingEvalWithOrigin(t *testing.T) {
require.NoError(t, err)
require.Equal(t, detail, NewRuleDetail(rule))
}
func TestRecordingRule_SetNoDependentRules(t *testing.T) {
rule := NewRecordingRule("1", &parser.NumberLiteral{Val: 1}, labels.EmptyLabels())
require.False(t, rule.NoDependentRules())
rule.SetNoDependentRules(false)
require.False(t, rule.NoDependentRules())
rule.SetNoDependentRules(true)
require.True(t, rule.NoDependentRules())
}
func TestRecordingRule_SetNoDependencyRules(t *testing.T) {
rule := NewRecordingRule("1", &parser.NumberLiteral{Val: 1}, labels.EmptyLabels())
require.False(t, rule.NoDependencyRules())
rule.SetNoDependencyRules(false)
require.False(t, rule.NoDependencyRules())
rule.SetNoDependencyRules(true)
require.True(t, rule.NoDependencyRules())
}

View file

@ -61,4 +61,20 @@ type Rule interface {
// GetEvaluationTimestamp returns last evaluation timestamp.
// NOTE: Used dynamically by rules.html template.
GetEvaluationTimestamp() time.Time
// SetNoDependentRules sets whether there's no other rule in the rule group that depends on this rule.
SetNoDependentRules(bool)
// NoDependentRules returns true if it's guaranteed that in the rule group there's no other rule
// which depends on this one. In case this function returns false there's no such guarantee, which
// means there may or may not be other rules depending on this one.
NoDependentRules() bool
// SetNoDependencyRules sets whether this rule doesn't depend on the output of any rule in the rule group.
SetNoDependencyRules(bool)
// NoDependencyRules returns true if it's guaranteed that this rule doesn't depend on the output of
// any other rule in the group. In case this function returns false there's no such guarantee, which
// means the rule may or may not depend on other rules.
NoDependencyRules() bool
}