mirror of
https://github.com/prometheus/prometheus.git
synced 2025-03-05 20:59:13 -08:00
Remove storage hook and add metrics for errors
Signed-off-by: Mustafain Ali Khan <mustalik@amazon.com>
This commit is contained in:
parent
5660122c5f
commit
c727e0945b
|
@ -813,7 +813,7 @@ func main() {
|
|||
queryEngine = promql.NewEngine(opts)
|
||||
var alertStore rules.AlertStore
|
||||
if cfg.enableAlertStatePersistence {
|
||||
alertStore = rules.NewFileStore(logger.With("component", "alertStore"), cfg.alertStoragePath)
|
||||
alertStore = rules.NewFileStore(logger.With("component", "alertStore"), cfg.alertStoragePath, prometheus.DefaultRegisterer)
|
||||
}
|
||||
|
||||
ruleManager = rules.NewManager(&rules.ManagerOptions{
|
||||
|
@ -834,7 +834,6 @@ func main() {
|
|||
return time.Duration(cfgFile.GlobalConfig.RuleQueryOffset)
|
||||
},
|
||||
AlertStore: alertStore,
|
||||
AlertStoreFunc: rules.DefaultAlertStoreFunc,
|
||||
})
|
||||
}
|
||||
|
||||
|
|
|
@ -75,7 +75,6 @@ type Group struct {
|
|||
evalIterationFunc GroupEvalIterationFunc
|
||||
|
||||
appOpts *storage.AppendOptions
|
||||
alertStoreFunc AlertStateStoreFunc
|
||||
alertStore AlertStore
|
||||
}
|
||||
|
||||
|
@ -96,7 +95,6 @@ type GroupOptions struct {
|
|||
QueryOffset *time.Duration
|
||||
done chan struct{}
|
||||
EvalIterationFunc GroupEvalIterationFunc
|
||||
AlertStoreFunc AlertStateStoreFunc
|
||||
AlertStore AlertStore
|
||||
}
|
||||
|
||||
|
@ -128,11 +126,6 @@ func NewGroup(o GroupOptions) *Group {
|
|||
evalIterationFunc = DefaultEvalIterationFunc
|
||||
}
|
||||
|
||||
alertStoreFunc := o.AlertStoreFunc
|
||||
if alertStoreFunc == nil {
|
||||
alertStoreFunc = DefaultAlertStoreFunc
|
||||
}
|
||||
|
||||
if opts.Logger == nil {
|
||||
opts.Logger = promslog.NewNopLogger()
|
||||
}
|
||||
|
@ -154,7 +147,6 @@ func NewGroup(o GroupOptions) *Group {
|
|||
metrics: metrics,
|
||||
evalIterationFunc: evalIterationFunc,
|
||||
appOpts: &storage.AppendOptions{DiscardOutOfOrder: true},
|
||||
alertStoreFunc: alertStoreFunc,
|
||||
alertStore: o.AlertStore,
|
||||
}
|
||||
}
|
||||
|
@ -554,7 +546,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) {
|
|||
restoredAlerts, _ := g.alertStore.GetAlerts(ar.GetFingerprint(GroupKey(g.File(), g.Name())))
|
||||
if len(restoredAlerts) > 0 {
|
||||
ar.SetActiveAlerts(restoredAlerts)
|
||||
logger.Info("Restored alerts from store", "rule", ar.name, "alerts", len(restoredAlerts))
|
||||
g.logger.Info("Restored alerts from store", "rule", ar.name, "alerts", len(restoredAlerts))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1190,3 +1182,35 @@ func buildDependencyMap(rules []Rule) dependencyMap {
|
|||
|
||||
return dependencies
|
||||
}
|
||||
|
||||
// AlertStore provides persistent storage of alert state.
|
||||
type AlertStore interface {
|
||||
// SetAlerts stores the provided list of alerts for a rule.
|
||||
SetAlerts(key uint64, groupKey string, alerts []*Alert) error
|
||||
// GetAlerts returns a list of alerts for each alerting rule,
|
||||
// alerting rule is identified by a fingerprint of its config.
|
||||
GetAlerts(key uint64) (map[uint64]*Alert, error)
|
||||
}
|
||||
|
||||
// StoreKeepFiringForState is periodically invoked to store the state of alerting rules using 'keep_firing_for'.
|
||||
func (g *Group) StoreKeepFiringForState() {
|
||||
for _, rule := range g.rules {
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if ar.KeepFiringFor() != 0 {
|
||||
alertsToStore := make([]*Alert, 0)
|
||||
ar.ForEachActiveAlert(func(alert *Alert) {
|
||||
if !alert.KeepFiringSince.IsZero() {
|
||||
alertsToStore = append(alertsToStore, alert)
|
||||
}
|
||||
})
|
||||
groupKey := GroupKey(g.File(), g.Name())
|
||||
err := g.alertStore.SetAlerts(ar.GetFingerprint(groupKey), groupKey, alertsToStore)
|
||||
if err != nil {
|
||||
g.logger.Error("Failed to store alerting rule state", "rule", ar.Name(), "err", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -89,33 +89,7 @@ func DefaultEvalIterationFunc(ctx context.Context, g *Group, evalTimestamp time.
|
|||
|
||||
if g.alertStore != nil {
|
||||
// feature enabled.
|
||||
go func() {
|
||||
g.alertStoreFunc(g)
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
// DefaultAlertStoreFunc is the default implementation of
|
||||
// AlertStateStoreFunc that is periodically invoked to store the state
|
||||
// of alerting rules in a group at a given point in time.
|
||||
func DefaultAlertStoreFunc(g *Group) {
|
||||
for _, rule := range g.rules {
|
||||
ar, ok := rule.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if ar.KeepFiringFor() != 0 {
|
||||
alertsToStore := make([]*Alert, 0)
|
||||
ar.ForEachActiveAlert(func(alert *Alert) {
|
||||
if !alert.KeepFiringSince.IsZero() {
|
||||
alertsToStore = append(alertsToStore, alert)
|
||||
}
|
||||
})
|
||||
err := g.alertStore.SetAlerts(ar.GetFingerprint(GroupKey(g.File(), g.Name())), alertsToStore)
|
||||
if err != nil {
|
||||
g.logger.Error("Failed to store alerting rule state", "rule", ar.Name(), "err", err)
|
||||
}
|
||||
}
|
||||
g.StoreKeepFiringForState()
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -155,7 +129,6 @@ type ManagerOptions struct {
|
|||
RuleConcurrencyController RuleConcurrencyController
|
||||
RuleDependencyController RuleDependencyController
|
||||
AlertStore AlertStore
|
||||
AlertStoreFunc AlertStateStoreFunc
|
||||
// At present, manager only restores `for` state when manager is newly created which happens
|
||||
// during restarts. This flag provides an option to restore the `for` state when new rule groups are
|
||||
// added to an existing manager
|
||||
|
@ -239,8 +212,6 @@ func (m *Manager) Stop() {
|
|||
m.logger.Info("Rule manager stopped")
|
||||
}
|
||||
|
||||
type AlertStateStoreFunc func(g *Group)
|
||||
|
||||
// Update the rule manager's state as the config requires. If
|
||||
// loading the new rules failed the old rule set is restored.
|
||||
// This method will no-op in case the manager is already stopped.
|
||||
|
@ -402,7 +373,6 @@ func (m *Manager) LoadGroups(
|
|||
QueryOffset: (*time.Duration)(rg.QueryOffset),
|
||||
done: m.done,
|
||||
EvalIterationFunc: groupEvalIterationFunc,
|
||||
AlertStoreFunc: m.opts.AlertStoreFunc,
|
||||
AlertStore: m.opts.AlertStore,
|
||||
})
|
||||
}
|
||||
|
|
|
@ -2568,8 +2568,8 @@ func TestKeepFiringForStateRestore(t *testing.T) {
|
|||
},
|
||||
)
|
||||
|
||||
alertStore := NewFileStore(promslog.NewNopLogger(), testStoreFile)
|
||||
ng := testEngine(t)
|
||||
alertStore := NewFileStore(promslog.NewNopLogger(), testStoreFile, prometheus.NewRegistry())
|
||||
opts := &ManagerOptions{
|
||||
QueryFunc: EngineQueryFunc(ng, testStorage),
|
||||
Appendable: testStorage,
|
||||
|
@ -2613,7 +2613,6 @@ func TestKeepFiringForStateRestore(t *testing.T) {
|
|||
Rules: []Rule{rule, rule2},
|
||||
ShouldRestore: true,
|
||||
Opts: opts,
|
||||
AlertStoreFunc: DefaultAlertStoreFunc,
|
||||
AlertStore: alertStore,
|
||||
})
|
||||
|
||||
|
@ -2657,7 +2656,7 @@ func TestKeepFiringForStateRestore(t *testing.T) {
|
|||
group.Eval(opts.Context, evalTime)
|
||||
group.setLastEvalTimestamp(evalTime)
|
||||
// Manager will store alert state.
|
||||
DefaultAlertStoreFunc(group)
|
||||
group.StoreKeepFiringForState()
|
||||
}
|
||||
|
||||
exp := rule.ActiveAlerts()
|
||||
|
@ -2683,7 +2682,7 @@ func TestKeepFiringForStateRestore(t *testing.T) {
|
|||
labels.FromStrings("annotation_test", "rule2"), labels.EmptyLabels(), "", true, nil,
|
||||
)
|
||||
// Restart alert store.
|
||||
newAlertStore := NewFileStore(promslog.NewNopLogger(), testStoreFile)
|
||||
newAlertStore := NewFileStore(promslog.NewNopLogger(), testStoreFile, prometheus.NewRegistry())
|
||||
|
||||
newGroup := NewGroup(GroupOptions{
|
||||
Name: "default",
|
||||
|
@ -2692,7 +2691,6 @@ func TestKeepFiringForStateRestore(t *testing.T) {
|
|||
ShouldRestore: true,
|
||||
Opts: opts,
|
||||
AlertStore: newAlertStore,
|
||||
AlertStoreFunc: DefaultAlertStoreFunc,
|
||||
})
|
||||
|
||||
newGroups := make(map[string]*Group)
|
||||
|
|
|
@ -5,16 +5,9 @@ import (
|
|||
"log/slog"
|
||||
"os"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// AlertStore provides persistent storage of alert state.
|
||||
type AlertStore interface {
|
||||
// SetAlerts stores the provided list of alerts for a rule.
|
||||
SetAlerts(key uint64, alerts []*Alert) error
|
||||
// GetAlerts returns a list of alerts for each alerting rule,
|
||||
// alerting rule is identified by a fingerprint of its config.
|
||||
GetAlerts(key uint64) (map[uint64]*Alert, error)
|
||||
}
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
// FileStore implements the AlertStore interface.
|
||||
type FileStore struct {
|
||||
|
@ -23,23 +16,46 @@ type FileStore struct {
|
|||
// protects the `alertsByRule` map.
|
||||
stateMtx sync.RWMutex
|
||||
path string
|
||||
registerer prometheus.Registerer
|
||||
storeInitErrors prometheus.Counter
|
||||
alertStoreErrors *prometheus.CounterVec
|
||||
}
|
||||
|
||||
func NewFileStore(l *slog.Logger, storagePath string) *FileStore {
|
||||
func NewFileStore(l *slog.Logger, storagePath string, registerer prometheus.Registerer) *FileStore {
|
||||
s := &FileStore{
|
||||
logger: l,
|
||||
alertsByRule: make(map[uint64][]*Alert),
|
||||
path: storagePath,
|
||||
registerer: registerer,
|
||||
}
|
||||
s.storeInitErrors = prometheus.NewCounter(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Name: "alert_store_init_errors_total",
|
||||
Help: "The total number of errors starting alert store.",
|
||||
},
|
||||
)
|
||||
s.alertStoreErrors = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Name: "rule_group_alert_store_errors_total",
|
||||
Help: "The total number of errors in alert store.",
|
||||
},
|
||||
[]string{"rule_group"},
|
||||
)
|
||||
s.initState()
|
||||
return s
|
||||
}
|
||||
|
||||
// initState reads the state from file storage into the alertsByRule map.
|
||||
func (s *FileStore) initState() {
|
||||
if s.registerer != nil {
|
||||
s.registerer.MustRegister(s.alertStoreErrors, s.storeInitErrors)
|
||||
}
|
||||
file, err := os.OpenFile(s.path, os.O_RDWR|os.O_CREATE, 0o666)
|
||||
if err != nil {
|
||||
s.logger.Error("Failed reading alerts state from file", "err", err)
|
||||
s.storeInitErrors.Inc()
|
||||
return
|
||||
}
|
||||
defer file.Close()
|
||||
|
@ -48,6 +64,7 @@ func (s *FileStore) initState() {
|
|||
err = json.NewDecoder(file).Decode(&alertsByRule)
|
||||
if err != nil {
|
||||
s.logger.Error("Failed reading alerts state from file", "err", err)
|
||||
s.storeInitErrors.Inc()
|
||||
}
|
||||
if alertsByRule == nil {
|
||||
alertsByRule = make(map[uint64][]*Alert)
|
||||
|
@ -77,7 +94,7 @@ func (s *FileStore) GetAlerts(key uint64) (map[uint64]*Alert, error) {
|
|||
}
|
||||
|
||||
// SetAlerts updates the alertsByRule map and writes state to file storage.
|
||||
func (s *FileStore) SetAlerts(key uint64, alerts []*Alert) error {
|
||||
func (s *FileStore) SetAlerts(key uint64, groupKey string, alerts []*Alert) error {
|
||||
s.stateMtx.Lock()
|
||||
defer s.stateMtx.Unlock()
|
||||
|
||||
|
@ -88,6 +105,7 @@ func (s *FileStore) SetAlerts(key uint64, alerts []*Alert) error {
|
|||
// flush in memory state to file storage
|
||||
file, err := os.Create(s.path)
|
||||
if err != nil {
|
||||
s.alertStoreErrors.WithLabelValues(groupKey).Inc()
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
|
@ -95,6 +113,7 @@ func (s *FileStore) SetAlerts(key uint64, alerts []*Alert) error {
|
|||
encoder := json.NewEncoder(file)
|
||||
err = encoder.Encode(s.alertsByRule)
|
||||
if err != nil {
|
||||
s.alertStoreErrors.WithLabelValues(groupKey).Inc()
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
|
|
|
@ -5,6 +5,7 @@ import (
|
|||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/prometheus/common/promslog"
|
||||
|
@ -12,7 +13,7 @@ import (
|
|||
)
|
||||
|
||||
func TestAlertStore(t *testing.T) {
|
||||
alertStore := NewFileStore(promslog.NewNopLogger(), "alertstoretest")
|
||||
alertStore := NewFileStore(promslog.NewNopLogger(), "alertstoretest", prometheus.NewRegistry())
|
||||
t.Cleanup(func() {
|
||||
os.Remove("alertstoretest")
|
||||
})
|
||||
|
@ -29,7 +30,7 @@ func TestAlertStore(t *testing.T) {
|
|||
|
||||
for key, alerts := range alertsByRule {
|
||||
sortAlerts(alerts)
|
||||
err := alertStore.SetAlerts(key, alerts)
|
||||
err := alertStore.SetAlerts(key, "test/test1", alerts)
|
||||
require.NoError(t, err)
|
||||
|
||||
got, err := alertStore.GetAlerts(key)
|
||||
|
|
Loading…
Reference in a new issue