Add metrics to count inconsistencies and fp collisions.

This commit is contained in:
beorn7 2015-05-21 17:50:06 +02:00
parent c44e7cd105
commit 3b9ab546e6
3 changed files with 64 additions and 31 deletions

View file

@ -7,6 +7,7 @@ import (
"sync"
"sync/atomic"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/log"
clientmodel "github.com/prometheus/client_golang/model"
@ -31,22 +32,31 @@ type fpMapper struct {
fpToSeries *seriesMap
p *persistence
mappingsCounter prometheus.Counter
}
// newFPMapper loads the collision map from the persistence and
// returns an fpMapper ready to use.
func newFPMapper(fpToSeries *seriesMap, p *persistence) (*fpMapper, error) {
r := &fpMapper{
m := &fpMapper{
fpToSeries: fpToSeries,
p: p,
mappingsCounter: prometheus.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "fingerprint_mappings_total",
Help: "The total number of fingerprints being mapped to avoid collisions.",
}),
}
mappings, nextFP, err := p.loadFPMappings()
if err != nil {
return nil, err
}
r.mappings = mappings
r.highestMappedFP = nextFP
return r, nil
m.mappings = mappings
m.mappingsCounter.Set(float64(len(m.mappings)))
m.highestMappedFP = nextFP
return m, nil
}
// mapFP takes a raw fingerprint (as returned by Metrics.FastFingerprint) and
@ -55,33 +65,33 @@ func newFPMapper(fpToSeries *seriesMap, p *persistence) (*fpMapper, error) {
//
// If an error is encountered, it is returned together with the unchanged raw
// fingerprint.
func (r *fpMapper) mapFP(fp clientmodel.Fingerprint, m clientmodel.Metric) (clientmodel.Fingerprint, error) {
func (m *fpMapper) mapFP(fp clientmodel.Fingerprint, metric clientmodel.Metric) (clientmodel.Fingerprint, error) {
// First check if we are in the reserved FP space, in which case this is
// automatically a collision that has to be mapped.
if fp <= maxMappedFP {
return r.maybeAddMapping(fp, m)
return m.maybeAddMapping(fp, metric)
}
// Then check the most likely case: This fp belongs to a series that is
// already in memory.
s, ok := r.fpToSeries.get(fp)
s, ok := m.fpToSeries.get(fp)
if ok {
// FP exists in memory, but is it for the same metric?
if m.Equal(s.metric) {
if metric.Equal(s.metric) {
// Yupp. We are done.
return fp, nil
}
// Collision detected!
return r.maybeAddMapping(fp, m)
return m.maybeAddMapping(fp, metric)
}
// Metric is not in memory. Before doing the expensive archive lookup,
// check if we have a mapping for this metric in place already.
r.mtx.RLock()
mappedFPs, fpAlreadyMapped := r.mappings[fp]
r.mtx.RUnlock()
m.mtx.RLock()
mappedFPs, fpAlreadyMapped := m.mappings[fp]
m.mtx.RUnlock()
if fpAlreadyMapped {
// We indeed have mapped fp historically.
ms := metricToUniqueString(m)
ms := metricToUniqueString(metric)
// fp is locked by the caller, so no further locking of
// 'collisions' required (it is specific to fp).
mappedFP, ok := mappedFPs[ms]
@ -93,18 +103,18 @@ func (r *fpMapper) mapFP(fp clientmodel.Fingerprint, m clientmodel.Metric) (clie
// If we are here, FP does not exist in memory and is either not mapped
// at all, or existing mappings for FP are not for m. Check if we have
// something for FP in the archive.
archivedMetric, err := r.p.archivedMetric(fp)
archivedMetric, err := m.p.archivedMetric(fp)
if err != nil {
return fp, err
}
if archivedMetric != nil {
// FP exists in archive, but is it for the same metric?
if m.Equal(archivedMetric) {
if metric.Equal(archivedMetric) {
// Yupp. We are done.
return fp, nil
}
// Collision detected!
return r.maybeAddMapping(fp, m)
return m.maybeAddMapping(fp, metric)
}
// As fp does not exist, neither in memory nor in archive, we can safely
// keep it unmapped.
@ -114,14 +124,14 @@ func (r *fpMapper) mapFP(fp clientmodel.Fingerprint, m clientmodel.Metric) (clie
// maybeAddMapping is only used internally. It takes a detected collision and
// adds it to the collisions map if not yet there. In any case, it returns the
// truly unique fingerprint for the colliding metric.
func (r *fpMapper) maybeAddMapping(
func (m *fpMapper) maybeAddMapping(
fp clientmodel.Fingerprint,
collidingMetric clientmodel.Metric,
) (clientmodel.Fingerprint, error) {
ms := metricToUniqueString(collidingMetric)
r.mtx.RLock()
mappedFPs, ok := r.mappings[fp]
r.mtx.RUnlock()
m.mtx.RLock()
mappedFPs, ok := m.mappings[fp]
m.mtx.RUnlock()
if ok {
// fp is locked by the caller, so no further locking required.
mappedFP, ok := mappedFPs[ms]
@ -129,12 +139,12 @@ func (r *fpMapper) maybeAddMapping(
return mappedFP, nil // Existing mapping.
}
// A new mapping has to be created.
mappedFP = r.nextMappedFP()
mappedFP = m.nextMappedFP()
mappedFPs[ms] = mappedFP
r.mtx.RLock()
m.mtx.RLock()
// Checkpoint mappings after each change.
err := r.p.checkpointFPMappings(r.mappings)
r.mtx.RUnlock()
err := m.p.checkpointFPMappings(m.mappings)
m.mtx.RUnlock()
log.Infof(
"Collision detected for fingerprint %v, metric %v, mapping to new fingerprint %v.",
fp, collidingMetric, mappedFP,
@ -142,13 +152,14 @@ func (r *fpMapper) maybeAddMapping(
return mappedFP, err
}
// This is the first collision for fp.
mappedFP := r.nextMappedFP()
mappedFP := m.nextMappedFP()
mappedFPs = map[string]clientmodel.Fingerprint{ms: mappedFP}
r.mtx.Lock()
r.mappings[fp] = mappedFPs
m.mtx.Lock()
m.mappings[fp] = mappedFPs
m.mappingsCounter.Inc()
// Checkpoint mappings after each change.
err := r.p.checkpointFPMappings(r.mappings)
r.mtx.Unlock()
err := m.p.checkpointFPMappings(m.mappings)
m.mtx.Unlock()
log.Infof(
"Collision detected for fingerprint %v, metric %v, mapping to new fingerprint %v.",
fp, collidingMetric, mappedFP,
@ -156,14 +167,24 @@ func (r *fpMapper) maybeAddMapping(
return mappedFP, err
}
func (r *fpMapper) nextMappedFP() clientmodel.Fingerprint {
mappedFP := clientmodel.Fingerprint(atomic.AddUint64((*uint64)(&r.highestMappedFP), 1))
func (m *fpMapper) nextMappedFP() clientmodel.Fingerprint {
mappedFP := clientmodel.Fingerprint(atomic.AddUint64((*uint64)(&m.highestMappedFP), 1))
if mappedFP > maxMappedFP {
panic(fmt.Errorf("more than %v fingerprints mapped in collision detection", maxMappedFP))
}
return mappedFP
}
// Describe implements prometheus.Collector.
func (m *fpMapper) Describe(ch chan<- *prometheus.Desc) {
ch <- m.mappingsCounter.Desc()
}
// Collect implements prometheus.Collector.
func (m *fpMapper) Collect(ch chan<- prometheus.Metric) {
ch <- m.mappingsCounter
}
// metricToUniqueString turns a metric into a string in a reproducible and
// unique way, i.e. the same metric will always create the same string, and
// different metrics will always create different strings. In a way, it is the

View file

@ -120,6 +120,7 @@ type persistence struct {
indexingBatchSizes prometheus.Summary
indexingBatchDuration prometheus.Summary
checkpointDuration prometheus.Gauge
dirtyCounter prometheus.Counter
dirtyMtx sync.Mutex // Protects dirty and becameDirty.
dirty bool // true if persistence was started in dirty state.
@ -237,6 +238,12 @@ func newPersistence(basePath string, dirty, pedanticChecks bool, shouldSync sync
Name: "checkpoint_duration_milliseconds",
Help: "The duration (in milliseconds) it took to checkpoint in-memory metrics and head chunks.",
}),
dirtyCounter: prometheus.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "inconsistencies_total",
Help: "A counter incremented each time an inconsistency in the local storage is detected. If this is greater zero, restart the server as soon as possible.",
}),
dirty: dirty,
pedanticChecks: pedanticChecks,
dirtyFileName: dirtyPath,
@ -282,6 +289,7 @@ func (p *persistence) Describe(ch chan<- *prometheus.Desc) {
p.indexingBatchSizes.Describe(ch)
p.indexingBatchDuration.Describe(ch)
ch <- p.checkpointDuration.Desc()
ch <- p.dirtyCounter.Desc()
}
// Collect implements prometheus.Collector.
@ -293,6 +301,7 @@ func (p *persistence) Collect(ch chan<- prometheus.Metric) {
p.indexingBatchSizes.Collect(ch)
p.indexingBatchDuration.Collect(ch)
ch <- p.checkpointDuration
ch <- p.dirtyCounter
}
// isDirty returns the dirty flag in a goroutine-safe way.
@ -307,6 +316,7 @@ func (p *persistence) isDirty() bool {
// dirty during our runtime, there is no way back. If we were dirty from the
// start, a clean-up might make us clean again.)
func (p *persistence) setDirty(dirty bool) {
p.dirtyCounter.Inc()
p.dirtyMtx.Lock()
defer p.dirtyMtx.Unlock()
if p.becameDirty {

View file

@ -1003,6 +1003,7 @@ func (s *memorySeriesStorage) persistenceBacklogScore() float64 {
// Describe implements prometheus.Collector.
func (s *memorySeriesStorage) Describe(ch chan<- *prometheus.Desc) {
s.persistence.Describe(ch)
s.mapper.Describe(ch)
ch <- s.persistErrors.Desc()
ch <- maxChunksToPersistDesc
@ -1018,6 +1019,7 @@ func (s *memorySeriesStorage) Describe(ch chan<- *prometheus.Desc) {
// Collect implements prometheus.Collector.
func (s *memorySeriesStorage) Collect(ch chan<- prometheus.Metric) {
s.persistence.Collect(ch)
s.mapper.Collect(ch)
ch <- s.persistErrors
ch <- prometheus.MustNewConstMetric(