mirror of
https://github.com/prometheus/prometheus.git
synced 2025-01-11 22:07:27 -08:00
Snapshot in-memory chunks on shutdown for faster restarts (#7229)
Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
This commit is contained in:
parent
5df37892aa
commit
ee7e0071d1
|
@ -149,6 +149,9 @@ func (c *flagConfig) setFeatureListOptions(logger log.Logger) error {
|
||||||
case "exemplar-storage":
|
case "exemplar-storage":
|
||||||
c.tsdb.EnableExemplarStorage = true
|
c.tsdb.EnableExemplarStorage = true
|
||||||
level.Info(logger).Log("msg", "Experimental in-memory exemplar storage enabled")
|
level.Info(logger).Log("msg", "Experimental in-memory exemplar storage enabled")
|
||||||
|
case "memory-snapshot-on-shutdown":
|
||||||
|
c.tsdb.EnableMemorySnapshotOnShutdown = true
|
||||||
|
level.Info(logger).Log("msg", "Experimental memory snapshot on shutdown enabled")
|
||||||
case "":
|
case "":
|
||||||
continue
|
continue
|
||||||
default:
|
default:
|
||||||
|
@ -309,7 +312,7 @@ func main() {
|
||||||
a.Flag("query.max-samples", "Maximum number of samples a single query can load into memory. Note that queries will fail if they try to load more samples than this into memory, so this also limits the number of samples a query can return.").
|
a.Flag("query.max-samples", "Maximum number of samples a single query can load into memory. Note that queries will fail if they try to load more samples than this into memory, so this also limits the number of samples a query can return.").
|
||||||
Default("50000000").IntVar(&cfg.queryMaxSamples)
|
Default("50000000").IntVar(&cfg.queryMaxSamples)
|
||||||
|
|
||||||
a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: promql-at-modifier, promql-negative-offset, remote-write-receiver, exemplar-storage, expand-external-labels. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details.").
|
a.Flag("enable-feature", "Comma separated feature names to enable. Valid options: exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-at-modifier, promql-negative-offset, remote-write-receiver. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details.").
|
||||||
Default("").StringsVar(&cfg.featureList)
|
Default("").StringsVar(&cfg.featureList)
|
||||||
|
|
||||||
promlogflag.AddFlags(a, &cfg.promlogConfig)
|
promlogflag.AddFlags(a, &cfg.promlogConfig)
|
||||||
|
@ -1263,34 +1266,36 @@ func (rm *readyScrapeManager) Get() (*scrape.Manager, error) {
|
||||||
// tsdbOptions is tsdb.Option version with defined units.
|
// tsdbOptions is tsdb.Option version with defined units.
|
||||||
// This is required as tsdb.Option fields are unit agnostic (time).
|
// This is required as tsdb.Option fields are unit agnostic (time).
|
||||||
type tsdbOptions struct {
|
type tsdbOptions struct {
|
||||||
WALSegmentSize units.Base2Bytes
|
WALSegmentSize units.Base2Bytes
|
||||||
MaxBlockChunkSegmentSize units.Base2Bytes
|
MaxBlockChunkSegmentSize units.Base2Bytes
|
||||||
RetentionDuration model.Duration
|
RetentionDuration model.Duration
|
||||||
MaxBytes units.Base2Bytes
|
MaxBytes units.Base2Bytes
|
||||||
NoLockfile bool
|
NoLockfile bool
|
||||||
AllowOverlappingBlocks bool
|
AllowOverlappingBlocks bool
|
||||||
WALCompression bool
|
WALCompression bool
|
||||||
StripeSize int
|
StripeSize int
|
||||||
MinBlockDuration model.Duration
|
MinBlockDuration model.Duration
|
||||||
MaxBlockDuration model.Duration
|
MaxBlockDuration model.Duration
|
||||||
EnableExemplarStorage bool
|
EnableExemplarStorage bool
|
||||||
MaxExemplars int64
|
MaxExemplars int64
|
||||||
|
EnableMemorySnapshotOnShutdown bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func (opts tsdbOptions) ToTSDBOptions() tsdb.Options {
|
func (opts tsdbOptions) ToTSDBOptions() tsdb.Options {
|
||||||
return tsdb.Options{
|
return tsdb.Options{
|
||||||
WALSegmentSize: int(opts.WALSegmentSize),
|
WALSegmentSize: int(opts.WALSegmentSize),
|
||||||
MaxBlockChunkSegmentSize: int64(opts.MaxBlockChunkSegmentSize),
|
MaxBlockChunkSegmentSize: int64(opts.MaxBlockChunkSegmentSize),
|
||||||
RetentionDuration: int64(time.Duration(opts.RetentionDuration) / time.Millisecond),
|
RetentionDuration: int64(time.Duration(opts.RetentionDuration) / time.Millisecond),
|
||||||
MaxBytes: int64(opts.MaxBytes),
|
MaxBytes: int64(opts.MaxBytes),
|
||||||
NoLockfile: opts.NoLockfile,
|
NoLockfile: opts.NoLockfile,
|
||||||
AllowOverlappingBlocks: opts.AllowOverlappingBlocks,
|
AllowOverlappingBlocks: opts.AllowOverlappingBlocks,
|
||||||
WALCompression: opts.WALCompression,
|
WALCompression: opts.WALCompression,
|
||||||
StripeSize: opts.StripeSize,
|
StripeSize: opts.StripeSize,
|
||||||
MinBlockDuration: int64(time.Duration(opts.MinBlockDuration) / time.Millisecond),
|
MinBlockDuration: int64(time.Duration(opts.MinBlockDuration) / time.Millisecond),
|
||||||
MaxBlockDuration: int64(time.Duration(opts.MaxBlockDuration) / time.Millisecond),
|
MaxBlockDuration: int64(time.Duration(opts.MaxBlockDuration) / time.Millisecond),
|
||||||
EnableExemplarStorage: opts.EnableExemplarStorage,
|
EnableExemplarStorage: opts.EnableExemplarStorage,
|
||||||
MaxExemplars: opts.MaxExemplars,
|
MaxExemplars: opts.MaxExemplars,
|
||||||
|
EnableMemorySnapshotOnShutdown: opts.EnableMemorySnapshotOnShutdown,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -40,7 +40,7 @@ with more recent data.
|
||||||
|
|
||||||
More details can be found [here](querying/basics.md#offset-modifier).
|
More details can be found [here](querying/basics.md#offset-modifier).
|
||||||
|
|
||||||
## Remote Write Receiver
|
## Remote Write Receiver
|
||||||
|
|
||||||
`--enable-feature=remote-write-receiver`
|
`--enable-feature=remote-write-receiver`
|
||||||
|
|
||||||
|
@ -53,3 +53,11 @@ The remote write receiver allows Prometheus to accept remote write requests from
|
||||||
[OpenMetrics](https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#exemplars) introduces the ability for scrape targets to add exemplars to certain metrics. Exemplars are references to data outside of the MetricSet. A common use case are IDs of program traces.
|
[OpenMetrics](https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#exemplars) introduces the ability for scrape targets to add exemplars to certain metrics. Exemplars are references to data outside of the MetricSet. A common use case are IDs of program traces.
|
||||||
|
|
||||||
Exemplar storage is implemented as a fixed size circular buffer that stores exemplars in memory for all series. Enabling this feature will enable the storage of exemplars scraped by Prometheus. The flag `storage.exemplars.exemplars-limit` can be used to control the size of circular buffer by # of exemplars. An exemplar with just a `traceID=<jaeger-trace-id>` uses roughly 100 bytes of memory via the in-memory exemplar storage. If the exemplar storage is enabled, we will also append the exemplars to WAL for local persistence (for WAL duration).
|
Exemplar storage is implemented as a fixed size circular buffer that stores exemplars in memory for all series. Enabling this feature will enable the storage of exemplars scraped by Prometheus. The flag `storage.exemplars.exemplars-limit` can be used to control the size of circular buffer by # of exemplars. An exemplar with just a `traceID=<jaeger-trace-id>` uses roughly 100 bytes of memory via the in-memory exemplar storage. If the exemplar storage is enabled, we will also append the exemplars to WAL for local persistence (for WAL duration).
|
||||||
|
|
||||||
|
## Memory Snapshot on Shutdown
|
||||||
|
|
||||||
|
`--enable-feature=memory-snapshot-on-shutdown`
|
||||||
|
|
||||||
|
This takes the snapshot of the chunks that are in memory along with the series information when shutting down and stores
|
||||||
|
it on disk. This will reduce the startup time since the memory state can be restored with this snapshot and m-mapped
|
||||||
|
chunks without the need of WAL replay.
|
||||||
|
|
|
@ -151,6 +151,9 @@ type Options struct {
|
||||||
// Enables the in memory exemplar storage,.
|
// Enables the in memory exemplar storage,.
|
||||||
EnableExemplarStorage bool
|
EnableExemplarStorage bool
|
||||||
|
|
||||||
|
// Enables the snapshot of in-memory chunks on shutdown. This makes restarts faster.
|
||||||
|
EnableMemorySnapshotOnShutdown bool
|
||||||
|
|
||||||
// MaxExemplars sets the size, in # of exemplars stored, of the single circular buffer used to store exemplars in memory.
|
// MaxExemplars sets the size, in # of exemplars stored, of the single circular buffer used to store exemplars in memory.
|
||||||
// See tsdb/exemplar.go, specifically the CircularExemplarStorage struct and it's constructor NewCircularExemplarStorage.
|
// See tsdb/exemplar.go, specifically the CircularExemplarStorage struct and it's constructor NewCircularExemplarStorage.
|
||||||
MaxExemplars int64
|
MaxExemplars int64
|
||||||
|
@ -722,6 +725,7 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
|
||||||
headOpts.SeriesCallback = opts.SeriesLifecycleCallback
|
headOpts.SeriesCallback = opts.SeriesLifecycleCallback
|
||||||
headOpts.EnableExemplarStorage = opts.EnableExemplarStorage
|
headOpts.EnableExemplarStorage = opts.EnableExemplarStorage
|
||||||
headOpts.MaxExemplars.Store(opts.MaxExemplars)
|
headOpts.MaxExemplars.Store(opts.MaxExemplars)
|
||||||
|
headOpts.EnableMemorySnapshotOnShutdown = opts.EnableMemorySnapshotOnShutdown
|
||||||
db.head, err = NewHead(r, l, wlog, headOpts, stats.Head)
|
db.head, err = NewHead(r, l, wlog, headOpts, stats.Head)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
|
|
@ -5,3 +5,4 @@
|
||||||
* [Head Chunks](head_chunks.md)
|
* [Head Chunks](head_chunks.md)
|
||||||
* [Tombstones](tombstones.md)
|
* [Tombstones](tombstones.md)
|
||||||
* [Wal](wal.md)
|
* [Wal](wal.md)
|
||||||
|
* [Memory Snapshot](memory_snapshot.md)
|
||||||
|
|
62
tsdb/docs/format/memory_snapshot.md
Normal file
62
tsdb/docs/format/memory_snapshot.md
Normal file
|
@ -0,0 +1,62 @@
|
||||||
|
# Memory Snapshot Format
|
||||||
|
|
||||||
|
Memory snapshot uses the WAL package and writes each series as a WAL record.
|
||||||
|
Below are the formats of the individual records.
|
||||||
|
|
||||||
|
### Series records
|
||||||
|
|
||||||
|
This record is a snapshot of a single series. Only one series exists per record.
|
||||||
|
It includes the metadata of the series and the in-memory chunk data if it exists.
|
||||||
|
The sampleBuf is the last 4 samples in the in-memory chunk.
|
||||||
|
|
||||||
|
```
|
||||||
|
┌──────────────────────────┬────────────────────────────┐
|
||||||
|
│ Record Type <byte> │ Series Ref <uint64> │
|
||||||
|
├──────────────────────────┴────────────────────────────┤
|
||||||
|
│ Number of Labels <uvarint> │
|
||||||
|
├──────────────────────────────┬────────────────────────┤
|
||||||
|
│ len(name_1) <uvarint> │ name_1 <bytes> │
|
||||||
|
├──────────────────────────────┼────────────────────────┤
|
||||||
|
│ len(val_1) <uvarint> │ val_1 <bytes> │
|
||||||
|
├──────────────────────────────┴────────────────────────┤
|
||||||
|
│ . . . │
|
||||||
|
├──────────────────────────────┬────────────────────────┤
|
||||||
|
│ len(name_N) <uvarint> │ name_N <bytes> │
|
||||||
|
├──────────────────────────────┼────────────────────────┤
|
||||||
|
│ len(val_N) <uvarint> │ val_N <bytes> │
|
||||||
|
├──────────────────────────────┴────────────────────────┤
|
||||||
|
│ Chunk Range <int64> │
|
||||||
|
├───────────────────────────────────────────────────────┤
|
||||||
|
│ Chunk Exists <uvarint> │
|
||||||
|
│ # 1 if head chunk exists, 0 otherwise to detect a nil |
|
||||||
|
| # chunk. Below fields exists only when it's 1 here. |
|
||||||
|
├───────────────────────────┬───────────────────────────┤
|
||||||
|
│ Chunk Mint <int64> │ Chunk Maxt <int64> │
|
||||||
|
├───────────────────────────┴───────────────────────────┤
|
||||||
|
│ Chunk Encoding <byte> │
|
||||||
|
├──────────────────────────────┬────────────────────────┤
|
||||||
|
│ len(Chunk) <uvarint> │ Chunk <bytes> │
|
||||||
|
├──────────────────────────┬───┴────────────────────────┤
|
||||||
|
| sampleBuf[0].t <int64> | sampleBuf[0].v <float64> |
|
||||||
|
├──────────────────────────┼────────────────────────────┤
|
||||||
|
| sampleBuf[1].t <int64> | sampleBuf[1].v <float64> |
|
||||||
|
├──────────────────────────┼────────────────────────────┤
|
||||||
|
| sampleBuf[2].t <int64> | sampleBuf[2].v <float64> |
|
||||||
|
├──────────────────────────┼────────────────────────────┤
|
||||||
|
| sampleBuf[3].t <int64> | sampleBuf[3].v <float64> |
|
||||||
|
└──────────────────────────┴────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
### Tombstone record
|
||||||
|
|
||||||
|
This includes all the tombstones in the Head block. A single record is written into
|
||||||
|
the snapshot for all the tombstones. The encoded tombstones uses the same encoding
|
||||||
|
as tombstone file in blocks.
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────────┐
|
||||||
|
│ Record Type <byte> │
|
||||||
|
├───────────────────────────────────┬─────────────────────────────┤
|
||||||
|
│ len(Encoded Tombstones) <uvarint> │ Encoded Tombstones <bytes> │
|
||||||
|
└───────────────────────────────────┴─────────────────────────────┘
|
||||||
|
```
|
|
@ -17,6 +17,7 @@ import (
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"hash"
|
"hash"
|
||||||
"hash/crc32"
|
"hash/crc32"
|
||||||
|
"math"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
|
|
||||||
"github.com/dennwc/varint"
|
"github.com/dennwc/varint"
|
||||||
|
@ -40,6 +41,7 @@ func (e *Encbuf) Len() int { return len(e.B) }
|
||||||
|
|
||||||
func (e *Encbuf) PutString(s string) { e.B = append(e.B, s...) }
|
func (e *Encbuf) PutString(s string) { e.B = append(e.B, s...) }
|
||||||
func (e *Encbuf) PutByte(c byte) { e.B = append(e.B, c) }
|
func (e *Encbuf) PutByte(c byte) { e.B = append(e.B, c) }
|
||||||
|
func (e *Encbuf) PutBytes(b []byte) { e.B = append(e.B, b...) }
|
||||||
|
|
||||||
func (e *Encbuf) PutBE32int(x int) { e.PutBE32(uint32(x)) }
|
func (e *Encbuf) PutBE32int(x int) { e.PutBE32(uint32(x)) }
|
||||||
func (e *Encbuf) PutUvarint32(x uint32) { e.PutUvarint64(uint64(x)) }
|
func (e *Encbuf) PutUvarint32(x uint32) { e.PutUvarint64(uint64(x)) }
|
||||||
|
@ -56,6 +58,10 @@ func (e *Encbuf) PutBE64(x uint64) {
|
||||||
e.B = append(e.B, e.C[:8]...)
|
e.B = append(e.B, e.C[:8]...)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (e *Encbuf) PutBEFloat64(x float64) {
|
||||||
|
e.PutBE64(math.Float64bits(x))
|
||||||
|
}
|
||||||
|
|
||||||
func (e *Encbuf) PutUvarint64(x uint64) {
|
func (e *Encbuf) PutUvarint64(x uint64) {
|
||||||
n := binary.PutUvarint(e.C[:], x)
|
n := binary.PutUvarint(e.C[:], x)
|
||||||
e.B = append(e.B, e.C[:n]...)
|
e.B = append(e.B, e.C[:n]...)
|
||||||
|
@ -73,6 +79,12 @@ func (e *Encbuf) PutUvarintStr(s string) {
|
||||||
e.PutString(s)
|
e.PutString(s)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PutUvarintBytes writes a a variable length byte buffer.
|
||||||
|
func (e *Encbuf) PutUvarintBytes(b []byte) {
|
||||||
|
e.PutUvarint(len(b))
|
||||||
|
e.PutBytes(b)
|
||||||
|
}
|
||||||
|
|
||||||
// PutHash appends a hash over the buffers current contents to the buffer.
|
// PutHash appends a hash over the buffers current contents to the buffer.
|
||||||
func (e *Encbuf) PutHash(h hash.Hash) {
|
func (e *Encbuf) PutHash(h hash.Hash) {
|
||||||
h.Reset()
|
h.Reset()
|
||||||
|
@ -249,6 +261,10 @@ func (d *Decbuf) Be64() uint64 {
|
||||||
return x
|
return x
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (d *Decbuf) Be64Float64() float64 {
|
||||||
|
return math.Float64frombits(d.Be64())
|
||||||
|
}
|
||||||
|
|
||||||
func (d *Decbuf) Be32() uint32 {
|
func (d *Decbuf) Be32() uint32 {
|
||||||
if d.E != nil {
|
if d.E != nil {
|
||||||
return 0
|
return 0
|
||||||
|
|
96
tsdb/head.go
96
tsdb/head.go
|
@ -97,6 +97,8 @@ type Head struct {
|
||||||
// chunkDiskMapper is used to write and read Head chunks to/from disk.
|
// chunkDiskMapper is used to write and read Head chunks to/from disk.
|
||||||
chunkDiskMapper *chunks.ChunkDiskMapper
|
chunkDiskMapper *chunks.ChunkDiskMapper
|
||||||
|
|
||||||
|
chunkSnapshotMtx sync.Mutex
|
||||||
|
|
||||||
closedMtx sync.Mutex
|
closedMtx sync.Mutex
|
||||||
closed bool
|
closed bool
|
||||||
|
|
||||||
|
@ -122,9 +124,10 @@ type HeadOptions struct {
|
||||||
// StripeSize sets the number of entries in the hash map, it must be a power of 2.
|
// StripeSize sets the number of entries in the hash map, it must be a power of 2.
|
||||||
// A larger StripeSize will allocate more memory up-front, but will increase performance when handling a large number of series.
|
// A larger StripeSize will allocate more memory up-front, but will increase performance when handling a large number of series.
|
||||||
// A smaller StripeSize reduces the memory allocated, but can decrease performance with large number of series.
|
// A smaller StripeSize reduces the memory allocated, but can decrease performance with large number of series.
|
||||||
StripeSize int
|
StripeSize int
|
||||||
SeriesCallback SeriesLifecycleCallback
|
SeriesCallback SeriesLifecycleCallback
|
||||||
EnableExemplarStorage bool
|
EnableExemplarStorage bool
|
||||||
|
EnableMemorySnapshotOnShutdown bool
|
||||||
|
|
||||||
// Runtime reloadable options.
|
// Runtime reloadable options.
|
||||||
MaxExemplars atomic.Int64
|
MaxExemplars atomic.Int64
|
||||||
|
@ -439,11 +442,25 @@ func (h *Head) Init(minValidTime int64) error {
|
||||||
h.minValidTime.Store(minValidTime)
|
h.minValidTime.Store(minValidTime)
|
||||||
defer h.postings.EnsureOrder()
|
defer h.postings.EnsureOrder()
|
||||||
defer h.gc() // After loading the wal remove the obsolete data from the head.
|
defer h.gc() // After loading the wal remove the obsolete data from the head.
|
||||||
|
defer func() {
|
||||||
|
// Loading of m-mapped chunks and snapshot can make the mint of the Head
|
||||||
|
// to go below minValidTime.
|
||||||
|
if h.MinTime() < h.minValidTime.Load() {
|
||||||
|
h.minTime.Store(h.minValidTime.Load())
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
level.Info(h.logger).Log("msg", "Replaying on-disk memory mappable chunks if any")
|
level.Info(h.logger).Log("msg", "Replaying on-disk memory mappable chunks if any")
|
||||||
start := time.Now()
|
start := time.Now()
|
||||||
|
|
||||||
mmappedChunks, err := h.loadMmappedChunks()
|
snapIdx, snapOffset, refSeries, err := h.loadChunkSnapshot()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
level.Info(h.logger).Log("msg", "Chunk snapshot loading time", "duration", time.Since(start).String())
|
||||||
|
|
||||||
|
mmapChunkReplayStart := time.Now()
|
||||||
|
mmappedChunks, err := h.loadMmappedChunks(refSeries)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
level.Error(h.logger).Log("msg", "Loading on-disk chunks failed", "err", err)
|
level.Error(h.logger).Log("msg", "Loading on-disk chunks failed", "err", err)
|
||||||
if _, ok := errors.Cause(err).(*chunks.CorruptionErr); ok {
|
if _, ok := errors.Cause(err).(*chunks.CorruptionErr); ok {
|
||||||
|
@ -451,10 +468,10 @@ func (h *Head) Init(minValidTime int64) error {
|
||||||
}
|
}
|
||||||
// If this fails, data will be recovered from WAL.
|
// If this fails, data will be recovered from WAL.
|
||||||
// Hence we wont lose any data (given WAL is not corrupt).
|
// Hence we wont lose any data (given WAL is not corrupt).
|
||||||
mmappedChunks = h.removeCorruptedMmappedChunks(err)
|
mmappedChunks = h.removeCorruptedMmappedChunks(err, refSeries)
|
||||||
}
|
}
|
||||||
|
|
||||||
level.Info(h.logger).Log("msg", "On-disk memory mappable chunks replay completed", "duration", time.Since(start).String())
|
level.Info(h.logger).Log("msg", "On-disk memory mappable chunks replay completed", "duration", time.Since(mmapChunkReplayStart).String())
|
||||||
if h.wal == nil {
|
if h.wal == nil {
|
||||||
level.Info(h.logger).Log("msg", "WAL not found")
|
level.Info(h.logger).Log("msg", "WAL not found")
|
||||||
return nil
|
return nil
|
||||||
|
@ -502,6 +519,9 @@ func (h *Head) Init(minValidTime int64) error {
|
||||||
|
|
||||||
walReplayStart := time.Now()
|
walReplayStart := time.Now()
|
||||||
|
|
||||||
|
if snapIdx > startFrom {
|
||||||
|
startFrom = snapIdx
|
||||||
|
}
|
||||||
// Backfill segments from the most recent checkpoint onwards.
|
// Backfill segments from the most recent checkpoint onwards.
|
||||||
for i := startFrom; i <= endAt; i++ {
|
for i := startFrom; i <= endAt; i++ {
|
||||||
s, err := wal.OpenReadSegment(wal.SegmentName(h.wal.Dir(), i))
|
s, err := wal.OpenReadSegment(wal.SegmentName(h.wal.Dir(), i))
|
||||||
|
@ -509,7 +529,14 @@ func (h *Head) Init(minValidTime int64) error {
|
||||||
return errors.Wrap(err, fmt.Sprintf("open WAL segment: %d", i))
|
return errors.Wrap(err, fmt.Sprintf("open WAL segment: %d", i))
|
||||||
}
|
}
|
||||||
|
|
||||||
sr := wal.NewSegmentBufReader(s)
|
offset := 0
|
||||||
|
if i == snapIdx {
|
||||||
|
offset = snapOffset
|
||||||
|
}
|
||||||
|
sr, err := wal.NewSegmentBufReaderWithOffset(offset, s)
|
||||||
|
if err != nil {
|
||||||
|
return errors.Wrapf(err, "segment reader (offset=%d)", offset)
|
||||||
|
}
|
||||||
err = h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks)
|
err = h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks)
|
||||||
if err := sr.Close(); err != nil {
|
if err := sr.Close(); err != nil {
|
||||||
level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
|
level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
|
||||||
|
@ -533,29 +560,49 @@ func (h *Head) Init(minValidTime int64) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *Head) loadMmappedChunks() (map[uint64][]*mmappedChunk, error) {
|
func (h *Head) loadMmappedChunks(refSeries map[uint64]*memSeries) (map[uint64][]*mmappedChunk, error) {
|
||||||
mmappedChunks := map[uint64][]*mmappedChunk{}
|
mmappedChunks := map[uint64][]*mmappedChunk{}
|
||||||
if err := h.chunkDiskMapper.IterateAllChunks(func(seriesRef, chunkRef uint64, mint, maxt int64, numSamples uint16) error {
|
if err := h.chunkDiskMapper.IterateAllChunks(func(seriesRef, chunkRef uint64, mint, maxt int64, numSamples uint16) error {
|
||||||
if maxt < h.minValidTime.Load() {
|
if maxt < h.minValidTime.Load() {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
ms, ok := refSeries[seriesRef]
|
||||||
slice := mmappedChunks[seriesRef]
|
if !ok {
|
||||||
if len(slice) > 0 {
|
slice := mmappedChunks[seriesRef]
|
||||||
if slice[len(slice)-1].maxTime >= mint {
|
if len(slice) > 0 && slice[len(slice)-1].maxTime >= mint {
|
||||||
return &chunks.CorruptionErr{
|
return errors.Errorf("out of sequence m-mapped chunk for series ref %d", seriesRef)
|
||||||
Err: errors.Errorf("out of sequence m-mapped chunk for series ref %d", seriesRef),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
slice = append(slice, &mmappedChunk{
|
||||||
|
ref: chunkRef,
|
||||||
|
minTime: mint,
|
||||||
|
maxTime: maxt,
|
||||||
|
numSamples: numSamples,
|
||||||
|
})
|
||||||
|
mmappedChunks[seriesRef] = slice
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
slice = append(slice, &mmappedChunk{
|
if len(ms.mmappedChunks) > 0 && ms.mmappedChunks[len(ms.mmappedChunks)-1].maxTime >= mint {
|
||||||
|
return errors.Errorf("out of sequence m-mapped chunk for series ref %d", seriesRef)
|
||||||
|
}
|
||||||
|
|
||||||
|
h.metrics.chunks.Inc()
|
||||||
|
h.metrics.chunksCreated.Inc()
|
||||||
|
ms.mmappedChunks = append(ms.mmappedChunks, &mmappedChunk{
|
||||||
ref: chunkRef,
|
ref: chunkRef,
|
||||||
minTime: mint,
|
minTime: mint,
|
||||||
maxTime: maxt,
|
maxTime: maxt,
|
||||||
numSamples: numSamples,
|
numSamples: numSamples,
|
||||||
})
|
})
|
||||||
mmappedChunks[seriesRef] = slice
|
h.updateMinMaxTime(mint, maxt)
|
||||||
|
if ms.headChunk != nil && maxt >= ms.headChunk.minTime {
|
||||||
|
// The head chunk was completed and was m-mapped after taking the snapshot.
|
||||||
|
// Hence remove this chunk.
|
||||||
|
ms.nextAt = 0
|
||||||
|
ms.headChunk = nil
|
||||||
|
ms.app = nil
|
||||||
|
}
|
||||||
return nil
|
return nil
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
return nil, errors.Wrap(err, "iterate on on-disk chunks")
|
return nil, errors.Wrap(err, "iterate on on-disk chunks")
|
||||||
|
@ -565,7 +612,7 @@ func (h *Head) loadMmappedChunks() (map[uint64][]*mmappedChunk, error) {
|
||||||
|
|
||||||
// removeCorruptedMmappedChunks attempts to delete the corrupted mmapped chunks and if it fails, it clears all the previously
|
// removeCorruptedMmappedChunks attempts to delete the corrupted mmapped chunks and if it fails, it clears all the previously
|
||||||
// loaded mmapped chunks.
|
// loaded mmapped chunks.
|
||||||
func (h *Head) removeCorruptedMmappedChunks(err error) map[uint64][]*mmappedChunk {
|
func (h *Head) removeCorruptedMmappedChunks(err error, refSeries map[uint64]*memSeries) map[uint64][]*mmappedChunk {
|
||||||
level.Info(h.logger).Log("msg", "Deleting mmapped chunk files")
|
level.Info(h.logger).Log("msg", "Deleting mmapped chunk files")
|
||||||
|
|
||||||
if err := h.chunkDiskMapper.DeleteCorrupted(err); err != nil {
|
if err := h.chunkDiskMapper.DeleteCorrupted(err); err != nil {
|
||||||
|
@ -574,7 +621,7 @@ func (h *Head) removeCorruptedMmappedChunks(err error) map[uint64][]*mmappedChun
|
||||||
}
|
}
|
||||||
|
|
||||||
level.Info(h.logger).Log("msg", "Deletion of mmap chunk files successful, reattempting m-mapping the on-disk chunks")
|
level.Info(h.logger).Log("msg", "Deletion of mmap chunk files successful, reattempting m-mapping the on-disk chunks")
|
||||||
mmappedChunks, err := h.loadMmappedChunks()
|
mmappedChunks, err := h.loadMmappedChunks(refSeries)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
level.Error(h.logger).Log("msg", "Loading on-disk chunks failed, discarding chunk files completely", "err", err)
|
level.Error(h.logger).Log("msg", "Loading on-disk chunks failed, discarding chunk files completely", "err", err)
|
||||||
mmappedChunks = map[uint64][]*mmappedChunk{}
|
mmappedChunks = map[uint64][]*mmappedChunk{}
|
||||||
|
@ -661,6 +708,9 @@ func (h *Head) Truncate(mint int64) (err error) {
|
||||||
|
|
||||||
// truncateMemory removes old data before mint from the head.
|
// truncateMemory removes old data before mint from the head.
|
||||||
func (h *Head) truncateMemory(mint int64) (err error) {
|
func (h *Head) truncateMemory(mint int64) (err error) {
|
||||||
|
h.chunkSnapshotMtx.Lock()
|
||||||
|
defer h.chunkSnapshotMtx.Unlock()
|
||||||
|
|
||||||
defer func() {
|
defer func() {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
h.metrics.headTruncateFail.Inc()
|
h.metrics.headTruncateFail.Inc()
|
||||||
|
@ -796,6 +846,9 @@ func (h *Head) IsQuerierCollidingWithTruncation(querierMint, querierMaxt int64)
|
||||||
|
|
||||||
// truncateWAL removes old data before mint from the WAL.
|
// truncateWAL removes old data before mint from the WAL.
|
||||||
func (h *Head) truncateWAL(mint int64) error {
|
func (h *Head) truncateWAL(mint int64) error {
|
||||||
|
h.chunkSnapshotMtx.Lock()
|
||||||
|
defer h.chunkSnapshotMtx.Unlock()
|
||||||
|
|
||||||
if h.wal == nil || mint <= h.lastWALTruncationTime.Load() {
|
if h.wal == nil || mint <= h.lastWALTruncationTime.Load() {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
@ -1095,15 +1148,20 @@ func (h *Head) compactable() bool {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close flushes the WAL and closes the head.
|
// Close flushes the WAL and closes the head.
|
||||||
|
// It also takes a snapshot of in-memory chunks if enabled.
|
||||||
func (h *Head) Close() error {
|
func (h *Head) Close() error {
|
||||||
h.closedMtx.Lock()
|
h.closedMtx.Lock()
|
||||||
defer h.closedMtx.Unlock()
|
defer h.closedMtx.Unlock()
|
||||||
h.closed = true
|
h.closed = true
|
||||||
errs := tsdb_errors.NewMulti(h.chunkDiskMapper.Close())
|
errs := tsdb_errors.NewMulti(h.chunkDiskMapper.Close())
|
||||||
|
if errs.Err() == nil && h.opts.EnableMemorySnapshotOnShutdown {
|
||||||
|
errs.Add(h.performChunkSnapshot())
|
||||||
|
}
|
||||||
if h.wal != nil {
|
if h.wal != nil {
|
||||||
errs.Add(h.wal.Close())
|
errs.Add(h.wal.Close())
|
||||||
}
|
}
|
||||||
return errs.Err()
|
return errs.Err()
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// String returns an human readable representation of the TSDB head. It's important to
|
// String returns an human readable representation of the TSDB head. It's important to
|
||||||
|
|
|
@ -468,12 +468,20 @@ func (s *memSeries) append(t int64, v float64, appendID uint64, chunkDiskMapper
|
||||||
c = s.cutNewHeadChunk(t, chunkDiskMapper)
|
c = s.cutNewHeadChunk(t, chunkDiskMapper)
|
||||||
chunkCreated = true
|
chunkCreated = true
|
||||||
}
|
}
|
||||||
numSamples := c.chunk.NumSamples()
|
|
||||||
|
|
||||||
// Out of order sample.
|
// Out of order sample.
|
||||||
if c.maxTime >= t {
|
if c.maxTime >= t {
|
||||||
return false, chunkCreated
|
return false, chunkCreated
|
||||||
}
|
}
|
||||||
|
|
||||||
|
numSamples := c.chunk.NumSamples()
|
||||||
|
if numSamples == 0 {
|
||||||
|
// It could be the new chunk created after reading the chunk snapshot,
|
||||||
|
// hence we fix the minTime of the chunk here.
|
||||||
|
c.minTime = t
|
||||||
|
s.nextAt = rangeForTimestamp(c.minTime, s.chunkRange)
|
||||||
|
}
|
||||||
|
|
||||||
// If we reach 25% of a chunk's desired sample count, predict an end time
|
// If we reach 25% of a chunk's desired sample count, predict an end time
|
||||||
// for this chunk that will try to make samples equally distributed within
|
// for this chunk that will try to make samples equally distributed within
|
||||||
// the remaining chunks in the current chunk range.
|
// the remaining chunks in the current chunk range.
|
||||||
|
|
|
@ -2465,3 +2465,163 @@ func TestWaitForPendingReadersInTimeRange(t *testing.T) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestChunkSnapshot(t *testing.T) {
|
||||||
|
head, _ := newTestHead(t, 120*4, false)
|
||||||
|
defer func() {
|
||||||
|
head.opts.EnableMemorySnapshotOnShutdown = false
|
||||||
|
require.NoError(t, head.Close())
|
||||||
|
}()
|
||||||
|
|
||||||
|
numSeries := 10
|
||||||
|
expSeries := make(map[string][]tsdbutil.Sample)
|
||||||
|
expTombstones := make(map[uint64]tombstones.Intervals)
|
||||||
|
{ // Initial data that goes into snapshot.
|
||||||
|
// Add some initial samples with >=1 m-map chunk.
|
||||||
|
app := head.Appender(context.Background())
|
||||||
|
for i := 1; i <= numSeries; i++ {
|
||||||
|
lbls := labels.Labels{labels.Label{Name: "foo", Value: fmt.Sprintf("bar%d", i)}}
|
||||||
|
lblStr := lbls.String()
|
||||||
|
// 240 samples should m-map at least 1 chunk.
|
||||||
|
for ts := int64(1); ts <= 240; ts++ {
|
||||||
|
val := rand.Float64()
|
||||||
|
expSeries[lblStr] = append(expSeries[lblStr], sample{ts, val})
|
||||||
|
_, err := app.Append(0, lbls, ts, val)
|
||||||
|
require.NoError(t, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
require.NoError(t, app.Commit())
|
||||||
|
|
||||||
|
// Add some tombstones.
|
||||||
|
var enc record.Encoder
|
||||||
|
for i := 1; i <= numSeries; i++ {
|
||||||
|
ref := uint64(i)
|
||||||
|
itvs := tombstones.Intervals{
|
||||||
|
{Mint: 1234, Maxt: 2345},
|
||||||
|
{Mint: 3456, Maxt: 4567},
|
||||||
|
}
|
||||||
|
for _, itv := range itvs {
|
||||||
|
expTombstones[ref].Add(itv)
|
||||||
|
}
|
||||||
|
head.tombstones.AddInterval(ref, itvs...)
|
||||||
|
err := head.wal.Log(enc.Tombstones([]tombstones.Stone{
|
||||||
|
{Ref: ref, Intervals: itvs},
|
||||||
|
}, nil))
|
||||||
|
require.NoError(t, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// These references should be the ones used for the snapshot.
|
||||||
|
wlast, woffset, err := head.wal.LastSegmentAndOffset()
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
{ // Creating snapshot and verifying it.
|
||||||
|
head.opts.EnableMemorySnapshotOnShutdown = true
|
||||||
|
require.NoError(t, head.Close()) // This will create a snapshot.
|
||||||
|
|
||||||
|
_, sidx, soffset, err := LastChunkSnapshot(head.opts.ChunkDirRoot)
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, wlast, sidx)
|
||||||
|
require.Equal(t, woffset, soffset)
|
||||||
|
}
|
||||||
|
|
||||||
|
{ // Test the replay of snapshot.
|
||||||
|
// Create new Head which should replay this snapshot.
|
||||||
|
w, err := wal.NewSize(nil, nil, head.wal.Dir(), 32768, false)
|
||||||
|
require.NoError(t, err)
|
||||||
|
head, err = NewHead(nil, nil, w, head.opts, nil)
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.NoError(t, head.Init(math.MinInt64))
|
||||||
|
|
||||||
|
// Test query for snapshot replay.
|
||||||
|
q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64)
|
||||||
|
require.NoError(t, err)
|
||||||
|
series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", ".*"))
|
||||||
|
require.Equal(t, expSeries, series)
|
||||||
|
|
||||||
|
// Check the tombstones.
|
||||||
|
tr, err := head.Tombstones()
|
||||||
|
require.NoError(t, err)
|
||||||
|
actTombstones := make(map[uint64]tombstones.Intervals)
|
||||||
|
require.NoError(t, tr.Iter(func(ref uint64, itvs tombstones.Intervals) error {
|
||||||
|
for _, itv := range itvs {
|
||||||
|
actTombstones[ref].Add(itv)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}))
|
||||||
|
require.Equal(t, expTombstones, actTombstones)
|
||||||
|
}
|
||||||
|
|
||||||
|
{ // Additional data to only include in WAL and m-mapped chunks and not snapshot. This mimics having an old snapshot on disk.
|
||||||
|
|
||||||
|
// Add more samples.
|
||||||
|
app := head.Appender(context.Background())
|
||||||
|
for i := 1; i <= numSeries; i++ {
|
||||||
|
lbls := labels.Labels{labels.Label{Name: "foo", Value: fmt.Sprintf("bar%d", i)}}
|
||||||
|
lblStr := lbls.String()
|
||||||
|
// 240 samples should m-map at least 1 chunk.
|
||||||
|
for ts := int64(241); ts <= 480; ts++ {
|
||||||
|
val := rand.Float64()
|
||||||
|
expSeries[lblStr] = append(expSeries[lblStr], sample{ts, val})
|
||||||
|
_, err := app.Append(0, lbls, ts, val)
|
||||||
|
require.NoError(t, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
require.NoError(t, app.Commit())
|
||||||
|
|
||||||
|
// Add more tombstones.
|
||||||
|
var enc record.Encoder
|
||||||
|
for i := 1; i <= numSeries; i++ {
|
||||||
|
ref := uint64(i)
|
||||||
|
itvs := tombstones.Intervals{
|
||||||
|
{Mint: 12345, Maxt: 23456},
|
||||||
|
{Mint: 34567, Maxt: 45678},
|
||||||
|
}
|
||||||
|
for _, itv := range itvs {
|
||||||
|
expTombstones[ref].Add(itv)
|
||||||
|
}
|
||||||
|
head.tombstones.AddInterval(ref, itvs...)
|
||||||
|
err := head.wal.Log(enc.Tombstones([]tombstones.Stone{
|
||||||
|
{Ref: ref, Intervals: itvs},
|
||||||
|
}, nil))
|
||||||
|
require.NoError(t, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{ // Close Head and verify that new snapshot was not created.
|
||||||
|
head.opts.EnableMemorySnapshotOnShutdown = false
|
||||||
|
require.NoError(t, head.Close()) // This should not create a snapshot.
|
||||||
|
|
||||||
|
_, sidx, soffset, err := LastChunkSnapshot(head.opts.ChunkDirRoot)
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, wlast, sidx)
|
||||||
|
require.Equal(t, woffset, soffset)
|
||||||
|
}
|
||||||
|
|
||||||
|
{ // Test the replay of snapshot, m-map chunks, and WAL.
|
||||||
|
// Create new Head to replay snapshot, m-map chunks, and WAL.
|
||||||
|
w, err := wal.NewSize(nil, nil, head.wal.Dir(), 32768, false)
|
||||||
|
require.NoError(t, err)
|
||||||
|
head, err = NewHead(nil, nil, w, head.opts, nil)
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.NoError(t, head.Init(math.MinInt64))
|
||||||
|
|
||||||
|
// Test query when data is replayed from snapshot, m-map chunks, and WAL.
|
||||||
|
q, err := NewBlockQuerier(head, math.MinInt64, math.MaxInt64)
|
||||||
|
require.NoError(t, err)
|
||||||
|
series := query(t, q, labels.MustNewMatcher(labels.MatchRegexp, "foo", ".*"))
|
||||||
|
require.Equal(t, expSeries, series)
|
||||||
|
|
||||||
|
// Check the tombstones.
|
||||||
|
tr, err := head.Tombstones()
|
||||||
|
require.NoError(t, err)
|
||||||
|
actTombstones := make(map[uint64]tombstones.Intervals)
|
||||||
|
require.NoError(t, tr.Iter(func(ref uint64, itvs tombstones.Intervals) error {
|
||||||
|
for _, itv := range itvs {
|
||||||
|
actTombstones[ref].Add(itv)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}))
|
||||||
|
require.Equal(t, expTombstones, actTombstones)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
481
tsdb/head_wal.go
481
tsdb/head_wal.go
|
@ -15,8 +15,18 @@ package tsdb
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"github.com/prometheus/prometheus/pkg/labels"
|
||||||
|
"github.com/prometheus/prometheus/tsdb/chunkenc"
|
||||||
|
"github.com/prometheus/prometheus/tsdb/encoding"
|
||||||
|
tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
|
||||||
|
"github.com/prometheus/prometheus/tsdb/fileutil"
|
||||||
|
"io/ioutil"
|
||||||
"math"
|
"math"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
"runtime"
|
"runtime"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
@ -386,3 +396,474 @@ func (h *Head) processWALSamples(
|
||||||
|
|
||||||
return unknownRefs
|
return unknownRefs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const (
|
||||||
|
chunkSnapshotRecordTypeSeries uint8 = 1
|
||||||
|
chunkSnapshotRecordTypeTombstones uint8 = 2
|
||||||
|
)
|
||||||
|
|
||||||
|
type chunkSnapshotRecord struct {
|
||||||
|
ref uint64
|
||||||
|
lset labels.Labels
|
||||||
|
chunkRange int64
|
||||||
|
mc *memChunk
|
||||||
|
sampleBuf [4]sample
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *memSeries) encodeToSnapshotRecord(b []byte) []byte {
|
||||||
|
buf := encoding.Encbuf{B: b}
|
||||||
|
|
||||||
|
buf.PutByte(chunkSnapshotRecordTypeSeries)
|
||||||
|
buf.PutBE64(s.ref)
|
||||||
|
buf.PutUvarint(len(s.lset))
|
||||||
|
for _, l := range s.lset {
|
||||||
|
buf.PutUvarintStr(l.Name)
|
||||||
|
buf.PutUvarintStr(l.Value)
|
||||||
|
}
|
||||||
|
buf.PutBE64int64(s.chunkRange)
|
||||||
|
|
||||||
|
s.Lock()
|
||||||
|
if s.headChunk == nil {
|
||||||
|
buf.PutUvarint(0)
|
||||||
|
} else {
|
||||||
|
buf.PutUvarint(1)
|
||||||
|
buf.PutBE64int64(s.headChunk.minTime)
|
||||||
|
buf.PutBE64int64(s.headChunk.maxTime)
|
||||||
|
buf.PutByte(byte(s.headChunk.chunk.Encoding()))
|
||||||
|
buf.PutUvarintBytes(s.headChunk.chunk.Bytes())
|
||||||
|
// Put the sample buf.
|
||||||
|
for _, smpl := range s.sampleBuf {
|
||||||
|
buf.PutBE64int64(smpl.t)
|
||||||
|
buf.PutBEFloat64(smpl.v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
s.Unlock()
|
||||||
|
|
||||||
|
return buf.Get()
|
||||||
|
}
|
||||||
|
|
||||||
|
func decodeSeriesFromChunkSnapshot(b []byte) (csr chunkSnapshotRecord, err error) {
|
||||||
|
dec := encoding.Decbuf{B: b}
|
||||||
|
|
||||||
|
if flag := dec.Byte(); flag != chunkSnapshotRecordTypeSeries {
|
||||||
|
return csr, errors.Errorf("invalid record type %x", flag)
|
||||||
|
}
|
||||||
|
|
||||||
|
csr.ref = dec.Be64()
|
||||||
|
|
||||||
|
// The label set written to the disk is already sorted.
|
||||||
|
csr.lset = make(labels.Labels, dec.Uvarint())
|
||||||
|
for i := range csr.lset {
|
||||||
|
csr.lset[i].Name = dec.UvarintStr()
|
||||||
|
csr.lset[i].Value = dec.UvarintStr()
|
||||||
|
}
|
||||||
|
|
||||||
|
csr.chunkRange = dec.Be64int64()
|
||||||
|
if dec.Uvarint() == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
csr.mc = &memChunk{}
|
||||||
|
csr.mc.minTime = dec.Be64int64()
|
||||||
|
csr.mc.maxTime = dec.Be64int64()
|
||||||
|
enc := chunkenc.Encoding(dec.Byte())
|
||||||
|
|
||||||
|
// The underlying bytes gets re-used later, so make a copy.
|
||||||
|
chunkBytes := dec.UvarintBytes()
|
||||||
|
chunkBytesCopy := make([]byte, len(chunkBytes))
|
||||||
|
copy(chunkBytesCopy, chunkBytes)
|
||||||
|
|
||||||
|
chk, err := chunkenc.FromData(enc, chunkBytesCopy)
|
||||||
|
if err != nil {
|
||||||
|
return csr, errors.Wrap(err, "chunk from data")
|
||||||
|
}
|
||||||
|
csr.mc.chunk = chk
|
||||||
|
|
||||||
|
for i := range csr.sampleBuf {
|
||||||
|
csr.sampleBuf[i].t = dec.Be64int64()
|
||||||
|
csr.sampleBuf[i].v = dec.Be64Float64()
|
||||||
|
}
|
||||||
|
|
||||||
|
err = dec.Err()
|
||||||
|
if err != nil && len(dec.B) > 0 {
|
||||||
|
err = errors.Errorf("unexpected %d bytes left in entry", len(dec.B))
|
||||||
|
}
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func encodeTombstonesToSnapshotRecord(tr tombstones.Reader) ([]byte, error) {
|
||||||
|
buf := encoding.Encbuf{}
|
||||||
|
|
||||||
|
buf.PutByte(chunkSnapshotRecordTypeTombstones)
|
||||||
|
b, err := tombstones.Encode(tr)
|
||||||
|
if err != nil {
|
||||||
|
return nil, errors.Wrap(err, "encode tombstones")
|
||||||
|
}
|
||||||
|
buf.PutUvarintBytes(b)
|
||||||
|
|
||||||
|
return buf.Get(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func decodeTombstonesSnapshotRecord(b []byte) (tombstones.Reader, error) {
|
||||||
|
dec := encoding.Decbuf{B: b}
|
||||||
|
|
||||||
|
if flag := dec.Byte(); flag != chunkSnapshotRecordTypeTombstones {
|
||||||
|
return nil, errors.Errorf("invalid record type %x", flag)
|
||||||
|
}
|
||||||
|
|
||||||
|
tr, err := tombstones.Decode(dec.UvarintBytes())
|
||||||
|
return tr, errors.Wrap(err, "decode tombstones")
|
||||||
|
}
|
||||||
|
|
||||||
|
const chunkSnapshotPrefix = "chunk_snapshot."
|
||||||
|
|
||||||
|
// ChunkSnapshot creates a snapshot of all the series and tombstones in the head.
|
||||||
|
// It deletes the old chunk snapshots if the chunk snapshot creation is successful.
|
||||||
|
//
|
||||||
|
// The chunk snapshot is stored in a directory named chunk_snapshot.N.M and is written
|
||||||
|
// using the WAL package. N is the last WAL segment present during snapshotting and
|
||||||
|
// M is the offset in segment N upto which data was written.
|
||||||
|
func (h *Head) ChunkSnapshot() (*ChunkSnapshotStats, error) {
|
||||||
|
if h.wal == nil {
|
||||||
|
// If we are not storing any WAL, does not make sense to take a snapshot too.
|
||||||
|
level.Warn(h.logger).Log("msg", "skipping chunk snapshotting as WAL is disabled")
|
||||||
|
return &ChunkSnapshotStats{}, nil
|
||||||
|
}
|
||||||
|
h.chunkSnapshotMtx.Lock()
|
||||||
|
defer h.chunkSnapshotMtx.Unlock()
|
||||||
|
|
||||||
|
stats := &ChunkSnapshotStats{}
|
||||||
|
|
||||||
|
wlast, woffset, err := h.wal.LastSegmentAndOffset()
|
||||||
|
if err != nil && err != record.ErrNotFound {
|
||||||
|
return stats, errors.Wrap(err, "get last wal segment and offset")
|
||||||
|
}
|
||||||
|
|
||||||
|
_, cslast, csoffset, err := LastChunkSnapshot(h.opts.ChunkDirRoot)
|
||||||
|
if err != nil && err != record.ErrNotFound {
|
||||||
|
return stats, errors.Wrap(err, "find last chunk snapshot")
|
||||||
|
}
|
||||||
|
|
||||||
|
if wlast == cslast && woffset == csoffset {
|
||||||
|
// Nothing has been written to the WAL/Head since the last snapshot.
|
||||||
|
return stats, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
snapshotName := fmt.Sprintf(chunkSnapshotPrefix+"%06d.%010d", wlast, woffset)
|
||||||
|
|
||||||
|
cpdir := filepath.Join(h.opts.ChunkDirRoot, snapshotName)
|
||||||
|
cpdirtmp := cpdir + ".tmp"
|
||||||
|
stats.Dir = cpdir
|
||||||
|
|
||||||
|
if err := os.MkdirAll(cpdirtmp, 0777); err != nil {
|
||||||
|
return stats, errors.Wrap(err, "create chunk snapshot dir")
|
||||||
|
}
|
||||||
|
cp, err := wal.New(nil, nil, cpdirtmp, h.wal.CompressionEnabled())
|
||||||
|
if err != nil {
|
||||||
|
return stats, errors.Wrap(err, "open chunk snapshot")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensures that an early return caused by an error doesn't leave any tmp files.
|
||||||
|
defer func() {
|
||||||
|
cp.Close()
|
||||||
|
os.RemoveAll(cpdirtmp)
|
||||||
|
}()
|
||||||
|
|
||||||
|
var (
|
||||||
|
buf []byte
|
||||||
|
recs [][]byte
|
||||||
|
)
|
||||||
|
stripeSize := h.series.size
|
||||||
|
for i := 0; i < stripeSize; i++ {
|
||||||
|
h.series.locks[i].RLock()
|
||||||
|
|
||||||
|
for _, s := range h.series.series[i] {
|
||||||
|
start := len(buf)
|
||||||
|
buf = s.encodeToSnapshotRecord(buf)
|
||||||
|
if len(buf[start:]) == 0 {
|
||||||
|
continue // All contents discarded.
|
||||||
|
}
|
||||||
|
recs = append(recs, buf[start:])
|
||||||
|
// Flush records in 10 MB increments.
|
||||||
|
if len(buf) > 10*1024*1024 {
|
||||||
|
if err := cp.Log(recs...); err != nil {
|
||||||
|
h.series.locks[i].RUnlock()
|
||||||
|
return stats, errors.Wrap(err, "flush records")
|
||||||
|
}
|
||||||
|
buf, recs = buf[:0], recs[:0]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
stats.TotalSeries += len(h.series.series[i])
|
||||||
|
|
||||||
|
h.series.locks[i].RUnlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add tombstones to the snapshot.
|
||||||
|
tombstonesReader, err := h.Tombstones()
|
||||||
|
if err != nil {
|
||||||
|
return stats, errors.Wrap(err, "get tombstones")
|
||||||
|
}
|
||||||
|
rec, err := encodeTombstonesToSnapshotRecord(tombstonesReader)
|
||||||
|
if err != nil {
|
||||||
|
return stats, errors.Wrap(err, "encode tombstones")
|
||||||
|
}
|
||||||
|
recs = append(recs, rec)
|
||||||
|
|
||||||
|
// Flush remaining records.
|
||||||
|
if err := cp.Log(recs...); err != nil {
|
||||||
|
return stats, errors.Wrap(err, "flush records")
|
||||||
|
}
|
||||||
|
if err := cp.Close(); err != nil {
|
||||||
|
return stats, errors.Wrap(err, "close chunk snapshot")
|
||||||
|
}
|
||||||
|
if err := fileutil.Replace(cpdirtmp, cpdir); err != nil {
|
||||||
|
return stats, errors.Wrap(err, "rename chunk snapshot directory")
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := DeleteChunkSnapshots(h.opts.ChunkDirRoot, cslast, csoffset); err != nil {
|
||||||
|
// Leftover old chunk snapshots do not cause problems down the line beyond
|
||||||
|
// occupying disk space.
|
||||||
|
// They will just be ignored since a higher chunk snapshot exists.
|
||||||
|
level.Error(h.logger).Log("msg", "delete old chunk snapshots", "err", err)
|
||||||
|
}
|
||||||
|
return stats, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *Head) performChunkSnapshot() error {
|
||||||
|
level.Info(h.logger).Log("msg", "creating chunk snapshot")
|
||||||
|
startTime := time.Now()
|
||||||
|
stats, err := h.ChunkSnapshot()
|
||||||
|
elapsed := time.Since(startTime)
|
||||||
|
if err == nil {
|
||||||
|
level.Info(h.logger).Log("msg", "chunk snapshot complete", "duration", elapsed.String(), "num_series", stats.TotalSeries, "dir", stats.Dir)
|
||||||
|
}
|
||||||
|
return errors.Wrap(err, "chunk snapshot")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ChunkSnapshotStats returns stats about a created chunk snapshot.
|
||||||
|
type ChunkSnapshotStats struct {
|
||||||
|
TotalSeries int
|
||||||
|
Dir string
|
||||||
|
}
|
||||||
|
|
||||||
|
// LastChunkSnapshot returns the directory name and index of the most recent chunk snapshot.
|
||||||
|
// If dir does not contain any chunk snapshots, ErrNotFound is returned.
|
||||||
|
func LastChunkSnapshot(dir string) (string, int, int, error) {
|
||||||
|
files, err := ioutil.ReadDir(dir)
|
||||||
|
if err != nil {
|
||||||
|
return "", 0, 0, err
|
||||||
|
}
|
||||||
|
// Traverse list backwards since there may be multiple chunk snapshots left.
|
||||||
|
for i := len(files) - 1; i >= 0; i-- {
|
||||||
|
fi := files[i]
|
||||||
|
|
||||||
|
if !strings.HasPrefix(fi.Name(), chunkSnapshotPrefix) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !fi.IsDir() {
|
||||||
|
return "", 0, 0, errors.Errorf("chunk snapshot %s is not a directory", fi.Name())
|
||||||
|
}
|
||||||
|
|
||||||
|
splits := strings.Split(fi.Name()[len(chunkSnapshotPrefix):], ".")
|
||||||
|
if len(splits) != 2 {
|
||||||
|
return "", 0, 0, errors.Errorf("chunk snapshot %s is not in the right format", fi.Name())
|
||||||
|
}
|
||||||
|
|
||||||
|
idx, err := strconv.Atoi(splits[0])
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
offset, err := strconv.Atoi(splits[1])
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
return filepath.Join(dir, fi.Name()), idx, offset, nil
|
||||||
|
}
|
||||||
|
return "", 0, 0, record.ErrNotFound
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeleteChunkSnapshots deletes all chunk snapshots in a directory below a given index.
|
||||||
|
func DeleteChunkSnapshots(dir string, maxIndex, maxOffset int) error {
|
||||||
|
files, err := ioutil.ReadDir(dir)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
errs := tsdb_errors.NewMulti()
|
||||||
|
for _, fi := range files {
|
||||||
|
if !strings.HasPrefix(fi.Name(), chunkSnapshotPrefix) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
splits := strings.Split(fi.Name()[len(chunkSnapshotPrefix):], ".")
|
||||||
|
if len(splits) != 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
idx, err := strconv.Atoi(splits[0])
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
offset, err := strconv.Atoi(splits[1])
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if idx <= maxIndex && offset < maxOffset {
|
||||||
|
if err := os.RemoveAll(filepath.Join(dir, fi.Name())); err != nil {
|
||||||
|
errs.Add(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
return errs.Err()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *Head) loadChunkSnapshot() (int, int, map[uint64]*memSeries, error) {
|
||||||
|
dir, snapIdx, snapOffset, err := LastChunkSnapshot(h.opts.ChunkDirRoot)
|
||||||
|
if err != nil {
|
||||||
|
if err == record.ErrNotFound {
|
||||||
|
return snapIdx, snapOffset, nil, nil
|
||||||
|
}
|
||||||
|
return snapIdx, snapOffset, nil, errors.Wrap(err, "find last chunk snapshot")
|
||||||
|
}
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
|
sr, err := wal.NewSegmentsReader(dir)
|
||||||
|
if err != nil {
|
||||||
|
return snapIdx, snapOffset, nil, errors.Wrap(err, "open chunk snapshot")
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if err := sr.Close(); err != nil {
|
||||||
|
level.Warn(h.logger).Log("msg", "error while closing the wal segments reader", "err", err)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
var (
|
||||||
|
numSeries = 0
|
||||||
|
unknownRefs = int64(0)
|
||||||
|
n = runtime.GOMAXPROCS(0)
|
||||||
|
wg sync.WaitGroup
|
||||||
|
recordChan = make(chan chunkSnapshotRecord, 5*n)
|
||||||
|
shardedRefSeries = make([]map[uint64]*memSeries, n)
|
||||||
|
errChan = make(chan error, n)
|
||||||
|
)
|
||||||
|
|
||||||
|
wg.Add(n)
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
go func(idx int, rc <-chan chunkSnapshotRecord) {
|
||||||
|
defer wg.Done()
|
||||||
|
defer func() {
|
||||||
|
// If there was an error, drain the channel
|
||||||
|
// to unblock the main thread.
|
||||||
|
for range rc {
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
shardedRefSeries[idx] = make(map[uint64]*memSeries)
|
||||||
|
localRefSeries := shardedRefSeries[idx]
|
||||||
|
|
||||||
|
for csr := range rc {
|
||||||
|
series, _, err := h.getOrCreateWithID(csr.ref, csr.lset.Hash(), csr.lset)
|
||||||
|
if err != nil {
|
||||||
|
errChan <- err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
localRefSeries[csr.ref] = series
|
||||||
|
if h.lastSeriesID.Load() < series.ref {
|
||||||
|
h.lastSeriesID.Store(series.ref)
|
||||||
|
}
|
||||||
|
|
||||||
|
series.chunkRange = csr.chunkRange
|
||||||
|
if csr.mc == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
series.nextAt = csr.mc.maxTime // This will create a new chunk on append.
|
||||||
|
series.headChunk = csr.mc
|
||||||
|
for i := range series.sampleBuf {
|
||||||
|
series.sampleBuf[i].t = csr.sampleBuf[i].t
|
||||||
|
series.sampleBuf[i].v = csr.sampleBuf[i].v
|
||||||
|
}
|
||||||
|
|
||||||
|
app, err := series.headChunk.chunk.Appender()
|
||||||
|
if err != nil {
|
||||||
|
errChan <- err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
series.app = app
|
||||||
|
|
||||||
|
h.updateMinMaxTime(csr.mc.minTime, csr.mc.maxTime)
|
||||||
|
}
|
||||||
|
}(i, recordChan)
|
||||||
|
}
|
||||||
|
|
||||||
|
r := wal.NewReader(sr)
|
||||||
|
var loopErr error
|
||||||
|
Outer:
|
||||||
|
for r.Next() {
|
||||||
|
select {
|
||||||
|
case err := <-errChan:
|
||||||
|
errChan <- err
|
||||||
|
break Outer
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
|
rec := r.Record()
|
||||||
|
switch rec[0] {
|
||||||
|
case chunkSnapshotRecordTypeSeries:
|
||||||
|
numSeries++
|
||||||
|
csr, err := decodeSeriesFromChunkSnapshot(rec)
|
||||||
|
if err != nil {
|
||||||
|
loopErr = errors.Wrap(err, "decode series record")
|
||||||
|
break Outer
|
||||||
|
}
|
||||||
|
recordChan <- csr
|
||||||
|
|
||||||
|
case chunkSnapshotRecordTypeTombstones:
|
||||||
|
tr, err := decodeTombstonesSnapshotRecord(rec)
|
||||||
|
if err != nil {
|
||||||
|
loopErr = errors.Wrap(err, "decode tombstones")
|
||||||
|
break Outer
|
||||||
|
}
|
||||||
|
|
||||||
|
if err = tr.Iter(func(ref uint64, ivs tombstones.Intervals) error {
|
||||||
|
h.tombstones.AddInterval(ref, ivs...)
|
||||||
|
return nil
|
||||||
|
}); err != nil {
|
||||||
|
loopErr = errors.Wrap(err, "iterate tombstones")
|
||||||
|
break Outer
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
close(recordChan)
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
close(errChan)
|
||||||
|
merr := tsdb_errors.NewMulti(errors.Wrap(loopErr, "decode loop"))
|
||||||
|
for err := range errChan {
|
||||||
|
merr.Add(errors.Wrap(err, "record processing"))
|
||||||
|
}
|
||||||
|
if err := merr.Err(); err != nil {
|
||||||
|
return -1, -1, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
refSeries := make(map[uint64]*memSeries, numSeries)
|
||||||
|
for _, shard := range shardedRefSeries {
|
||||||
|
for k, v := range shard {
|
||||||
|
refSeries[k] = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
elapsed := time.Since(start)
|
||||||
|
level.Info(h.logger).Log("msg", "chunk snapshot loaded", "dir", dir, "num_series", numSeries, "duration", elapsed.String())
|
||||||
|
if unknownRefs > 0 {
|
||||||
|
level.Warn(h.logger).Log("msg", "unknown series references during chunk snapshot replay", "count", unknownRefs)
|
||||||
|
}
|
||||||
|
|
||||||
|
return snapIdx, snapOffset, refSeries, nil
|
||||||
|
}
|
||||||
|
|
|
@ -699,6 +699,22 @@ func (w *WAL) log(rec []byte, final bool) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LastSegmentAndOffset returns the last segment number of the WAL
|
||||||
|
// and the offset in that file upto which the segment has been filled.
|
||||||
|
func (w *WAL) LastSegmentAndOffset() (seg, offset int, err error) {
|
||||||
|
w.mtx.Lock()
|
||||||
|
defer w.mtx.Unlock()
|
||||||
|
|
||||||
|
_, seg, err = Segments(w.Dir())
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
offset = (w.donePages * pageSize) + w.page.alloc
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
// Truncate drops all segments before i.
|
// Truncate drops all segments before i.
|
||||||
func (w *WAL) Truncate(i int) (err error) {
|
func (w *WAL) Truncate(i int) (err error) {
|
||||||
w.metrics.truncateTotal.Inc()
|
w.metrics.truncateTotal.Inc()
|
||||||
|
@ -867,6 +883,21 @@ func NewSegmentBufReader(segs ...*Segment) *segmentBufReader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// nolint:golint
|
||||||
|
func NewSegmentBufReaderWithOffset(offset int, segs ...*Segment) (sbr *segmentBufReader, err error) {
|
||||||
|
if offset == 0 {
|
||||||
|
return NewSegmentBufReader(segs...), nil
|
||||||
|
}
|
||||||
|
sbr = &segmentBufReader{
|
||||||
|
buf: bufio.NewReaderSize(segs[0], 16*pageSize),
|
||||||
|
segs: segs,
|
||||||
|
}
|
||||||
|
if offset > 0 {
|
||||||
|
_, err = sbr.buf.Discard(offset)
|
||||||
|
}
|
||||||
|
return sbr, err
|
||||||
|
}
|
||||||
|
|
||||||
func (r *segmentBufReader) Close() (err error) {
|
func (r *segmentBufReader) Close() (err error) {
|
||||||
for _, s := range r.segs {
|
for _, s := range r.segs {
|
||||||
if e := s.Close(); e != nil {
|
if e := s.Close(); e != nil {
|
||||||
|
|
Loading…
Reference in a new issue