2017-04-10 11:59:45 -07:00
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
2016-11-15 01:34:25 -08:00
// Package tsdb implements a time series storage for float64 sample data.
package tsdb
import (
2018-12-05 08:34:42 -08:00
"context"
2016-12-04 04:16:11 -08:00
"fmt"
2017-02-27 01:46:15 -08:00
"io"
2022-04-27 02:24:36 -07:00
"io/fs"
2018-04-05 05:51:33 -07:00
"math"
2016-12-04 04:16:11 -08:00
"os"
2016-12-08 08:43:10 -08:00
"path/filepath"
2017-05-18 07:09:30 -07:00
"sort"
2016-12-14 23:31:26 -08:00
"strconv"
2018-04-05 05:51:33 -07:00
"strings"
2016-12-08 08:43:10 -08:00
"sync"
2017-01-06 06:18:06 -08:00
"time"
2016-12-14 23:31:26 -08:00
2021-06-11 09:17:59 -07:00
"github.com/go-kit/log"
"github.com/go-kit/log/level"
2017-05-18 07:09:30 -07:00
"github.com/oklog/ulid"
2017-01-03 06:43:26 -08:00
"github.com/pkg/errors"
2016-12-31 00:48:49 -08:00
"github.com/prometheus/client_golang/prometheus"
2022-06-22 04:45:21 -07:00
"go.uber.org/atomic"
2020-10-22 02:00:08 -07:00
"golang.org/x/sync/errgroup"
2021-07-19 21:52:57 -07:00
"github.com/prometheus/prometheus/config"
2021-11-08 06:23:17 -08:00
"github.com/prometheus/prometheus/model/labels"
2020-02-06 07:58:38 -08:00
"github.com/prometheus/prometheus/storage"
2019-08-13 01:34:14 -07:00
"github.com/prometheus/prometheus/tsdb/chunkenc"
2020-11-19 05:00:47 -08:00
"github.com/prometheus/prometheus/tsdb/chunks"
2019-08-13 01:34:14 -07:00
tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
"github.com/prometheus/prometheus/tsdb/fileutil"
2020-10-22 02:00:08 -07:00
_ "github.com/prometheus/prometheus/tsdb/goversion" // Load the package into main to make sure minium Go version is met.
2021-08-17 06:31:08 -07:00
"github.com/prometheus/prometheus/tsdb/hashcache"
"github.com/prometheus/prometheus/tsdb/index"
2021-11-11 08:45:25 -08:00
"github.com/prometheus/prometheus/tsdb/tsdbutil"
2022-10-10 08:08:46 -07:00
"github.com/prometheus/prometheus/tsdb/wlog"
2016-11-15 01:34:25 -08:00
)
2019-11-21 04:10:25 -08:00
const (
	// DefaultBlockDuration is the default duration of a block, expressed in
	// milliseconds (two hours).
	DefaultBlockDuration = int64(2*time.Hour) / int64(time.Millisecond)

	// Suffixes appended to block directories so that deletion and creation
	// operations are atomic.
	// Suffixes were chosen over writing meta.json last (or deleting it first)
	// because, in the error case, meta.json can still be recovered from the
	// block content within the local TSDB dir.
	// TODO(bwplotka): TSDB can end up with various .tmp files (e.g. meta.json.tmp,
	// WAL or segment tmp files). Think about removing those too on start to save
	// space. Currently only block tmp dirs are removed.
	tmpForDeletionBlockDirSuffix = ".tmp-for-deletion"
	tmpForCreationBlockDirSuffix = ".tmp-for-creation"

	// tmpLegacy is the pre-2.21 tmp dir suffix; it is used in clean-up functions.
	tmpLegacy = ".tmp"
)
2021-10-22 01:06:44 -07:00
// ErrNotReady is the sentinel error returned if the underlying storage is
// not ready yet.
var ErrNotReady = errors . New ( "TSDB not ready" )
2019-11-21 04:10:25 -08:00
2022-11-28 09:09:18 -08:00
// DefaultOptions used for the DB. They are reasonable for setups using
2018-03-02 03:12:32 -08:00
// millisecond precision timestamps.
2020-02-06 07:58:38 -08:00
func DefaultOptions ( ) * Options {
return & Options {
2023-03-21 22:40:11 -07:00
WALSegmentSize : wlog . DefaultSegmentSize ,
MaxBlockChunkSegmentSize : chunks . DefaultChunkSegmentSize ,
RetentionDuration : int64 ( 15 * 24 * time . Hour / time . Millisecond ) ,
MinBlockDuration : DefaultBlockDuration ,
MaxBlockDuration : DefaultBlockDuration ,
NoLockfile : false ,
AllowOverlappingCompaction : true ,
WALCompression : false ,
StripeSize : DefaultStripeSize ,
HeadChunksWriteBufferSize : chunks . DefaultWriteBufferSize ,
IsolationDisabled : defaultIsolationDisabled ,
HeadChunksEndTimeVariance : 0 ,
HeadChunksWriteQueueSize : chunks . DefaultWriteQueueSize ,
OutOfOrderCapMax : DefaultOutOfOrderCapMax ,
HeadPostingsForMatchersCacheTTL : defaultPostingsForMatchersCacheTTL ,
HeadPostingsForMatchersCacheSize : defaultPostingsForMatchersCacheSize ,
HeadPostingsForMatchersCacheForce : false ,
BlockPostingsForMatchersCacheTTL : defaultPostingsForMatchersCacheTTL ,
BlockPostingsForMatchersCacheSize : defaultPostingsForMatchersCacheSize ,
BlockPostingsForMatchersCacheForce : false ,
2020-02-06 07:58:38 -08:00
}
2016-11-15 01:34:25 -08:00
}
// Options of the DB storage.
type Options struct {
2019-03-25 16:38:12 -07:00
// Segments (wal files) max size.
// WALSegmentSize = 0, segment size is default size.
// WALSegmentSize > 0, segment size is WALSegmentSize.
// WALSegmentSize < 0, wal is disabled.
2020-02-11 08:34:09 -08:00
WALSegmentSize int
2017-01-28 23:11:47 -08:00
2021-04-15 01:55:01 -07:00
// MaxBlockChunkSegmentSize is the max size of block chunk segment files.
// MaxBlockChunkSegmentSize = 0, chunk segment size is default size.
// MaxBlockChunkSegmentSize > 0, chunk segment size is MaxBlockChunkSegmentSize.
MaxBlockChunkSegmentSize int64
2017-02-09 17:54:26 -08:00
// Duration of persisted data to keep.
2020-02-11 08:34:09 -08:00
// Unit agnostic as long as unit is consistent with MinBlockDuration and MaxBlockDuration.
// Typically it is in milliseconds.
RetentionDuration int64
2017-02-09 17:54:26 -08:00
2019-01-16 02:03:52 -08:00
// Maximum number of bytes in blocks to be retained.
// 0 or less means disabled.
// NOTE: For proper storage calculations need to consider
// the size of the WAL folder which is not added when calculating
// the current size of the database.
2020-02-11 08:34:09 -08:00
MaxBytes int64
2017-01-28 23:11:47 -08:00
2017-05-09 03:52:47 -07:00
// NoLockfile disables creation and consideration of a lock file.
NoLockfile bool
2019-02-26 11:50:37 -08:00
2022-06-09 12:11:42 -07:00
// Compaction of overlapping blocks are allowed if AllowOverlappingCompaction is true.
// This is an optional flag for overlapping blocks.
2022-09-20 10:05:50 -07:00
// The reason why this flag exists is because there are various users of the TSDB
// that do not want vertical compaction happening on ingest time. Instead,
// they'd rather keep overlapping blocks and let another component do the overlapping compaction later.
2022-09-28 06:47:54 -07:00
// For Prometheus, this will always be true.
2022-06-09 12:11:42 -07:00
AllowOverlappingCompaction bool
2019-06-19 06:46:24 -07:00
// WALCompression will turn on Snappy compression for records on the WAL.
WALCompression bool
2020-01-29 23:12:43 -08:00
2023-03-07 08:41:33 -08:00
// Maximum number of CPUs that can simultaneously processes WAL replay.
// If it is <=0, then GOMAXPROCS is used.
WALReplayConcurrency int
2020-02-17 03:45:11 -08:00
// StripeSize is the size in entries of the series hash map. Reducing the size will save memory but impact performance.
2020-01-29 23:12:43 -08:00
StripeSize int
2016-11-15 01:34:25 -08:00
2020-02-06 07:58:38 -08:00
// The timestamp range of head blocks after which they get persisted.
// It's the minimum duration of any persisted block.
2020-02-11 08:34:09 -08:00
// Unit agnostic as long as unit is consistent with RetentionDuration and MaxBlockDuration.
// Typically it is in milliseconds.
MinBlockDuration int64
2020-02-06 07:58:38 -08:00
// The maximum timestamp range of compacted blocks.
2020-02-11 08:34:09 -08:00
// Unit agnostic as long as unit is consistent with MinBlockDuration and RetentionDuration.
// Typically it is in milliseconds.
MaxBlockDuration int64
2020-05-20 06:22:08 -07:00
2020-11-19 05:00:47 -08:00
// HeadChunksWriteBufferSize configures the write buffer size used by the head chunks mapper.
HeadChunksWriteBufferSize int
2021-11-16 02:05:07 -08:00
// HeadChunksEndTimeVariance is how much variance (between 0 and 1) should be applied to the chunk end time,
// to spread chunks writing across time. Doesn't apply to the last chunk of the chunk range. 0 to disable variance.
HeadChunksEndTimeVariance float64
2021-12-07 23:44:26 -08:00
// HeadChunksWriteQueueSize configures the size of the chunk write queue used in the head chunks mapper.
HeadChunksWriteQueueSize int
2020-05-20 06:22:08 -07:00
// SeriesLifecycleCallback specifies a list of callbacks that will be called during a lifecycle of a series.
// It is always a no-op in Prometheus and mainly meant for external users who import TSDB.
SeriesLifecycleCallback SeriesLifecycleCallback
2020-07-22 08:19:33 -07:00
// BlocksToDelete is a function which returns the blocks which can be deleted.
// It is always the default time and size based retention in Prometheus and
// mainly meant for external users who import TSDB.
BlocksToDelete BlocksToDeleteFunc
2021-03-16 02:47:45 -07:00
2021-11-19 02:11:32 -08:00
// Enables the in memory exemplar storage.
2021-07-19 21:52:57 -07:00
EnableExemplarStorage bool
2021-08-06 09:51:01 -07:00
// Enables the snapshot of in-memory chunks on shutdown. This makes restarts faster.
EnableMemorySnapshotOnShutdown bool
2021-03-16 02:47:45 -07:00
// MaxExemplars sets the size, in # of exemplars stored, of the single circular buffer used to store exemplars in memory.
// See tsdb/exemplar.go, specifically the CircularExemplarStorage struct and it's constructor NewCircularExemplarStorage.
2021-07-19 21:52:57 -07:00
MaxExemplars int64
2021-08-17 06:31:08 -07:00
2021-11-19 02:11:32 -08:00
// Disables isolation between reads and in-flight appends.
IsolationDisabled bool
2021-11-19 03:11:26 -08:00
2021-08-17 06:31:08 -07:00
// SeriesHashCache specifies the series hash cache used when querying shards via Querier.Select().
// If nil, the cache won't be used.
SeriesHashCache * hashcache . SeriesHashCache
2022-04-19 12:08:58 -07:00
2022-09-14 05:08:34 -07:00
// EnableNativeHistograms enables the ingestion of native histograms.
EnableNativeHistograms bool
2022-10-05 13:14:49 -07:00
2022-06-24 03:09:28 -07:00
// OutOfOrderTimeWindow specifies how much out of order is allowed, if any.
2022-06-22 04:45:21 -07:00
// This can change during run-time, so this value from here should only be used
// while initialising.
2022-06-24 03:09:28 -07:00
OutOfOrderTimeWindow int64
2022-06-22 04:45:21 -07:00
// OutOfOrderCapMax is maximum capacity for OOO chunks (in samples).
// If it is <=0, the default value is assumed.
OutOfOrderCapMax int64
2022-12-28 04:13:54 -08:00
// HeadPostingsForMatchersCacheTTL is the TTL of the postings for matchers cache in the Head.
// If it's 0, the cache will only deduplicate in-flight requests, deleting the results once the first request has finished.
HeadPostingsForMatchersCacheTTL time . Duration
2023-03-21 22:40:11 -07:00
2022-12-28 04:13:54 -08:00
// HeadPostingsForMatchersCacheSize is the maximum size of cached postings for matchers elements in the Head.
2022-12-29 06:41:15 -08:00
// It's ignored when HeadPostingsForMatchersCacheTTL is 0.
2022-12-28 04:13:54 -08:00
HeadPostingsForMatchersCacheSize int
2023-03-21 22:40:11 -07:00
2022-12-28 04:13:54 -08:00
// HeadPostingsForMatchersCacheForce forces the usage of postings for matchers cache for all calls on Head and OOOHead regardless of the `concurrent` param.
HeadPostingsForMatchersCacheForce bool
2023-03-21 22:40:11 -07:00
// BlockPostingsForMatchersCacheTTL is the TTL of the postings for matchers cache of each compacted block.
// If it's 0, the cache will only deduplicate in-flight requests, deleting the results once the first request has finished.
BlockPostingsForMatchersCacheTTL time . Duration
// BlockPostingsForMatchersCacheSize is the maximum size of cached postings for matchers elements in each compacted block.
// It's ignored when BlockPostingsForMatchersCacheTTL is 0.
BlockPostingsForMatchersCacheSize int
// BlockPostingsForMatchersCacheForce forces the usage of postings for matchers cache for all calls on compacted blocks
// regardless of the `concurrent` param.
BlockPostingsForMatchersCacheForce bool
2016-12-10 09:08:50 -08:00
}
2020-07-22 08:19:33 -07:00
// BlocksToDeleteFunc is the signature of the function configured through
// Options.BlocksToDelete: given a list of blocks, it returns the set of
// blocks, keyed by ULID, which can be deleted.
type BlocksToDeleteFunc func ( blocks [ ] * Block ) map [ ulid . ULID ] struct { }
2017-01-06 02:40:09 -08:00
// DB handles reads and writes of time series falling into
// a hashed partition of a seriedb.
type DB struct {
2021-11-11 08:45:25 -08:00
dir string
locker * tsdbutil . DirLocker
2017-03-04 07:50:48 -08:00
2020-07-22 08:19:33 -07:00
logger log . Logger
metrics * dbMetrics
opts * Options
chunkPool chunkenc . Pool
compactor Compactor
blocksToDelete BlocksToDeleteFunc
2016-12-09 01:00:14 -08:00
2017-07-14 00:00:22 -07:00
// Mutex for that must be held when modifying the general block layout.
2017-03-20 00:41:56 -07:00
mtx sync . RWMutex
2017-10-09 06:21:46 -07:00
blocks [ ] * Block
2017-03-04 07:50:48 -08:00
2017-08-28 15:39:17 -07:00
head * Head
2017-01-06 03:37:28 -08:00
compactc chan struct { }
donec chan struct { }
stopc chan struct { }
2017-05-20 00:51:10 -07:00
2018-11-20 02:34:26 -08:00
// cmtx ensures that compactions and deletions don't run simultaneously.
cmtx sync . Mutex
// autoCompactMtx ensures that no compaction gets triggered while
// changing the autoCompact var.
autoCompactMtx sync . Mutex
autoCompact bool
2018-12-05 08:34:42 -08:00
// Cancel a running compaction when a shutdown is initiated.
2019-02-06 04:07:35 -08:00
compactCancel context . CancelFunc
2022-06-22 04:45:21 -07:00
// oooWasEnabled is true if out of order support was enabled at least one time
// during the time TSDB was up. In which case we need to keep supporting
// out-of-order compaction and vertical queries.
oooWasEnabled atomic . Bool
registerer prometheus . Registerer
2016-12-09 01:00:14 -08:00
}
2017-01-06 02:40:09 -08:00
type dbMetrics struct {
2021-11-11 08:45:25 -08:00
loadedBlocks prometheus . GaugeFunc
symbolTableSize prometheus . GaugeFunc
reloads prometheus . Counter
reloadsFailed prometheus . Counter
compactionsFailed prometheus . Counter
compactionsTriggered prometheus . Counter
compactionsSkipped prometheus . Counter
sizeRetentionCount prometheus . Counter
timeRetentionCount prometheus . Counter
startTime prometheus . GaugeFunc
tombCleanTimer prometheus . Histogram
blocksBytes prometheus . Gauge
maxBytes prometheus . Gauge
2016-12-31 00:48:49 -08:00
}
2020-02-14 10:48:55 -08:00
func newDBMetrics ( db * DB , r prometheus . Registerer ) * dbMetrics {
2017-01-06 02:40:09 -08:00
m := & dbMetrics { }
2017-01-03 06:43:26 -08:00
2017-05-26 06:13:03 -07:00
m . loadedBlocks = prometheus . NewGaugeFunc ( prometheus . GaugeOpts {
2018-09-18 10:17:41 -07:00
Name : "prometheus_tsdb_blocks_loaded" ,
2017-05-26 06:13:03 -07:00
Help : "Number of currently loaded data blocks" ,
} , func ( ) float64 {
db . mtx . RLock ( )
defer db . mtx . RUnlock ( )
return float64 ( len ( db . blocks ) )
} )
2018-09-08 11:28:36 -07:00
m . symbolTableSize = prometheus . NewGaugeFunc ( prometheus . GaugeOpts {
2018-09-18 10:17:41 -07:00
Name : "prometheus_tsdb_symbol_table_size_bytes" ,
2020-10-21 06:35:40 -07:00
Help : "Size of symbol table in memory for loaded blocks" ,
2018-09-08 11:28:36 -07:00
} , func ( ) float64 {
db . mtx . RLock ( )
blocks := db . blocks [ : ]
db . mtx . RUnlock ( )
2018-09-12 02:09:02 -07:00
symTblSize := uint64 ( 0 )
2018-09-08 11:28:36 -07:00
for _ , b := range blocks {
2018-09-12 02:09:02 -07:00
symTblSize += b . GetSymbolTableSize ( )
2018-09-08 11:28:36 -07:00
}
2018-09-12 02:09:02 -07:00
return float64 ( symTblSize )
2018-09-08 11:28:36 -07:00
} )
2017-05-26 06:13:03 -07:00
m . reloads = prometheus . NewCounter ( prometheus . CounterOpts {
2018-09-18 10:17:41 -07:00
Name : "prometheus_tsdb_reloads_total" ,
2017-05-26 06:13:03 -07:00
Help : "Number of times the database reloaded block data from disk." ,
} )
m . reloadsFailed = prometheus . NewCounter ( prometheus . CounterOpts {
2018-09-18 10:17:41 -07:00
Name : "prometheus_tsdb_reloads_failures_total" ,
2020-10-19 08:27:08 -07:00
Help : "Number of times the database failed to reloadBlocks block data from disk." ,
2017-05-26 06:13:03 -07:00
} )
2017-01-06 03:37:28 -08:00
m . compactionsTriggered = prometheus . NewCounter ( prometheus . CounterOpts {
2018-09-18 10:17:41 -07:00
Name : "prometheus_tsdb_compactions_triggered_total" ,
2017-01-06 03:37:28 -08:00
Help : "Total number of triggered compactions for the partition." ,
} )
2019-05-30 04:57:28 -07:00
m . compactionsFailed = prometheus . NewCounter ( prometheus . CounterOpts {
Name : "prometheus_tsdb_compactions_failed_total" ,
Help : "Total number of compactions that failed for the partition." ,
} )
2019-01-16 02:03:52 -08:00
m . timeRetentionCount = prometheus . NewCounter ( prometheus . CounterOpts {
Name : "prometheus_tsdb_time_retentions_total" ,
Help : "The number of times that blocks were deleted because the maximum time limit was exceeded." ,
} )
2018-11-20 02:34:26 -08:00
m . compactionsSkipped = prometheus . NewCounter ( prometheus . CounterOpts {
Name : "prometheus_tsdb_compactions_skipped_total" ,
Help : "Total number of skipped compactions due to disabled auto compaction." ,
} )
2018-09-14 05:07:45 -07:00
m . startTime = prometheus . NewGaugeFunc ( prometheus . GaugeOpts {
2018-09-18 10:17:41 -07:00
Name : "prometheus_tsdb_lowest_timestamp" ,
2018-11-30 10:18:12 -08:00
Help : "Lowest timestamp value stored in the database. The unit is decided by the library consumer." ,
2018-09-14 05:07:45 -07:00
} , func ( ) float64 {
db . mtx . RLock ( )
defer db . mtx . RUnlock ( )
if len ( db . blocks ) == 0 {
2019-10-09 08:41:46 -07:00
return float64 ( db . head . MinTime ( ) )
2018-09-14 05:07:45 -07:00
}
return float64 ( db . blocks [ 0 ] . meta . MinTime )
} )
2017-11-22 04:34:50 -08:00
m . tombCleanTimer = prometheus . NewHistogram ( prometheus . HistogramOpts {
2018-09-18 10:17:41 -07:00
Name : "prometheus_tsdb_tombstone_cleanup_seconds" ,
2017-11-22 04:34:50 -08:00
Help : "The time taken to recompact blocks to remove tombstones." ,
} )
2019-01-16 02:03:52 -08:00
m . blocksBytes = prometheus . NewGauge ( prometheus . GaugeOpts {
2019-01-23 05:46:58 -08:00
Name : "prometheus_tsdb_storage_blocks_bytes" ,
2019-01-16 02:03:52 -08:00
Help : "The number of bytes that are currently used for local storage by all blocks." ,
} )
2019-07-27 01:52:25 -07:00
m . maxBytes = prometheus . NewGauge ( prometheus . GaugeOpts {
Name : "prometheus_tsdb_retention_limit_bytes" ,
Help : "Max number of bytes to be retained in the tsdb blocks, configured 0 means disabled" ,
} )
2019-01-16 02:03:52 -08:00
m . sizeRetentionCount = prometheus . NewCounter ( prometheus . CounterOpts {
Name : "prometheus_tsdb_size_retentions_total" ,
Help : "The number of times that blocks were deleted because the maximum number of bytes was exceeded." ,
} )
2020-02-11 08:34:09 -08:00
2016-12-31 00:48:49 -08:00
if r != nil {
r . MustRegister (
2017-05-26 06:13:03 -07:00
m . loadedBlocks ,
2018-09-08 11:28:36 -07:00
m . symbolTableSize ,
2017-05-26 06:13:03 -07:00
m . reloads ,
m . reloadsFailed ,
2019-05-30 04:57:28 -07:00
m . compactionsFailed ,
2020-01-13 14:15:45 -08:00
m . compactionsTriggered ,
m . compactionsSkipped ,
m . sizeRetentionCount ,
m . timeRetentionCount ,
2018-09-14 05:07:45 -07:00
m . startTime ,
2017-11-22 04:34:50 -08:00
m . tombCleanTimer ,
2019-01-16 02:03:52 -08:00
m . blocksBytes ,
2019-07-27 01:52:25 -07:00
m . maxBytes ,
2016-12-31 00:48:49 -08:00
)
}
return m
}
2022-03-03 04:03:07 -08:00
// DBStats contains statistics about the DB separated by component (eg. head).
2021-06-05 07:29:32 -07:00
// They are available before the DB has finished initializing.
type DBStats struct {
Head * HeadStats
}
// NewDBStats returns a new DBStats object initialized using the
2022-08-27 13:21:41 -07:00
// new function from each component.
2021-06-05 07:29:32 -07:00
func NewDBStats ( ) * DBStats {
return & DBStats {
Head : NewHeadStats ( ) ,
}
}
2019-07-23 01:04:48 -07:00
// ErrClosed is the sentinel error returned when the db is closed, e.g. by
// DBReadOnly methods that are called after Close.
var ErrClosed = errors . New ( "db already closed" )
// DBReadOnly provides APIs for read only operations on a database.
2019-09-30 08:54:55 -07:00
// Current implementation doesn't support concurrency so
2019-07-23 01:04:48 -07:00
// all API calls should happen in the same go routine.
type DBReadOnly struct {
logger log . Logger
dir string
closers [ ] io . Closer
closed chan struct { }
}
// OpenDBReadOnly opens DB in the given directory for read only operations.
func OpenDBReadOnly ( dir string , l log . Logger ) ( * DBReadOnly , error ) {
if _ , err := os . Stat ( dir ) ; err != nil {
2019-09-30 08:54:55 -07:00
return nil , errors . Wrap ( err , "opening the db dir" )
2019-07-23 01:04:48 -07:00
}
if l == nil {
l = log . NewNopLogger ( )
}
return & DBReadOnly {
logger : l ,
dir : dir ,
closed : make ( chan struct { } ) ,
} , nil
}
2019-09-13 03:25:21 -07:00
// FlushWAL creates a new block containing all data that's currently in the memory buffer/WAL.
// Samples that are in existing blocks will not be written to the new block.
// Note that if the read only database is running concurrently with a
// writable database then writing the WAL to the database directory can race.
2020-03-23 02:19:44 -07:00
func ( db * DBReadOnly ) FlushWAL ( dir string ) ( returnErr error ) {
2019-09-13 03:25:21 -07:00
blockReaders , err := db . Blocks ( )
if err != nil {
return errors . Wrap ( err , "read blocks" )
}
maxBlockTime := int64 ( math . MinInt64 )
if len ( blockReaders ) > 0 {
maxBlockTime = blockReaders [ len ( blockReaders ) - 1 ] . Meta ( ) . MaxTime
}
2022-10-10 08:08:46 -07:00
w , err := wlog . Open ( db . logger , filepath . Join ( db . dir , "wal" ) )
2019-09-13 03:25:21 -07:00
if err != nil {
return err
}
2022-10-10 08:08:46 -07:00
var wbl * wlog . WL
wblDir := filepath . Join ( db . dir , wlog . WblDirName )
2022-06-22 04:45:21 -07:00
if _ , err := os . Stat ( wblDir ) ; ! os . IsNotExist ( err ) {
2022-10-10 08:08:46 -07:00
wbl , err = wlog . Open ( db . logger , wblDir )
2022-06-22 04:45:21 -07:00
if err != nil {
return err
}
}
2021-02-09 06:12:48 -08:00
opts := DefaultHeadOptions ( )
opts . ChunkDirRoot = db . dir
2022-06-22 04:45:21 -07:00
head , err := NewHead ( nil , db . logger , w , wbl , opts , NewHeadStats ( ) )
2019-09-13 03:25:21 -07:00
if err != nil {
return err
}
2020-03-23 02:19:44 -07:00
defer func ( ) {
2020-10-28 08:24:58 -07:00
returnErr = tsdb_errors . NewMulti (
returnErr ,
errors . Wrap ( head . Close ( ) , "closing Head" ) ,
) . Err ( )
2020-03-23 02:19:44 -07:00
} ( )
2019-09-13 03:25:21 -07:00
// Set the min valid time for the ingested wal samples
// to be no lower than the maxt of the last block.
if err := head . Init ( maxBlockTime ) ; err != nil {
return errors . Wrap ( err , "read WAL" )
}
mint := head . MinTime ( )
maxt := head . MaxTime ( )
2020-08-13 02:55:35 -07:00
rh := NewRangeHead ( head , mint , maxt )
2020-02-06 07:58:38 -08:00
compactor , err := NewLeveledCompactor (
context . Background ( ) ,
nil ,
db . logger ,
2020-02-11 08:34:09 -08:00
ExponentialBlockRanges ( DefaultOptions ( ) . MinBlockDuration , 3 , 5 ) ,
2020-02-07 08:24:17 -08:00
chunkenc . NewPool ( ) ,
2021-05-18 09:38:37 -07:00
nil ,
2022-06-09 12:11:42 -07:00
false ,
2020-02-06 07:58:38 -08:00
)
2019-09-13 03:25:21 -07:00
if err != nil {
return errors . Wrap ( err , "create leveled compactor" )
}
// Add +1 millisecond to block maxt because block intervals are half-open: [b.MinTime, b.MaxTime).
// Because of this block intervals are always +1 than the total samples it includes.
_ , err = compactor . Write ( dir , rh , mint , maxt + 1 , nil )
return errors . Wrap ( err , "writing WAL" )
}
2020-07-31 08:03:02 -07:00
func ( db * DBReadOnly ) loadDataAsQueryable ( maxt int64 ) ( storage . SampleAndChunkQueryable , error ) {
2019-07-23 01:04:48 -07:00
select {
case <- db . closed :
return nil , ErrClosed
default :
}
2019-09-13 03:25:21 -07:00
blockReaders , err := db . Blocks ( )
2019-07-23 01:04:48 -07:00
if err != nil {
return nil , err
}
2019-09-13 03:25:21 -07:00
blocks := make ( [ ] * Block , len ( blockReaders ) )
for i , b := range blockReaders {
2019-07-23 01:04:48 -07:00
b , ok := b . ( * Block )
if ! ok {
return nil , errors . New ( "unable to convert a read only block to a normal block" )
}
blocks [ i ] = b
}
2021-02-09 06:12:48 -08:00
opts := DefaultHeadOptions ( )
opts . ChunkDirRoot = db . dir
2022-06-22 04:45:21 -07:00
head , err := NewHead ( nil , db . logger , nil , nil , opts , NewHeadStats ( ) )
2019-07-23 01:04:48 -07:00
if err != nil {
return nil , err
}
maxBlockTime := int64 ( math . MinInt64 )
if len ( blocks ) > 0 {
maxBlockTime = blocks [ len ( blocks ) - 1 ] . Meta ( ) . MaxTime
}
2019-09-30 08:54:55 -07:00
// Also add the WAL if the current blocks don't cover the requests time range.
2019-07-23 01:04:48 -07:00
if maxBlockTime <= maxt {
2020-05-06 08:30:00 -07:00
if err := head . Close ( ) ; err != nil {
return nil , err
}
2022-10-10 08:08:46 -07:00
w , err := wlog . Open ( db . logger , filepath . Join ( db . dir , "wal" ) )
2019-07-23 01:04:48 -07:00
if err != nil {
return nil , err
}
2022-10-10 08:08:46 -07:00
var wbl * wlog . WL
wblDir := filepath . Join ( db . dir , wlog . WblDirName )
2022-06-22 04:45:21 -07:00
if _ , err := os . Stat ( wblDir ) ; ! os . IsNotExist ( err ) {
2022-10-10 08:08:46 -07:00
wbl , err = wlog . Open ( db . logger , wblDir )
2022-06-22 04:45:21 -07:00
if err != nil {
return nil , err
}
}
2021-02-09 06:12:48 -08:00
opts := DefaultHeadOptions ( )
opts . ChunkDirRoot = db . dir
2022-06-22 04:45:21 -07:00
head , err = NewHead ( nil , db . logger , w , wbl , opts , NewHeadStats ( ) )
2019-07-23 01:04:48 -07:00
if err != nil {
return nil , err
}
// Set the min valid time for the ingested wal samples
// to be no lower than the maxt of the last block.
if err := head . Init ( maxBlockTime ) ; err != nil {
return nil , errors . Wrap ( err , "read WAL" )
}
// Set the wal to nil to disable all wal operations.
// This is mainly to avoid blocking when closing the head.
head . wal = nil
}
2020-05-06 08:30:00 -07:00
db . closers = append ( db . closers , head )
2020-07-31 08:03:02 -07:00
return & DB {
2019-07-23 01:04:48 -07:00
dir : db . dir ,
logger : db . logger ,
blocks : blocks ,
head : head ,
2020-07-31 08:03:02 -07:00
} , nil
}
2019-07-23 01:04:48 -07:00
2020-07-31 08:03:02 -07:00
// Querier loads the blocks and wal and returns a new querier over the data partition for the given time range.
// Current implementation doesn't support multiple Queriers.
func ( db * DBReadOnly ) Querier ( ctx context . Context , mint , maxt int64 ) ( storage . Querier , error ) {
q , err := db . loadDataAsQueryable ( maxt )
if err != nil {
return nil , err
}
return q . Querier ( ctx , mint , maxt )
2019-07-23 01:04:48 -07:00
}
2020-07-31 08:03:02 -07:00
// ChunkQuerier loads blocks and the wal and returns a new chunk querier over the data partition for the given time range.
// Current implementation doesn't support multiple ChunkQueriers.
func ( db * DBReadOnly ) ChunkQuerier ( ctx context . Context , mint , maxt int64 ) ( storage . ChunkQuerier , error ) {
q , err := db . loadDataAsQueryable ( maxt )
if err != nil {
return nil , err
}
return q . ChunkQuerier ( ctx , mint , maxt )
2020-06-24 06:41:52 -07:00
}
2019-07-23 01:04:48 -07:00
// Blocks returns a slice of block readers for persisted blocks.
func ( db * DBReadOnly ) Blocks ( ) ( [ ] BlockReader , error ) {
select {
case <- db . closed :
return nil , ErrClosed
default :
}
2023-03-21 22:40:11 -07:00
loadable , corrupted , err := openBlocks ( db . logger , db . dir , nil , nil , nil , defaultPostingsForMatchersCacheTTL , defaultPostingsForMatchersCacheSize , false )
2019-07-23 01:04:48 -07:00
if err != nil {
return nil , err
}
// Corrupted blocks that have been superseded by a loadable block can be safely ignored.
for _ , block := range loadable {
for _ , b := range block . Meta ( ) . Compaction . Parents {
delete ( corrupted , b . ULID )
}
}
if len ( corrupted ) > 0 {
for _ , b := range loadable {
if err := b . Close ( ) ; err != nil {
2020-06-17 07:40:00 -07:00
level . Warn ( db . logger ) . Log ( "msg" , "Closing block failed" , "err" , err , "block" , b )
2019-07-23 01:04:48 -07:00
}
}
2020-10-28 08:24:58 -07:00
errs := tsdb_errors . NewMulti ( )
2020-06-17 07:40:00 -07:00
for ulid , err := range corrupted {
2020-10-28 08:24:58 -07:00
errs . Add ( errors . Wrapf ( err , "corrupted block %s" , ulid . String ( ) ) )
2020-06-17 07:40:00 -07:00
}
2020-10-28 08:24:58 -07:00
return nil , errs . Err ( )
2019-07-23 01:04:48 -07:00
}
if len ( loadable ) == 0 {
2019-09-13 03:25:21 -07:00
return nil , nil
2019-07-23 01:04:48 -07:00
}
sort . Slice ( loadable , func ( i , j int ) bool {
return loadable [ i ] . Meta ( ) . MinTime < loadable [ j ] . Meta ( ) . MinTime
} )
blockMetas := make ( [ ] BlockMeta , 0 , len ( loadable ) )
for _ , b := range loadable {
blockMetas = append ( blockMetas , b . Meta ( ) )
}
if overlaps := OverlappingBlocks ( blockMetas ) ; len ( overlaps ) > 0 {
2020-04-11 01:22:18 -07:00
level . Warn ( db . logger ) . Log ( "msg" , "Overlapping blocks found during opening" , "detail" , overlaps . String ( ) )
2019-07-23 01:04:48 -07:00
}
// Close all previously open readers and add the new ones to the cache.
for _ , closer := range db . closers {
closer . Close ( )
}
blockClosers := make ( [ ] io . Closer , len ( loadable ) )
blockReaders := make ( [ ] BlockReader , len ( loadable ) )
for i , b := range loadable {
blockClosers [ i ] = b
blockReaders [ i ] = b
}
db . closers = blockClosers
return blockReaders , nil
}
// Close all block readers.
func ( db * DBReadOnly ) Close ( ) error {
select {
case <- db . closed :
return ErrClosed
default :
}
close ( db . closed )
2020-10-28 08:24:58 -07:00
return tsdb_errors . CloseAll ( db . closers )
2019-07-23 01:04:48 -07:00
}
2020-02-07 08:24:17 -08:00
// Open returns a new DB in the given directory. If options are empty, DefaultOptions will be used.
2021-06-05 07:29:32 -07:00
func Open ( dir string , l log . Logger , r prometheus . Registerer , opts * Options , stats * DBStats ) ( db * DB , err error ) {
2020-02-06 07:58:38 -08:00
var rngs [ ] int64
opts , rngs = validateOpts ( opts , nil )
2021-06-05 07:29:32 -07:00
return open ( dir , l , r , opts , rngs , stats )
2020-02-06 07:58:38 -08:00
}
func validateOpts ( opts * Options , rngs [ ] int64 ) ( * Options , [ ] int64 ) {
if opts == nil {
opts = DefaultOptions ( )
}
if opts . StripeSize <= 0 {
opts . StripeSize = DefaultStripeSize
}
2020-11-19 05:00:47 -08:00
if opts . HeadChunksWriteBufferSize <= 0 {
opts . HeadChunksWriteBufferSize = chunks . DefaultWriteBufferSize
}
2021-11-16 02:05:07 -08:00
if opts . HeadChunksEndTimeVariance <= 0 {
opts . HeadChunksEndTimeVariance = 0
}
2021-12-07 23:44:26 -08:00
if opts . HeadChunksWriteQueueSize < 0 {
opts . HeadChunksWriteQueueSize = chunks . DefaultWriteQueueSize
}
2021-04-15 01:55:01 -07:00
if opts . MaxBlockChunkSegmentSize <= 0 {
opts . MaxBlockChunkSegmentSize = chunks . DefaultChunkSegmentSize
}
2020-02-06 07:58:38 -08:00
if opts . MinBlockDuration <= 0 {
2020-02-11 08:34:09 -08:00
opts . MinBlockDuration = DefaultBlockDuration
2020-02-06 07:58:38 -08:00
}
if opts . MinBlockDuration > opts . MaxBlockDuration {
opts . MaxBlockDuration = opts . MinBlockDuration
}
2022-06-22 04:45:21 -07:00
if opts . OutOfOrderCapMax <= 0 {
opts . OutOfOrderCapMax = DefaultOutOfOrderCapMax
}
2022-06-24 03:09:28 -07:00
if opts . OutOfOrderTimeWindow < 0 {
opts . OutOfOrderTimeWindow = 0
2022-06-22 04:45:21 -07:00
}
2020-02-06 07:58:38 -08:00
if len ( rngs ) == 0 {
// Start with smallest block duration and create exponential buckets until the exceed the
// configured maximum block duration.
2020-02-11 08:34:09 -08:00
rngs = ExponentialBlockRanges ( opts . MinBlockDuration , 10 , 3 )
2020-02-06 07:58:38 -08:00
}
return opts , rngs
}
2021-11-17 02:21:27 -08:00
// open returns a new DB in the given directory.
// It initializes the lockfile, WAL, compactor, and Head (by replaying the WAL), and runs the database.
// It is not safe to open more than one DB in the same directory.
2021-06-05 07:29:32 -07:00
func open ( dir string , l log . Logger , r prometheus . Registerer , opts * Options , rngs [ ] int64 , stats * DBStats ) ( _ * DB , returnedErr error ) {
2021-10-22 01:06:44 -07:00
if err := os . MkdirAll ( dir , 0 o777 ) ; err != nil {
2017-02-19 04:01:19 -08:00
return nil , err
}
2017-02-19 07:04:37 -08:00
if l == nil {
2017-09-13 01:17:20 -07:00
l = log . NewNopLogger ( )
2017-02-19 07:04:37 -08:00
}
2021-06-05 07:29:32 -07:00
if stats == nil {
stats = NewDBStats ( )
}
2020-02-06 07:58:38 -08:00
for i , v := range rngs {
2020-02-11 08:34:09 -08:00
if v > opts . MaxBlockDuration {
2020-02-06 07:58:38 -08:00
rngs = rngs [ : i ]
break
}
2020-01-29 23:12:43 -08:00
}
2020-02-06 07:58:38 -08:00
2018-02-12 02:40:12 -08:00
// Fixup bad format written by Prometheus 2.1.
2018-02-09 04:37:10 -08:00
if err := repairBadIndexVersion ( l , dir ) ; err != nil {
2020-08-10 22:56:08 -07:00
return nil , errors . Wrap ( err , "repair bad index version" )
2018-02-09 04:11:03 -08:00
}
2020-08-14 02:45:08 -07:00
walDir := filepath . Join ( dir , "wal" )
2022-10-10 08:08:46 -07:00
wblDir := filepath . Join ( dir , wlog . WblDirName )
2020-08-14 02:45:08 -07:00
2018-09-17 09:30:56 -07:00
// Migrate old WAL if one exists.
2020-08-14 02:45:08 -07:00
if err := MigrateWAL ( l , walDir ) ; err != nil {
2018-05-27 10:05:11 -07:00
return nil , errors . Wrap ( err , "migrate WAL" )
}
2022-03-24 03:44:14 -07:00
for _ , tmpDir := range [ ] string { walDir , dir } {
// Remove tmp dirs.
if err := removeBestEffortTmpDirs ( l , tmpDir ) ; err != nil {
return nil , errors . Wrap ( err , "remove tmp dirs" )
}
2020-08-10 22:56:08 -07:00
}
2017-01-17 21:18:32 -08:00
2020-10-28 03:09:03 -07:00
db := & DB {
2020-07-22 08:19:33 -07:00
dir : dir ,
logger : l ,
opts : opts ,
compactc : make ( chan struct { } , 1 ) ,
donec : make ( chan struct { } ) ,
stopc : make ( chan struct { } ) ,
autoCompact : true ,
chunkPool : chunkenc . NewPool ( ) ,
blocksToDelete : opts . BlocksToDelete ,
2022-06-22 04:45:21 -07:00
registerer : r ,
2020-07-22 08:19:33 -07:00
}
2020-10-21 08:08:28 -07:00
defer func ( ) {
// Close files if startup fails somewhere.
if returnedErr == nil {
return
}
2020-10-28 03:09:03 -07:00
close ( db . donec ) // DB is never run if it was an error, so close this channel here.
2020-10-28 08:24:58 -07:00
returnedErr = tsdb_errors . NewMulti (
returnedErr ,
errors . Wrap ( db . Close ( ) , "close DB after failed startup" ) ,
) . Err ( )
2020-10-21 08:08:28 -07:00
} ( )
2020-07-22 08:19:33 -07:00
if db . blocksToDelete == nil {
db . blocksToDelete = DefaultBlocksToDelete ( db )
2017-01-06 00:26:39 -08:00
}
2019-07-27 01:52:25 -07:00
2021-11-11 08:45:25 -08:00
var err error
db . locker , err = tsdbutil . NewDirLocker ( dir , "tsdb" , db . logger , r )
if err != nil {
return nil , err
}
2017-05-09 03:52:47 -07:00
if ! opts . NoLockfile {
2021-11-11 08:45:25 -08:00
if err := db . locker . Lock ( ) ; err != nil {
2017-05-18 07:09:30 -07:00
return nil , err
}
2017-05-09 03:52:47 -07:00
}
2019-02-06 04:07:35 -08:00
ctx , cancel := context . WithCancel ( context . Background ( ) )
2023-01-30 03:12:14 -08:00
db . compactor , err = NewLeveledCompactorWithChunkSize ( ctx , r , l , rngs , db . chunkPool , opts . MaxBlockChunkSegmentSize , nil , opts . AllowOverlappingCompaction )
2020-10-28 03:09:03 -07:00
if err != nil {
2019-02-06 04:07:35 -08:00
cancel ( )
2020-10-28 03:09:03 -07:00
return nil , errors . Wrap ( err , "create leveled compactor" )
2017-07-07 04:46:41 -07:00
}
2019-02-06 04:07:35 -08:00
db . compactCancel = cancel
2017-07-07 04:46:41 -07:00
2022-10-10 08:08:46 -07:00
var wal , wbl * wlog . WL
segmentSize := wlog . DefaultSegmentSize
2019-03-25 16:38:12 -07:00
// Wal is enabled.
if opts . WALSegmentSize >= 0 {
// Wal is set to a custom size.
if opts . WALSegmentSize > 0 {
2020-02-11 08:34:09 -08:00
segmentSize = opts . WALSegmentSize
2019-03-25 16:38:12 -07:00
}
2022-10-10 08:08:46 -07:00
wal , err = wlog . NewSize ( l , r , walDir , segmentSize , opts . WALCompression )
2020-10-28 03:09:03 -07:00
if err != nil {
return nil , err
2019-03-25 16:38:12 -07:00
}
2022-07-14 01:28:03 -07:00
// Check if there is a WBL on disk, in which case we should replay that data.
wblSize , err := fileutil . DirSize ( wblDir )
if err != nil && ! os . IsNotExist ( err ) {
return nil , err
}
if opts . OutOfOrderTimeWindow > 0 || wblSize > 0 {
2022-10-10 08:08:46 -07:00
wbl , err = wlog . NewSize ( l , r , wblDir , segmentSize , opts . WALCompression )
2022-06-22 04:45:21 -07:00
if err != nil {
return nil , err
}
}
2017-08-28 15:39:17 -07:00
}
2022-06-24 03:09:28 -07:00
db . oooWasEnabled . Store ( opts . OutOfOrderTimeWindow > 0 )
2021-02-09 06:12:48 -08:00
headOpts := DefaultHeadOptions ( )
headOpts . ChunkRange = rngs [ 0 ]
headOpts . ChunkDirRoot = dir
headOpts . ChunkPool = db . chunkPool
headOpts . ChunkWriteBufferSize = opts . HeadChunksWriteBufferSize
2021-11-16 02:05:07 -08:00
headOpts . ChunkEndTimeVariance = opts . HeadChunksEndTimeVariance
2021-12-07 23:44:26 -08:00
headOpts . ChunkWriteQueueSize = opts . HeadChunksWriteQueueSize
2021-02-09 06:12:48 -08:00
headOpts . StripeSize = opts . StripeSize
headOpts . SeriesCallback = opts . SeriesLifecycleCallback
2021-07-19 21:52:57 -07:00
headOpts . EnableExemplarStorage = opts . EnableExemplarStorage
headOpts . MaxExemplars . Store ( opts . MaxExemplars )
2021-08-06 09:51:01 -07:00
headOpts . EnableMemorySnapshotOnShutdown = opts . EnableMemorySnapshotOnShutdown
2022-09-14 05:08:34 -07:00
headOpts . EnableNativeHistograms . Store ( opts . EnableNativeHistograms )
2022-06-24 03:09:28 -07:00
headOpts . OutOfOrderTimeWindow . Store ( opts . OutOfOrderTimeWindow )
2022-06-22 04:45:21 -07:00
headOpts . OutOfOrderCapMax . Store ( opts . OutOfOrderCapMax )
2022-12-28 04:13:54 -08:00
headOpts . PostingsForMatchersCacheTTL = opts . HeadPostingsForMatchersCacheTTL
headOpts . PostingsForMatchersCacheSize = opts . HeadPostingsForMatchersCacheSize
headOpts . PostingsForMatchersCacheForce = opts . HeadPostingsForMatchersCacheForce
2023-03-07 08:41:33 -08:00
if opts . WALReplayConcurrency > 0 {
headOpts . WALReplayConcurrency = opts . WALReplayConcurrency
}
2021-11-19 02:11:32 -08:00
if opts . IsolationDisabled {
// We only override this flag if isolation is disabled at DB level. We use the default otherwise.
headOpts . IsolationDisabled = opts . IsolationDisabled
}
2022-10-10 08:08:46 -07:00
db . head , err = NewHead ( r , l , wal , wbl , headOpts , stats . Head )
2020-10-28 03:09:03 -07:00
if err != nil {
return nil , err
2017-08-30 08:38:25 -07:00
}
2018-12-04 02:30:49 -08:00
2020-07-04 21:41:42 -07:00
// Register metrics after assigning the head block.
db . metrics = newDBMetrics ( db , r )
maxBytes := opts . MaxBytes
if maxBytes < 0 {
maxBytes = 0
}
db . metrics . maxBytes . Set ( float64 ( maxBytes ) )
2018-11-28 01:23:50 -08:00
if err := db . reload ( ) ; err != nil {
return nil , err
}
2018-12-04 02:30:49 -08:00
// Set the min valid time for the ingested samples
// to be no lower than the maxt of the last block.
minValidTime := int64 ( math . MinInt64 )
2022-06-27 07:56:25 -07:00
// We do not consider blocks created from out-of-order samples for Head's minValidTime
// since minValidTime is only for the in-order data and we do not want to discard unnecessary
// samples from the Head.
inOrderMaxTime , ok := db . inOrderBlocksMaxTime ( )
if ok {
minValidTime = inOrderMaxTime
2018-12-04 02:30:49 -08:00
}
2019-06-14 08:39:22 -07:00
if initErr := db . head . Init ( minValidTime ) ; initErr != nil {
db . head . metrics . walCorruptionsTotal . Inc ( )
2022-06-22 04:45:21 -07:00
isOOOErr := isErrLoadOOOWal ( initErr )
if isOOOErr {
level . Warn ( db . logger ) . Log ( "msg" , "Encountered OOO WAL read error, attempting repair" , "err" , initErr )
2022-10-10 08:08:46 -07:00
if err := wbl . Repair ( initErr ) ; err != nil {
2022-06-22 04:45:21 -07:00
return nil , errors . Wrap ( err , "repair corrupted OOO WAL" )
}
2023-03-21 07:03:43 -07:00
level . Info ( db . logger ) . Log ( "msg" , "Successfully repaired OOO WAL" )
2022-06-22 04:45:21 -07:00
} else {
level . Warn ( db . logger ) . Log ( "msg" , "Encountered WAL read error, attempting repair" , "err" , initErr )
2022-10-10 08:08:46 -07:00
if err := wal . Repair ( initErr ) ; err != nil {
2022-06-22 04:45:21 -07:00
return nil , errors . Wrap ( err , "repair corrupted WAL" )
}
2023-03-21 07:03:43 -07:00
level . Info ( db . logger ) . Log ( "msg" , "Successfully repaired WAL" )
2019-06-14 08:39:22 -07:00
}
2018-12-04 02:30:49 -08:00
}
2017-08-28 15:39:17 -07:00
2022-08-22 05:56:43 -07:00
if db . head . MinOOOTime ( ) != int64 ( math . MaxInt64 ) {
// Some OOO data was replayed from the disk that needs compaction and cleanup.
db . oooWasEnabled . Store ( true )
}
2017-01-06 03:37:28 -08:00
go db . run ( )
return db , nil
}
2020-08-10 22:56:08 -07:00
func removeBestEffortTmpDirs ( l log . Logger , dir string ) error {
2022-04-27 02:24:36 -07:00
files , err := os . ReadDir ( dir )
2022-03-24 03:44:14 -07:00
if os . IsNotExist ( err ) {
return nil
}
2020-08-10 22:56:08 -07:00
if err != nil {
return err
}
2022-04-27 02:24:36 -07:00
for _ , f := range files {
if isTmpDir ( f ) {
if err := os . RemoveAll ( filepath . Join ( dir , f . Name ( ) ) ) ; err != nil {
level . Error ( l ) . Log ( "msg" , "failed to delete tmp block dir" , "dir" , filepath . Join ( dir , f . Name ( ) ) , "err" , err )
2020-08-10 22:56:08 -07:00
continue
}
2022-04-27 02:24:36 -07:00
level . Info ( l ) . Log ( "msg" , "Found and deleted tmp block dir" , "dir" , filepath . Join ( dir , f . Name ( ) ) )
2020-08-10 22:56:08 -07:00
}
}
return nil
}
2020-02-06 07:58:38 -08:00
// StartTime implements the Storage interface.
// It returns the minimum time of the first loaded block, or the head's
// minimum time when no blocks are loaded. The returned error is always nil.
func (db *DB) StartTime() (int64, error) {
	db.mtx.RLock()
	defer db.mtx.RUnlock()

	if len(db.blocks) == 0 {
		return db.head.MinTime(), nil
	}
	return db.blocks[0].Meta().MinTime, nil
}
2017-06-08 03:14:13 -07:00
// Dir reports the directory the database was opened in.
func (db *DB) Dir() string {
	dir := db.dir
	return dir
}
2017-01-06 03:37:28 -08:00
func ( db * DB ) run ( ) {
defer close ( db . donec )
2017-08-28 15:39:17 -07:00
backoff := time . Duration ( 0 )
2017-02-28 06:08:52 -08:00
2017-01-19 22:58:19 -08:00
for {
select {
2017-08-28 15:39:17 -07:00
case <- db . stopc :
2017-08-30 09:34:54 -07:00
return
2017-08-28 15:39:17 -07:00
case <- time . After ( backoff ) :
}
select {
case <- time . After ( 1 * time . Minute ) :
2021-01-06 23:30:08 -08:00
db . cmtx . Lock ( )
if err := db . reloadBlocks ( ) ; err != nil {
level . Error ( db . logger ) . Log ( "msg" , "reloadBlocks" , "err" , err )
}
db . cmtx . Unlock ( )
2017-02-28 06:08:52 -08:00
select {
case db . compactc <- struct { } { } :
default :
}
2017-01-06 03:37:28 -08:00
case <- db . compactc :
db . metrics . compactionsTriggered . Inc ( )
2018-11-20 02:34:26 -08:00
db . autoCompactMtx . Lock ( )
if db . autoCompact {
2020-01-19 23:29:49 -08:00
if err := db . Compact ( ) ; err != nil {
2018-11-20 02:34:26 -08:00
level . Error ( db . logger ) . Log ( "msg" , "compaction failed" , "err" , err )
backoff = exponential ( backoff , 1 * time . Second , 1 * time . Minute )
} else {
backoff = 0
}
2017-08-30 09:34:54 -07:00
} else {
2018-11-20 02:34:26 -08:00
db . metrics . compactionsSkipped . Inc ( )
2017-01-06 03:37:28 -08:00
}
2018-11-20 02:34:26 -08:00
db . autoCompactMtx . Unlock ( )
2017-01-06 03:37:28 -08:00
case <- db . stopc :
return
}
}
}
2017-08-30 09:34:54 -07:00
// Appender opens a new appender against the database.
2020-07-24 07:10:51 -07:00
func ( db * DB ) Appender ( ctx context . Context ) storage . Appender {
2020-07-30 04:11:13 -07:00
return dbAppender { db : db , Appender : db . head . Appender ( ctx ) }
2017-08-30 09:34:54 -07:00
}
2022-06-22 04:45:21 -07:00
// ApplyConfig applies a new config to the DB.
2022-06-24 03:09:28 -07:00
// Behaviour of 'OutOfOrderTimeWindow' is as follows:
// OOO enabled = oooTimeWindow > 0. OOO disabled = oooTimeWindow is 0.
2022-06-22 04:45:21 -07:00
// 1) Before: OOO disabled, Now: OOO enabled =>
2022-08-18 02:04:13 -07:00
// - A new WBL is created for the head block.
// - OOO compaction is enabled.
// - Overlapping queries are enabled.
//
2022-06-22 04:45:21 -07:00
// 2) Before: OOO enabled, Now: OOO enabled =>
2022-08-18 02:04:13 -07:00
// - Only the time window is updated.
//
2022-06-22 04:45:21 -07:00
// 3) Before: OOO enabled, Now: OOO disabled =>
2022-08-18 02:04:13 -07:00
// - Time Window set to 0. So no new OOO samples will be allowed.
2022-09-20 10:05:50 -07:00
// - OOO WBL will stay and will be eventually cleaned up.
// - OOO Compaction and overlapping queries will remain enabled until a restart or until all OOO samples are compacted.
2022-08-18 02:04:13 -07:00
//
2022-06-22 04:45:21 -07:00
// 4) Before: OOO disabled, Now: OOO disabled => no-op.
2021-07-19 21:52:57 -07:00
func ( db * DB ) ApplyConfig ( conf * config . Config ) error {
2022-06-24 03:09:28 -07:00
oooTimeWindow := int64 ( 0 )
2022-06-22 04:45:21 -07:00
if conf . StorageConfig . TSDBConfig != nil {
2022-06-24 03:09:28 -07:00
oooTimeWindow = conf . StorageConfig . TSDBConfig . OutOfOrderTimeWindow
2022-06-22 04:45:21 -07:00
}
2022-06-24 03:09:28 -07:00
if oooTimeWindow < 0 {
oooTimeWindow = 0
2022-06-22 04:45:21 -07:00
}
// Create WBL if it was not present and if OOO is enabled with WAL enabled.
2022-10-10 08:08:46 -07:00
var wblog * wlog . WL
2022-06-22 04:45:21 -07:00
var err error
2022-08-09 06:55:17 -07:00
if db . head . wbl != nil {
// The existing WBL from the disk might have been replayed while OOO was disabled.
wblog = db . head . wbl
} else if ! db . oooWasEnabled . Load ( ) && oooTimeWindow > 0 && db . opts . WALSegmentSize >= 0 {
2022-10-10 08:08:46 -07:00
segmentSize := wlog . DefaultSegmentSize
2022-06-22 04:45:21 -07:00
// Wal is set to a custom size.
if db . opts . WALSegmentSize > 0 {
segmentSize = db . opts . WALSegmentSize
}
2022-10-10 08:08:46 -07:00
oooWalDir := filepath . Join ( db . dir , wlog . WblDirName )
wblog , err = wlog . NewSize ( db . logger , db . registerer , oooWalDir , segmentSize , db . opts . WALCompression )
2022-06-22 04:45:21 -07:00
if err != nil {
return err
}
}
2022-06-27 07:56:25 -07:00
db . opts . OutOfOrderTimeWindow = oooTimeWindow
2022-06-22 04:45:21 -07:00
db . head . ApplyConfig ( conf , wblog )
if ! db . oooWasEnabled . Load ( ) {
2022-06-24 03:09:28 -07:00
db . oooWasEnabled . Store ( oooTimeWindow > 0 )
2022-06-22 04:45:21 -07:00
}
return nil
2021-07-19 21:52:57 -07:00
}
2022-09-14 05:08:34 -07:00
// EnableNativeHistograms enables the native histogram feature by delegating
// to the head.
func (db *DB) EnableNativeHistograms() {
	head := db.head
	head.EnableNativeHistograms()
}
// DisableNativeHistograms disables the native histogram feature by delegating
// to the head.
func (db *DB) DisableNativeHistograms() {
	head := db.head
	head.DisableNativeHistograms()
}
2017-08-30 09:34:54 -07:00
// dbAppender wraps the DB's head appender and triggers compactions on commit
// if necessary.
type dbAppender struct {
2020-02-06 07:58:38 -08:00
storage . Appender
2017-08-30 09:34:54 -07:00
db * DB
}
2021-03-19 12:28:55 -07:00
var _ storage . GetRef = dbAppender { }
2022-10-24 01:17:45 -07:00
func ( a dbAppender ) GetRef ( lset labels . Labels , hash uint64 ) ( storage . SeriesRef , labels . Labels ) {
2021-03-19 12:28:55 -07:00
if g , ok := a . Appender . ( storage . GetRef ) ; ok {
2022-10-24 01:17:45 -07:00
return g . GetRef ( lset , hash )
2021-03-19 12:28:55 -07:00
}
2022-03-09 14:17:40 -08:00
return 0 , labels . EmptyLabels ( )
2021-03-19 12:28:55 -07:00
}
2017-08-30 09:34:54 -07:00
func ( a dbAppender ) Commit ( ) error {
err := a . Appender . Commit ( )
2017-09-04 06:07:30 -07:00
// We could just run this check every few minutes practically. But for benchmarks
// and high frequency use cases this is the safer way.
2019-04-01 01:19:06 -07:00
if a . db . head . compactable ( ) {
2017-08-30 09:34:54 -07:00
select {
case a . db . compactc <- struct { } { } :
default :
}
}
return err
}
2018-06-27 09:05:21 -07:00
// Compact data if possible. After successful compaction blocks are reloaded
2020-10-19 08:27:08 -07:00
// which will also delete the blocks that fall out of the retention window.
// Old blocks are only deleted on reloadBlocks based on the new block's parent information.
// See DB.reloadBlocks documentation for further information.
func ( db * DB ) Compact ( ) ( returnErr error ) {
2017-07-13 07:15:13 -07:00
db . cmtx . Lock ( )
defer db . cmtx . Unlock ( )
2019-05-30 04:57:28 -07:00
defer func ( ) {
2022-06-16 22:51:43 -07:00
if returnErr != nil && ! errors . Is ( returnErr , context . Canceled ) {
// If we got an error because context was canceled then we're most likely
// shutting down TSDB and we don't need to report this on metrics
2019-05-30 04:57:28 -07:00
db . metrics . compactionsFailed . Inc ( )
}
} ( )
2020-10-19 08:27:08 -07:00
lastBlockMaxt := int64 ( math . MinInt64 )
defer func ( ) {
2020-10-28 08:24:58 -07:00
returnErr = tsdb_errors . NewMulti (
returnErr ,
errors . Wrap ( db . head . truncateWAL ( lastBlockMaxt ) , "WAL truncation in Compact defer" ) ,
) . Err ( )
2020-10-19 08:27:08 -07:00
} ( )
2020-12-07 13:29:43 -08:00
start := time . Now ( )
2017-07-13 07:15:13 -07:00
// Check whether we have pending head blocks that are ready to be persisted.
// They have the highest priority.
2017-08-28 15:39:17 -07:00
for {
2017-03-04 07:50:48 -08:00
select {
case <- db . stopc :
2018-09-20 23:24:01 -07:00
return nil
2017-03-04 07:50:48 -08:00
default :
2017-02-02 00:32:06 -08:00
}
2019-04-01 01:19:06 -07:00
if ! db . head . compactable ( ) {
2017-08-28 15:39:17 -07:00
break
}
2018-12-04 02:30:49 -08:00
mint := db . head . MinTime ( )
2020-07-27 21:42:42 -07:00
maxt := rangeForTimestamp ( mint , db . head . chunkRange . Load ( ) )
2017-02-02 00:32:06 -08:00
2017-08-28 15:39:17 -07:00
// Wrap head into a range that bounds all reads to it.
2020-02-14 01:50:24 -08:00
// We remove 1 millisecond from maxt because block
// intervals are half-open: [b.MinTime, b.MaxTime). But
// chunk intervals are closed: [c.MinTime, c.MaxTime];
// so in order to make sure that overlaps are evaluated
// consistently, we explicitly remove the last value
// from the block interval here.
2022-09-27 07:01:23 -07:00
rh := NewRangeHeadWithIsolationDisabled ( db . head , mint , maxt - 1 )
// Compaction runs with isolation disabled, because head.compactable()
// ensures that maxt is more than chunkRange/2 back from now, and
// head.appendableMinValidTime() ensures that no new appends can start within the compaction range.
// We do need to wait for any overlapping appenders that started previously to finish.
db . head . WaitForAppendersOverlapping ( rh . MaxTime ( ) )
if err := db . compactHead ( rh ) ; err != nil {
2020-10-19 08:27:08 -07:00
return errors . Wrap ( err , "compact head" )
2017-03-04 07:50:48 -08:00
}
2020-10-19 08:27:08 -07:00
// Consider only successful compactions for WAL truncation.
lastBlockMaxt = maxt
}
// Clear some disk space before compacting blocks, especially important
// when Head compaction happened over a long time range.
if err := db . head . truncateWAL ( lastBlockMaxt ) ; err != nil {
return errors . Wrap ( err , "WAL truncation in Compact" )
2020-02-14 01:50:24 -08:00
}
2017-08-09 02:10:29 -07:00
2020-12-07 13:29:43 -08:00
compactionDuration := time . Since ( start )
2020-12-25 05:45:23 -08:00
if compactionDuration . Milliseconds ( ) > db . head . chunkRange . Load ( ) {
2020-12-07 13:29:43 -08:00
level . Warn ( db . logger ) . Log (
"msg" , "Head compaction took longer than the block time range, compactions are falling behind and won't be able to catch up" ,
"duration" , compactionDuration . String ( ) ,
"block_range" , db . head . chunkRange . Load ( ) ,
)
}
2022-06-22 04:45:21 -07:00
if lastBlockMaxt != math . MinInt64 {
// The head was compacted, so we compact OOO head as well.
if err := db . compactOOOHead ( ) ; err != nil {
return errors . Wrap ( err , "compact ooo head" )
}
}
2020-02-14 01:50:24 -08:00
return db . compactBlocks ( )
}
2017-09-08 06:09:24 -07:00
2020-10-19 08:27:08 -07:00
// CompactHead compacts the given RangeHead.
func ( db * DB ) CompactHead ( head * RangeHead ) error {
2020-02-14 01:50:24 -08:00
db . cmtx . Lock ( )
defer db . cmtx . Unlock ( )
2020-10-19 08:27:08 -07:00
if err := db . compactHead ( head ) ; err != nil {
return errors . Wrap ( err , "compact head" )
}
if err := db . head . truncateWAL ( head . BlockMaxTime ( ) ) ; err != nil {
return errors . Wrap ( err , "WAL truncation" )
}
return nil
2020-02-14 01:50:24 -08:00
}
2022-06-22 04:45:21 -07:00
// CompactOOOHead compacts the OOO Head while holding the compaction mutex.
func (db *DB) CompactOOOHead() error {
	db.cmtx.Lock()
	defer db.cmtx.Unlock()

	err := db.compactOOOHead()
	return err
}
func ( db * DB ) compactOOOHead ( ) error {
if ! db . oooWasEnabled . Load ( ) {
return nil
}
oooHead , err := NewOOOCompactionHead ( db . head )
if err != nil {
return errors . Wrap ( err , "get ooo compaction head" )
}
2022-09-20 10:05:50 -07:00
ulids , err := db . compactOOO ( db . dir , oooHead )
2022-06-22 04:45:21 -07:00
if err != nil {
return errors . Wrap ( err , "compact ooo head" )
}
if err := db . reloadBlocks ( ) ; err != nil {
errs := tsdb_errors . NewMulti ( err )
for _ , uid := range ulids {
if errRemoveAll := os . RemoveAll ( filepath . Join ( db . dir , uid . String ( ) ) ) ; errRemoveAll != nil {
errs . Add ( errRemoveAll )
}
}
return errors . Wrap ( errs . Err ( ) , "reloadBlocks blocks after failed compact ooo head" )
}
lastWBLFile , minOOOMmapRef := oooHead . LastWBLFile ( ) , oooHead . LastMmapRef ( )
if lastWBLFile != 0 || minOOOMmapRef != 0 {
if err := db . head . truncateOOO ( lastWBLFile , minOOOMmapRef ) ; err != nil {
return errors . Wrap ( err , "truncate ooo wbl" )
}
}
return nil
}
2022-09-20 10:05:50 -07:00
// compactOOO creates a new block per possible block range in dest from the OOO Head given.
// Each ULID in the result corresponds to a block in a unique time range.
//
// The OOO head's time span is split into windows of oooHead.ChunkRange()
// aligned to multiples of that range. Each written block's meta file is
// rewritten to mark it as an out-of-order block.
//
// On any error, all blocks written so far are removed from dest on a
// best-effort basis and a nil slice is returned together with the error. A nil
// slice and nil error are returned when no block was produced.
func (db *DB) compactOOO(dest string, oooHead *OOOCompactionHead) (_ []ulid.ULID, err error) {
	start := time.Now()

	blockSize := oooHead.ChunkRange()
	oooHeadMint, oooHeadMaxt := oooHead.MinTime(), oooHead.MaxTime()

	var ulids []ulid.ULID
	defer func() {
		if err == nil {
			return
		}
		// Best effort removal of created blocks on any error. The blocks were
		// written to dest, so that is where they have to be removed from.
		for _, uid := range ulids {
			_ = os.RemoveAll(filepath.Join(dest, uid.String()))
		}
	}()

	for t := blockSize * (oooHeadMint / blockSize); t <= oooHeadMaxt; t += blockSize {
		mint, maxt := t, t+blockSize
		// Block intervals are half-open: [b.MinTime, b.MaxTime), so the block's
		// max time is one past the last timestamp it may contain.
		uid, err := db.compactor.Write(dest, oooHead.CloneForTimeRange(mint, maxt-1), mint, maxt, nil)
		if err != nil {
			return nil, err
		}
		// A zero ULID means no block was written for this range.
		if uid.Compare(ulid.ULID{}) == 0 {
			continue
		}

		ulids = append(ulids, uid)
		blockDir := filepath.Join(dest, uid.String())

		meta, _, err := readMetaFile(blockDir)
		if err != nil {
			return nil, errors.Wrap(err, "read meta")
		}
		meta.Compaction.SetOutOfOrder()
		if _, err = writeMetaFile(db.logger, blockDir, meta); err != nil {
			return nil, errors.Wrap(err, "write meta")
		}
	}

	if len(ulids) == 0 {
		level.Info(db.logger).Log(
			"msg", "compact ooo head resulted in no blocks",
			"duration", time.Since(start),
		)
		return nil, nil
	}

	level.Info(db.logger).Log(
		"msg", "out-of-order compaction completed",
		"duration", time.Since(start),
		"ulids", fmt.Sprintf("%v", ulids),
	)
	return ulids, nil
}
2020-10-19 08:27:08 -07:00
// compactHead compacts the given RangeHead.
2020-02-14 01:50:24 -08:00
// The compaction mutex should be held before calling this method.
2020-10-19 08:27:08 -07:00
func ( db * DB ) compactHead ( head * RangeHead ) error {
uid , err := db . compactor . Write ( db . dir , head , head . MinTime ( ) , head . BlockMaxTime ( ) , nil )
2020-02-14 01:50:24 -08:00
if err != nil {
return errors . Wrap ( err , "persist head block" )
}
2020-10-19 08:27:08 -07:00
if err := db . reloadBlocks ( ) ; err != nil {
2020-10-14 02:35:24 -07:00
if errRemoveAll := os . RemoveAll ( filepath . Join ( db . dir , uid . String ( ) ) ) ; errRemoveAll != nil {
2020-10-28 08:24:58 -07:00
return tsdb_errors . NewMulti (
errors . Wrap ( err , "reloadBlocks blocks" ) ,
errors . Wrapf ( errRemoveAll , "delete persisted head block after failed db reloadBlocks:%s" , uid ) ,
) . Err ( )
2017-08-09 02:10:29 -07:00
}
2020-10-19 08:27:08 -07:00
return errors . Wrap ( err , "reloadBlocks blocks" )
2020-02-14 01:50:24 -08:00
}
2020-10-19 08:27:08 -07:00
if err = db . head . truncateMemory ( head . BlockMaxTime ( ) ) ; err != nil {
return errors . Wrap ( err , "head memory truncate" )
2017-03-04 07:50:48 -08:00
}
2020-02-14 01:50:24 -08:00
return nil
}
2017-01-06 03:37:28 -08:00
2020-02-14 01:50:24 -08:00
// compactBlocks compacts all the eligible on-disk blocks.
// The compaction mutex should be held before calling this method.
func ( db * DB ) compactBlocks ( ) ( err error ) {
2017-03-02 00:13:29 -08:00
// Check for compactions of multiple blocks.
for {
2017-08-09 02:10:29 -07:00
plan , err := db . compactor . Plan ( db . dir )
2017-03-02 00:13:29 -08:00
if err != nil {
2018-09-20 23:24:01 -07:00
return errors . Wrap ( err , "plan compaction" )
2017-03-02 00:13:29 -08:00
}
2017-08-09 02:10:29 -07:00
if len ( plan ) == 0 {
2017-03-21 04:21:02 -07:00
break
}
2017-01-06 03:37:28 -08:00
2017-03-02 00:13:29 -08:00
select {
case <- db . stopc :
2018-09-20 23:24:01 -07:00
return nil
2017-03-02 00:13:29 -08:00
default :
}
2017-03-20 02:41:43 -07:00
2019-01-29 00:26:01 -08:00
uid , err := db . compactor . Compact ( db . dir , plan , db . blocks )
if err != nil {
2018-09-20 23:24:01 -07:00
return errors . Wrapf ( err , "compact %s" , plan )
2017-08-09 02:10:29 -07:00
}
2017-08-28 15:39:17 -07:00
2020-10-19 08:27:08 -07:00
if err := db . reloadBlocks ( ) ; err != nil {
2019-01-29 00:26:01 -08:00
if err := os . RemoveAll ( filepath . Join ( db . dir , uid . String ( ) ) ) ; err != nil {
2020-10-19 08:27:08 -07:00
return errors . Wrapf ( err , "delete compacted block after failed db reloadBlocks:%s" , uid )
2019-01-29 00:26:01 -08:00
}
2020-10-19 08:27:08 -07:00
return errors . Wrap ( err , "reloadBlocks blocks" )
2017-08-28 15:39:17 -07:00
}
2017-02-23 01:50:22 -08:00
}
2018-09-20 23:24:01 -07:00
return nil
2017-02-09 17:54:26 -08:00
}
2019-07-23 01:04:48 -07:00
// getBlock iterates a given block range to find a block by a given id.
// If found it returns the block itself and a boolean to indicate that it was found.
func getBlock ( allBlocks [ ] * Block , id ulid . ULID ) ( * Block , bool ) {
for _ , b := range allBlocks {
2017-05-18 07:09:30 -07:00
if b . Meta ( ) . ULID == id {
2017-03-20 00:41:56 -07:00
return b , true
}
}
return nil , false
}
2020-10-19 08:27:08 -07:00
// reload reloads blocks and truncates the head and its WAL.
func ( db * DB ) reload ( ) error {
if err := db . reloadBlocks ( ) ; err != nil {
return errors . Wrap ( err , "reloadBlocks" )
}
2022-06-27 07:56:25 -07:00
maxt , ok := db . inOrderBlocksMaxTime ( )
if ! ok {
2020-10-19 08:27:08 -07:00
return nil
}
2022-06-27 07:56:25 -07:00
if err := db . head . Truncate ( maxt ) ; err != nil {
2020-10-19 08:27:08 -07:00
return errors . Wrap ( err , "head truncate" )
}
return nil
}
// reloadBlocks reloads blocks without touching head.
2018-06-27 06:47:11 -07:00
// Blocks that are obsolete due to replacement or retention will be deleted.
2020-10-19 08:27:08 -07:00
func ( db * DB ) reloadBlocks ( ) ( err error ) {
2017-08-30 08:38:25 -07:00
defer func ( ) {
2017-05-26 06:13:03 -07:00
if err != nil {
db . metrics . reloadsFailed . Inc ( )
}
db . metrics . reloads . Inc ( )
2017-08-30 08:38:25 -07:00
} ( )
2017-05-26 06:13:03 -07:00
2021-02-16 21:32:43 -08:00
// Now that we reload TSDB every minute, there is high chance for race condition with a reload
// triggered by CleanTombstones(). We need to lock the reload to avoid the situation where
// a normal reload and CleanTombstones try to delete the same block.
db . mtx . Lock ( )
defer db . mtx . Unlock ( )
2023-03-21 22:40:11 -07:00
loadable , corrupted , err := openBlocks ( db . logger , db . dir , db . blocks , db . chunkPool , db . opts . SeriesHashCache , db . opts . BlockPostingsForMatchersCacheTTL , db . opts . BlockPostingsForMatchersCacheSize , db . opts . BlockPostingsForMatchersCacheForce )
2017-03-02 00:13:29 -08:00
if err != nil {
2019-01-16 02:03:52 -08:00
return err
}
2018-06-27 09:05:21 -07:00
2020-07-22 08:19:33 -07:00
deletableULIDs := db . blocksToDelete ( loadable )
deletable := make ( map [ ulid . ULID ] * Block , len ( deletableULIDs ) )
2019-01-16 02:03:52 -08:00
2020-08-11 07:53:23 -07:00
// Mark all parents of loaded blocks as deletable (no matter if they exists). This makes it resilient against the process
// crashing towards the end of a compaction but before deletions. By doing that, we can pick up the deletion where it left off during a crash.
2019-01-16 02:03:52 -08:00
for _ , block := range loadable {
2020-07-22 08:19:33 -07:00
if _ , ok := deletableULIDs [ block . meta . ULID ] ; ok {
deletable [ block . meta . ULID ] = block
}
2019-01-16 02:03:52 -08:00
for _ , b := range block . Meta ( ) . Compaction . Parents {
2020-08-11 07:53:23 -07:00
if _ , ok := corrupted [ b . ULID ] ; ok {
delete ( corrupted , b . ULID )
level . Warn ( db . logger ) . Log ( "msg" , "Found corrupted block, but replaced by compacted one so it's safe to delete. This should not happen with atomic deletes." , "block" , b . ULID )
}
2019-01-16 02:03:52 -08:00
deletable [ b . ULID ] = nil
2017-02-09 17:54:26 -08:00
}
2019-01-16 02:03:52 -08:00
}
2020-08-11 07:53:23 -07:00
2019-01-16 02:03:52 -08:00
if len ( corrupted ) > 0 {
2020-08-11 07:53:23 -07:00
// Corrupted but no child loaded for it.
2019-01-30 01:40:40 -08:00
// Close all new blocks to release the lock for windows.
for _ , block := range loadable {
2019-07-23 01:04:48 -07:00
if _ , open := getBlock ( db . blocks , block . Meta ( ) . ULID ) ; ! open {
2019-01-30 01:40:40 -08:00
block . Close ( )
}
}
2020-10-28 08:24:58 -07:00
errs := tsdb_errors . NewMulti ( )
2020-06-17 07:40:00 -07:00
for ulid , err := range corrupted {
2020-10-28 08:24:58 -07:00
errs . Add ( errors . Wrapf ( err , "corrupted block %s" , ulid . String ( ) ) )
2020-06-17 07:40:00 -07:00
}
2020-10-28 08:24:58 -07:00
return errs . Err ( )
2019-01-16 02:03:52 -08:00
}
var (
2020-08-11 07:53:23 -07:00
toLoad [ ] * Block
2019-01-16 02:03:52 -08:00
blocksSize int64
)
2020-08-11 07:53:23 -07:00
// All deletable blocks should be unloaded.
// NOTE: We need to loop through loadable one more time as there might be loadable ready to be removed (replaced by compacted block).
2019-01-16 02:03:52 -08:00
for _ , block := range loadable {
if _ , ok := deletable [ block . Meta ( ) . ULID ] ; ok {
deletable [ block . Meta ( ) . ULID ] = block
2017-11-03 12:34:21 -07:00
continue
}
2019-01-16 02:03:52 -08:00
2020-08-11 07:53:23 -07:00
toLoad = append ( toLoad , block )
blocksSize += block . Size ( )
2019-01-16 02:03:52 -08:00
}
db . metrics . blocksBytes . Set ( float64 ( blocksSize ) )
2020-08-11 07:53:23 -07:00
sort . Slice ( toLoad , func ( i , j int ) bool {
return toLoad [ i ] . Meta ( ) . MinTime < toLoad [ j ] . Meta ( ) . MinTime
2019-01-16 02:03:52 -08:00
} )
// Swap new blocks first for subsequently created readers to be seen.
oldBlocks := db . blocks
2020-08-11 07:53:23 -07:00
db . blocks = toLoad
2019-01-16 02:03:52 -08:00
2020-08-11 07:53:23 -07:00
blockMetas := make ( [ ] BlockMeta , 0 , len ( toLoad ) )
for _ , b := range toLoad {
2019-02-14 05:29:41 -08:00
blockMetas = append ( blockMetas , b . Meta ( ) )
}
if overlaps := OverlappingBlocks ( blockMetas ) ; len ( overlaps ) > 0 {
2022-08-02 00:14:12 -07:00
level . Debug ( db . logger ) . Log ( "msg" , "Overlapping blocks found during reloadBlocks" , "detail" , overlaps . String ( ) )
2019-02-14 05:29:41 -08:00
}
2020-08-11 07:53:23 -07:00
// Append blocks to old, deletable blocks, so we can close them.
2019-01-16 02:03:52 -08:00
for _ , b := range oldBlocks {
if _ , ok := deletable [ b . Meta ( ) . ULID ] ; ok {
deletable [ b . Meta ( ) . ULID ] = b
2018-06-27 09:05:21 -07:00
}
}
2019-01-16 02:03:52 -08:00
if err := db . deleteBlocks ( deletable ) ; err != nil {
2020-10-19 08:27:08 -07:00
return errors . Wrapf ( err , "delete %v blocks" , len ( deletable ) )
2019-01-16 02:03:52 -08:00
}
2020-10-19 08:27:08 -07:00
return nil
2019-01-16 02:03:52 -08:00
}
2023-03-21 22:40:11 -07:00
func openBlocks ( l log . Logger , dir string , loaded [ ] * Block , chunkPool chunkenc . Pool , cache * hashcache . SeriesHashCache , postingsCacheTTL time . Duration , postingsCacheSize int , postingsCacheForce bool ) ( blocks [ ] * Block , corrupted map [ ulid . ULID ] error , err error ) {
2019-07-23 01:04:48 -07:00
bDirs , err := blockDirs ( dir )
2019-01-16 02:03:52 -08:00
if err != nil {
return nil , nil , errors . Wrap ( err , "find blocks" )
}
corrupted = make ( map [ ulid . ULID ] error )
2019-07-23 01:04:48 -07:00
for _ , bDir := range bDirs {
meta , _ , err := readMetaFile ( bDir )
2018-06-27 06:47:11 -07:00
if err != nil {
2020-10-19 08:27:08 -07:00
level . Error ( l ) . Log ( "msg" , "Failed to read meta.json for a block during reloadBlocks. Skipping" , "dir" , bDir , "err" , err )
2018-06-27 06:47:11 -07:00
continue
}
2019-01-16 02:03:52 -08:00
2018-06-27 06:47:11 -07:00
// See if we already have the block in memory or open it otherwise.
2019-07-23 01:04:48 -07:00
block , open := getBlock ( loaded , meta . ULID )
if ! open {
2021-08-17 06:31:08 -07:00
var cacheProvider index . ReaderCacheProvider
if cache != nil {
cacheProvider = cache . GetBlockCacheProvider ( meta . ULID . String ( ) )
}
2023-03-21 22:40:11 -07:00
block , err = OpenBlockWithOptions ( l , bDir , chunkPool , cacheProvider , postingsCacheTTL , postingsCacheSize , postingsCacheForce )
2017-05-18 07:09:30 -07:00
if err != nil {
2019-01-16 02:03:52 -08:00
corrupted [ meta . ULID ] = err
continue
2017-03-02 00:13:29 -08:00
}
}
2019-01-16 02:03:52 -08:00
blocks = append ( blocks , block )
2016-12-09 01:00:14 -08:00
}
2019-01-16 02:03:52 -08:00
return blocks , corrupted , nil
}
2020-07-22 08:19:33 -07:00
// DefaultBlocksToDelete builds the default deletion filter for db. The returned
// function forwards the given blocks to deletableBlocks, which applies the
// time based and size based retention configured in the db options.
func DefaultBlocksToDelete(db *DB) BlocksToDeleteFunc {
	filter := func(candidates []*Block) map[ulid.ULID]struct{} {
		return deletableBlocks(db, candidates)
	}
	return filter
}
2020-08-10 22:56:08 -07:00
// deletableBlocks returns all currently loaded blocks past retention policy or already compacted into a new block.
2020-07-22 08:19:33 -07:00
func deletableBlocks ( db * DB , blocks [ ] * Block ) map [ ulid . ULID ] struct { } {
deletable := make ( map [ ulid . ULID ] struct { } )
2019-01-16 02:03:52 -08:00
// Sort the blocks by time - newest to oldest (largest to smallest timestamp).
// This ensures that the retentions will remove the oldest blocks.
2018-05-28 13:00:36 -07:00
sort . Slice ( blocks , func ( i , j int ) bool {
2019-01-16 02:03:52 -08:00
return blocks [ i ] . Meta ( ) . MaxTime > blocks [ j ] . Meta ( ) . MaxTime
2018-05-28 13:00:36 -07:00
} )
2019-01-16 02:03:52 -08:00
2019-01-18 00:35:16 -08:00
for _ , block := range blocks {
if block . Meta ( ) . Compaction . Deletable {
2020-07-22 08:19:33 -07:00
deletable [ block . Meta ( ) . ULID ] = struct { } { }
2019-01-18 00:35:16 -08:00
}
2017-05-18 07:09:30 -07:00
}
2017-05-26 04:01:45 -07:00
2020-07-22 08:19:33 -07:00
for ulid := range BeyondTimeRetention ( db , blocks ) {
deletable [ ulid ] = struct { } { }
2017-05-18 07:09:30 -07:00
}
2020-07-22 08:19:33 -07:00
for ulid := range BeyondSizeRetention ( db , blocks ) {
deletable [ ulid ] = struct { } { }
2019-01-16 02:03:52 -08:00
}
2017-05-18 07:09:30 -07:00
2019-01-16 02:03:52 -08:00
return deletable
}
2020-07-22 08:19:33 -07:00
// BeyondTimeRetention returns those blocks which are beyond the time retention
// set in the db options.
func BeyondTimeRetention ( db * DB , blocks [ ] * Block ) ( deletable map [ ulid . ULID ] struct { } ) {
2019-01-16 02:03:52 -08:00
// Time retention is disabled or no blocks to work with.
2020-10-19 04:21:54 -07:00
if len ( blocks ) == 0 || db . opts . RetentionDuration == 0 {
2019-01-16 02:03:52 -08:00
return
}
2020-07-22 08:19:33 -07:00
deletable = make ( map [ ulid . ULID ] struct { } )
2019-01-16 02:03:52 -08:00
for i , block := range blocks {
// The difference between the first block and this block is larger than
2020-01-02 06:54:09 -08:00
// the retention period so any blocks after that are added as deletable.
2020-02-11 08:34:09 -08:00
if i > 0 && blocks [ 0 ] . Meta ( ) . MaxTime - block . Meta ( ) . MaxTime > db . opts . RetentionDuration {
2019-01-16 02:03:52 -08:00
for _ , b := range blocks [ i : ] {
2020-07-22 08:19:33 -07:00
deletable [ b . meta . ULID ] = struct { } { }
2019-01-16 02:03:52 -08:00
}
db . metrics . timeRetentionCount . Inc ( )
break
2017-11-03 12:34:21 -07:00
}
2019-01-16 02:03:52 -08:00
}
2020-01-02 06:54:09 -08:00
return deletable
2019-01-16 02:03:52 -08:00
}
2020-07-22 08:19:33 -07:00
// BeyondSizeRetention returns those blocks which are beyond the size retention
// set in the db options.
func BeyondSizeRetention ( db * DB , blocks [ ] * Block ) ( deletable map [ ulid . ULID ] struct { } ) {
2019-01-16 02:03:52 -08:00
// Size retention is disabled or no blocks to work with.
2020-10-19 04:21:54 -07:00
if len ( blocks ) == 0 || db . opts . MaxBytes <= 0 {
2019-01-16 02:03:52 -08:00
return
}
2020-07-22 08:19:33 -07:00
deletable = make ( map [ ulid . ULID ] struct { } )
2019-11-11 18:40:16 -08:00
2020-05-06 08:30:00 -07:00
// Initializing size counter with WAL size and Head chunks
// written to disk, as that is part of the retention strategy.
2020-10-12 14:15:40 -07:00
blocksSize := db . Head ( ) . Size ( )
2019-01-16 02:03:52 -08:00
for i , block := range blocks {
blocksSize += block . Size ( )
2020-02-06 07:58:38 -08:00
if blocksSize > int64 ( db . opts . MaxBytes ) {
2019-01-16 02:03:52 -08:00
// Add this and all following blocks for deletion.
for _ , b := range blocks [ i : ] {
2020-07-22 08:19:33 -07:00
deletable [ b . meta . ULID ] = struct { } { }
2019-01-16 02:03:52 -08:00
}
db . metrics . sizeRetentionCount . Inc ( )
break
2017-11-03 12:34:21 -07:00
}
2018-06-27 06:47:11 -07:00
}
2020-01-02 06:54:09 -08:00
return deletable
2019-01-16 02:03:52 -08:00
}
2020-08-11 07:53:23 -07:00
// deleteBlocks closes the block if loaded and deletes blocks from the disk if exists.
2019-01-16 02:03:52 -08:00
// When the map contains a non nil block object it means it is loaded in memory
// so needs to be closed first as it might need to wait for pending readers to complete.
func ( db * DB ) deleteBlocks ( blocks map [ ulid . ULID ] * Block ) error {
for ulid , block := range blocks {
if block != nil {
if err := block . Close ( ) ; err != nil {
2020-06-17 07:40:00 -07:00
level . Warn ( db . logger ) . Log ( "msg" , "Closing block failed" , "err" , err , "block" , ulid )
2019-01-16 02:03:52 -08:00
}
}
2020-08-10 22:56:08 -07:00
2020-08-11 07:53:23 -07:00
toDelete := filepath . Join ( db . dir , ulid . String ( ) )
if _ , err := os . Stat ( toDelete ) ; os . IsNotExist ( err ) {
// Noop.
continue
} else if err != nil {
return errors . Wrapf ( err , "stat dir %v" , toDelete )
}
// Replace atomically to avoid partial block when process would crash during deletion.
2020-08-10 22:56:08 -07:00
tmpToDelete := filepath . Join ( db . dir , fmt . Sprintf ( "%s%s" , ulid , tmpForDeletionBlockDirSuffix ) )
2020-08-11 07:53:23 -07:00
if err := fileutil . Replace ( toDelete , tmpToDelete ) ; err != nil {
2020-08-10 22:56:08 -07:00
return errors . Wrapf ( err , "replace of obsolete block for deletion %s" , ulid )
}
if err := os . RemoveAll ( tmpToDelete ) ; err != nil {
2018-06-27 06:47:11 -07:00
return errors . Wrapf ( err , "delete obsolete block %s" , ulid )
2017-10-23 11:30:03 -07:00
}
2020-08-11 07:53:23 -07:00
level . Info ( db . logger ) . Log ( "msg" , "Deleting obsolete block" , "block" , ulid )
2017-10-23 11:30:03 -07:00
}
2020-08-10 22:56:08 -07:00
2019-01-16 02:03:52 -08:00
return nil
2017-05-18 07:09:30 -07:00
}
2017-01-03 06:43:26 -08:00
2018-04-05 06:15:24 -07:00
// TimeRange describes a time interval by its minimum and maximum time.
type TimeRange struct {
	// Min is the lower bound of the range.
	Min int64
	// Max is the upper bound of the range.
	Max int64
}
2018-04-05 05:51:33 -07:00
2018-04-05 06:15:24 -07:00
// Overlaps contains overlapping blocks aggregated by overlapping range.
type Overlaps map [ TimeRange ] [ ] BlockMeta
// String returns human readable string form of overlapped blocks.
func ( o Overlaps ) String ( ) string {
var res [ ] string
for r , overlaps := range o {
var groups [ ] string
for _ , m := range overlaps {
groups = append ( groups , fmt . Sprintf (
2018-04-05 08:53:24 -07:00
"<ulid: %s, mint: %d, maxt: %d, range: %s>" ,
2018-04-05 06:15:24 -07:00
m . ULID . String ( ) ,
m . MinTime ,
m . MaxTime ,
( time . Duration ( ( m . MaxTime - m . MinTime ) / 1000 ) * time . Second ) . String ( ) ,
) )
}
2018-04-05 08:01:16 -07:00
res = append ( res , fmt . Sprintf (
2018-04-05 08:53:24 -07:00
"[mint: %d, maxt: %d, range: %s, blocks: %d]: %s" ,
2018-04-05 08:01:16 -07:00
r . Min , r . Max ,
( time . Duration ( ( r . Max - r . Min ) / 1000 ) * time . Second ) . String ( ) ,
len ( overlaps ) ,
2018-04-05 08:53:24 -07:00
strings . Join ( groups , ", " ) ) ,
2018-04-05 08:01:16 -07:00
)
2018-04-05 06:15:24 -07:00
}
2018-04-05 08:01:16 -07:00
return strings . Join ( res , "\n" )
2018-04-05 06:15:24 -07:00
}
// OverlappingBlocks returns all overlapping blocks from given meta files.
func OverlappingBlocks ( bm [ ] BlockMeta ) Overlaps {
2018-03-28 10:33:41 -07:00
if len ( bm ) <= 1 {
2018-03-28 07:50:52 -07:00
return nil
}
2018-03-28 15:50:42 -07:00
var (
2018-04-05 05:51:33 -07:00
overlaps [ ] [ ] BlockMeta
2018-03-28 15:50:42 -07:00
// pending contains not ended blocks in regards to "current" timestamp.
pending = [ ] BlockMeta { bm [ 0 ] }
2018-03-29 04:50:46 -07:00
// continuousPending helps to aggregate same overlaps to single group.
continuousPending = true
2018-03-28 15:50:42 -07:00
)
2018-04-05 06:15:24 -07:00
// We have here blocks sorted by minTime. We iterate over each block and treat its minTime as our "current" timestamp.
// We check if any of the pending block finished (blocks that we have seen before, but their maxTime was still ahead current
// timestamp). If not, it means they overlap with our current block. In the same time current block is assumed pending.
2018-03-28 15:18:24 -07:00
for _ , b := range bm [ 1 : ] {
2018-03-28 15:50:42 -07:00
var newPending [ ] BlockMeta
2018-03-28 10:33:41 -07:00
2018-03-28 15:18:24 -07:00
for _ , p := range pending {
2018-03-28 15:50:42 -07:00
// "b.MinTime" is our current time.
2018-03-28 15:18:24 -07:00
if b . MinTime >= p . MaxTime {
2018-03-29 04:50:46 -07:00
continuousPending = false
2018-03-28 15:18:24 -07:00
continue
2018-03-28 10:33:41 -07:00
}
2018-03-28 15:18:24 -07:00
// "p" overlaps with "b" and "p" is still pending.
newPending = append ( newPending , p )
2017-05-18 07:09:30 -07:00
}
2018-03-28 15:50:42 -07:00
2018-03-28 15:18:24 -07:00
// Our block "b" is now pending.
pending = append ( newPending , b )
if len ( newPending ) == 0 {
2018-03-28 15:50:42 -07:00
// No overlaps.
2018-03-28 15:18:24 -07:00
continue
2018-03-28 10:33:41 -07:00
}
2018-03-29 04:50:46 -07:00
if continuousPending && len ( overlaps ) > 0 {
2018-03-28 15:18:24 -07:00
overlaps [ len ( overlaps ) - 1 ] = append ( overlaps [ len ( overlaps ) - 1 ] , b )
2018-03-28 10:33:41 -07:00
continue
2017-05-18 07:09:30 -07:00
}
2018-03-28 15:18:24 -07:00
overlaps = append ( overlaps , append ( newPending , b ) )
2018-03-29 04:50:46 -07:00
// Start new pendings.
continuousPending = true
2017-05-18 07:09:30 -07:00
}
2018-04-05 05:51:33 -07:00
// Fetch the critical overlapped time range foreach overlap groups.
2018-04-05 06:15:24 -07:00
overlapGroups := Overlaps { }
2018-04-05 05:51:33 -07:00
for _ , overlap := range overlaps {
minRange := TimeRange { Min : 0 , Max : math . MaxInt64 }
for _ , b := range overlap {
if minRange . Max > b . MaxTime {
minRange . Max = b . MaxTime
}
if minRange . Min < b . MinTime {
minRange . Min = b . MinTime
}
}
overlapGroups [ minRange ] = overlap
}
return overlapGroups
2017-01-02 13:24:35 -08:00
}
2017-10-09 06:21:46 -07:00
// String implements fmt.Stringer. It always returns the fixed label "HEAD".
func (db *DB) String() string {
	const label = "HEAD"
	return label
}
// Blocks returns the databases persisted blocks.
func ( db * DB ) Blocks ( ) [ ] * Block {
2017-08-29 06:39:27 -07:00
db . mtx . RLock ( )
defer db . mtx . RUnlock ( )
return db . blocks
}
2022-06-27 07:56:25 -07:00
// inOrderBlocksMaxTime returns the max time among the blocks that were not totally created
// out of out-of-order data. If the returned boolean is true, it means there is at least
// one such block.
func ( db * DB ) inOrderBlocksMaxTime ( ) ( maxt int64 , ok bool ) {
2022-09-20 10:05:50 -07:00
maxt , ok = int64 ( math . MinInt64 ) , false
2022-06-27 07:56:25 -07:00
// If blocks are overlapping, last block might not have the max time. So check all blocks.
for _ , b := range db . Blocks ( ) {
2022-10-05 08:01:08 -07:00
if ! b . meta . OutOfOrder && ! b . meta . Compaction . FromOutOfOrder ( ) && b . meta . MaxTime > maxt {
2022-06-27 07:56:25 -07:00
ok = true
maxt = b . meta . MaxTime
}
}
return maxt , ok
}
2017-10-09 06:21:46 -07:00
// Head returns the databases's head.
2017-09-25 07:45:24 -07:00
func ( db * DB ) Head ( ) * Head {
return db . head
}
2017-01-05 23:08:02 -08:00
// Close the partition.
2017-01-06 02:40:09 -08:00
func ( db * DB ) Close ( ) error {
2017-01-06 03:37:28 -08:00
close ( db . stopc )
2020-10-28 03:09:03 -07:00
if db . compactCancel != nil {
db . compactCancel ( )
}
2017-01-06 03:37:28 -08:00
<- db . donec
2017-01-06 02:40:09 -08:00
db . mtx . Lock ( )
2017-03-17 04:12:50 -07:00
defer db . mtx . Unlock ( )
2017-03-04 07:50:48 -08:00
2017-03-06 03:13:15 -08:00
var g errgroup . Group
2017-01-02 01:34:55 -08:00
2017-03-20 00:41:56 -07:00
// blocks also contains all head blocks.
for _ , pb := range db . blocks {
2017-03-06 03:13:15 -08:00
g . Go ( pb . Close )
2016-12-14 23:31:26 -08:00
}
2021-11-11 08:45:25 -08:00
errs := tsdb_errors . NewMulti ( g . Wait ( ) , db . locker . Release ( ) )
2020-10-21 08:08:28 -07:00
if db . head != nil {
2020-10-28 08:24:58 -07:00
errs . Add ( db . head . Close ( ) )
2020-10-21 08:08:28 -07:00
}
2020-10-28 08:24:58 -07:00
return errs . Err ( )
2016-12-09 01:00:14 -08:00
}
2018-11-20 02:34:26 -08:00
// DisableCompactions disables auto compactions.
2017-06-06 11:15:23 -07:00
func ( db * DB ) DisableCompactions ( ) {
2018-11-20 02:34:26 -08:00
db . autoCompactMtx . Lock ( )
defer db . autoCompactMtx . Unlock ( )
2017-07-14 01:06:07 -07:00
2018-11-20 02:34:26 -08:00
db . autoCompact = false
2020-04-11 01:22:18 -07:00
level . Info ( db . logger ) . Log ( "msg" , "Compactions disabled" )
2017-06-06 07:53:20 -07:00
}
2018-11-20 02:34:26 -08:00
// EnableCompactions enables auto compactions.
2017-06-06 11:15:23 -07:00
func ( db * DB ) EnableCompactions ( ) {
2018-11-20 02:34:26 -08:00
db . autoCompactMtx . Lock ( )
defer db . autoCompactMtx . Unlock ( )
2017-07-14 01:06:07 -07:00
2018-11-20 02:34:26 -08:00
db . autoCompact = true
2020-04-11 01:22:18 -07:00
level . Info ( db . logger ) . Log ( "msg" , "Compactions enabled" )
2017-06-05 01:18:31 -07:00
}
2018-02-28 03:04:55 -08:00
// Snapshot writes the current data to the directory. If withHead is set to true it
// will create a new block containing all data that's currently in the memory buffer/WAL.
func ( db * DB ) Snapshot ( dir string , withHead bool ) error {
2017-08-30 09:34:54 -07:00
if dir == db . dir {
return errors . Errorf ( "cannot snapshot into base directory" )
}
2019-03-18 07:14:10 -07:00
if _ , err := ulid . ParseStrict ( dir ) ; err == nil {
2017-08-30 09:34:54 -07:00
return errors . Errorf ( "dir must not be a valid ULID" )
}
2017-06-05 01:18:31 -07:00
2017-08-30 09:34:54 -07:00
db . cmtx . Lock ( )
defer db . cmtx . Unlock ( )
2017-10-23 11:30:03 -07:00
db . mtx . RLock ( )
defer db . mtx . RUnlock ( )
for _ , b := range db . blocks {
2020-04-11 01:22:18 -07:00
level . Info ( db . logger ) . Log ( "msg" , "Snapshotting block" , "block" , b )
2017-08-30 09:34:54 -07:00
if err := b . Snapshot ( dir ) ; err != nil {
2017-11-22 04:28:06 -08:00
return errors . Wrapf ( err , "error snapshotting block: %s" , b . Dir ( ) )
2017-08-30 09:34:54 -07:00
}
}
2018-02-28 03:04:55 -08:00
if ! withHead {
return nil
}
2019-07-03 03:47:31 -07:00
mint := db . head . MinTime ( )
maxt := db . head . MaxTime ( )
2020-08-13 02:55:35 -07:00
head := NewRangeHead ( db . head , mint , maxt )
2019-07-03 03:47:31 -07:00
// Add +1 millisecond to block maxt because block intervals are half-open: [b.MinTime, b.MaxTime).
// Because of this block intervals are always +1 than the total samples it includes.
if _ , err := db . compactor . Write ( dir , head , mint , maxt + 1 , nil ) ; err != nil {
return errors . Wrap ( err , "snapshot head block" )
}
return nil
2017-08-28 15:39:17 -07:00
}
2017-07-14 00:00:22 -07:00
2017-08-28 15:39:17 -07:00
// Querier returns a new querier over the data partition for the given time range.
2020-02-06 07:58:38 -08:00
func ( db * DB ) Querier ( _ context . Context , mint , maxt int64 ) ( storage . Querier , error ) {
2017-10-09 06:21:46 -07:00
var blocks [ ] BlockReader
2017-08-28 15:39:17 -07:00
2017-10-23 11:30:03 -07:00
db . mtx . RLock ( )
defer db . mtx . RUnlock ( )
for _ , b := range db . blocks {
2018-07-02 01:23:36 -07:00
if b . OverlapsClosedInterval ( mint , maxt ) {
2017-10-09 06:21:46 -07:00
blocks = append ( blocks , b )
}
}
2022-06-22 04:45:21 -07:00
var inOrderHeadQuerier storage . Querier
2017-10-09 06:21:46 -07:00
if maxt >= db . head . MinTime ( ) {
2021-07-20 01:47:20 -07:00
rh := NewRangeHead ( db . head , mint , maxt )
var err error
2022-06-22 04:45:21 -07:00
inOrderHeadQuerier , err = NewBlockQuerier ( rh , mint , maxt )
2021-07-20 01:47:20 -07:00
if err != nil {
2022-06-22 04:45:21 -07:00
return nil , errors . Wrapf ( err , "open block querier for head %s" , rh )
2021-07-20 01:47:20 -07:00
}
// Getting the querier above registers itself in the queue that the truncation waits on.
// So if the querier is currently not colliding with any truncation, we can continue to use it and still
// won't run into a race later since any truncation that comes after will wait on this querier if it overlaps.
shouldClose , getNew , newMint := db . head . IsQuerierCollidingWithTruncation ( mint , maxt )
if shouldClose {
2022-06-22 04:45:21 -07:00
if err := inOrderHeadQuerier . Close ( ) ; err != nil {
return nil , errors . Wrapf ( err , "closing head block querier %s" , rh )
2021-07-20 01:47:20 -07:00
}
2022-06-22 04:45:21 -07:00
inOrderHeadQuerier = nil
2021-07-20 01:47:20 -07:00
}
if getNew {
rh := NewRangeHead ( db . head , newMint , maxt )
2022-06-22 04:45:21 -07:00
inOrderHeadQuerier , err = NewBlockQuerier ( rh , newMint , maxt )
2021-07-20 01:47:20 -07:00
if err != nil {
2022-06-22 04:45:21 -07:00
return nil , errors . Wrapf ( err , "open block querier for head while getting new querier %s" , rh )
2021-07-20 01:47:20 -07:00
}
}
2017-10-09 06:21:46 -07:00
}
2017-08-28 15:39:17 -07:00
2022-06-22 04:45:21 -07:00
var outOfOrderHeadQuerier storage . Querier
if overlapsClosedInterval ( mint , maxt , db . head . MinOOOTime ( ) , db . head . MaxOOOTime ( ) ) {
rh := NewOOORangeHead ( db . head , mint , maxt )
var err error
outOfOrderHeadQuerier , err = NewBlockQuerier ( rh , mint , maxt )
if err != nil {
return nil , errors . Wrapf ( err , "open block querier for ooo head %s" , rh )
}
}
2020-02-06 07:58:38 -08:00
blockQueriers := make ( [ ] storage . Querier , 0 , len ( blocks ) )
2017-06-06 05:45:54 -07:00
for _ , b := range blocks {
2017-10-09 06:21:46 -07:00
q , err := NewBlockQuerier ( b , mint , maxt )
2017-10-23 11:30:03 -07:00
if err == nil {
2019-02-14 05:29:41 -08:00
blockQueriers = append ( blockQueriers , q )
2017-10-23 11:30:03 -07:00
continue
}
// If we fail, all previously opened queriers must be closed.
2019-02-14 05:29:41 -08:00
for _ , q := range blockQueriers {
2020-07-31 08:03:02 -07:00
// TODO(bwplotka): Handle error.
_ = q . Close ( )
2017-10-09 06:21:46 -07:00
}
2017-10-23 11:30:03 -07:00
return nil , errors . Wrapf ( err , "open querier for block %s" , b )
2017-10-09 06:21:46 -07:00
}
2022-06-22 04:45:21 -07:00
if inOrderHeadQuerier != nil {
blockQueriers = append ( blockQueriers , inOrderHeadQuerier )
}
if outOfOrderHeadQuerier != nil {
blockQueriers = append ( blockQueriers , outOfOrderHeadQuerier )
2021-07-20 01:47:20 -07:00
}
2020-07-31 08:03:02 -07:00
return storage . NewMergeQuerier ( blockQueriers , nil , storage . ChainedSeriesMerge ) , nil
}
2022-07-20 03:27:02 -07:00
// blockQueriersForRange returns individual block chunk queriers from the persistent blocks, in-order head block, and the
// out-of-order head block, overlapping with the given time range.
func ( db * DB ) blockChunkQuerierForRange ( mint , maxt int64 ) ( [ ] storage . ChunkQuerier , error ) {
2020-07-31 08:03:02 -07:00
var blocks [ ] BlockReader
2019-02-14 05:29:41 -08:00
2020-07-31 08:03:02 -07:00
db . mtx . RLock ( )
defer db . mtx . RUnlock ( )
for _ , b := range db . blocks {
if b . OverlapsClosedInterval ( mint , maxt ) {
blocks = append ( blocks , b )
}
}
2022-06-22 04:45:21 -07:00
var inOrderHeadQuerier storage . ChunkQuerier
2020-07-31 08:03:02 -07:00
if maxt >= db . head . MinTime ( ) {
2021-07-20 01:47:20 -07:00
rh := NewRangeHead ( db . head , mint , maxt )
var err error
2022-06-22 04:45:21 -07:00
inOrderHeadQuerier , err = NewBlockChunkQuerier ( rh , mint , maxt )
2021-07-20 01:47:20 -07:00
if err != nil {
return nil , errors . Wrapf ( err , "open querier for head %s" , rh )
}
// Getting the querier above registers itself in the queue that the truncation waits on.
// So if the querier is currently not colliding with any truncation, we can continue to use it and still
// won't run into a race later since any truncation that comes after will wait on this querier if it overlaps.
shouldClose , getNew , newMint := db . head . IsQuerierCollidingWithTruncation ( mint , maxt )
if shouldClose {
2022-06-22 04:45:21 -07:00
if err := inOrderHeadQuerier . Close ( ) ; err != nil {
2021-07-20 01:47:20 -07:00
return nil , errors . Wrapf ( err , "closing head querier %s" , rh )
}
2022-06-22 04:45:21 -07:00
inOrderHeadQuerier = nil
2021-07-20 01:47:20 -07:00
}
if getNew {
rh := NewRangeHead ( db . head , newMint , maxt )
2022-06-22 04:45:21 -07:00
inOrderHeadQuerier , err = NewBlockChunkQuerier ( rh , newMint , maxt )
2021-07-20 01:47:20 -07:00
if err != nil {
return nil , errors . Wrapf ( err , "open querier for head while getting new querier %s" , rh )
}
}
2019-02-14 05:29:41 -08:00
}
2022-06-22 04:45:21 -07:00
var outOfOrderHeadQuerier storage . ChunkQuerier
if overlapsClosedInterval ( mint , maxt , db . head . MinOOOTime ( ) , db . head . MaxOOOTime ( ) ) {
rh := NewOOORangeHead ( db . head , mint , maxt )
var err error
outOfOrderHeadQuerier , err = NewBlockChunkQuerier ( rh , mint , maxt )
if err != nil {
return nil , errors . Wrapf ( err , "open block chunk querier for ooo head %s" , rh )
}
}
2020-07-31 08:03:02 -07:00
blockQueriers := make ( [ ] storage . ChunkQuerier , 0 , len ( blocks ) )
for _ , b := range blocks {
q , err := NewBlockChunkQuerier ( b , mint , maxt )
if err == nil {
blockQueriers = append ( blockQueriers , q )
continue
}
// If we fail, all previously opened queriers must be closed.
for _ , q := range blockQueriers {
// TODO(bwplotka): Handle error.
_ = q . Close ( )
}
return nil , errors . Wrapf ( err , "open querier for block %s" , b )
}
2022-06-22 04:45:21 -07:00
if inOrderHeadQuerier != nil {
blockQueriers = append ( blockQueriers , inOrderHeadQuerier )
}
if outOfOrderHeadQuerier != nil {
blockQueriers = append ( blockQueriers , outOfOrderHeadQuerier )
2021-07-20 01:47:20 -07:00
}
2017-08-28 15:39:17 -07:00
2022-07-20 03:27:02 -07:00
return blockQueriers , nil
}
// ChunkQuerier returns a new chunk querier over the data partition for the given time range.
func ( db * DB ) ChunkQuerier ( _ context . Context , mint , maxt int64 ) ( storage . ChunkQuerier , error ) {
blockQueriers , err := db . blockChunkQuerierForRange ( mint , maxt )
if err != nil {
return nil , err
}
2020-07-31 08:03:02 -07:00
return storage . NewMergeChunkQuerier ( blockQueriers , nil , storage . NewCompactingChunkSeriesMerger ( storage . ChainedSeriesMerge ) ) , nil
2020-06-24 06:41:52 -07:00
}
2022-07-20 03:27:02 -07:00
// UnorderedChunkQuerier returns a new chunk querier over the data partition for the given time range.
// The chunk queriers obtained from blockChunkQuerierForRange are merged using
// a concatenating chunk series merger, so the returned chunks can be
// overlapping and not sorted.
func (db *DB) UnorderedChunkQuerier(_ context.Context, mint, maxt int64) (storage.ChunkQuerier, error) {
	queriers, err := db.blockChunkQuerierForRange(mint, maxt)
	if err != nil {
		return nil, err
	}

	merger := storage.NewConcatenatingChunkSeriesMerger()
	return storage.NewMergeChunkQuerier(queriers, nil, merger), nil
}
2021-03-16 02:47:45 -07:00
// ExemplarQuerier returns an exemplar querier. The call is delegated to the
// exemplar storage of the head.
func (db *DB) ExemplarQuerier(ctx context.Context) (storage.ExemplarQuerier, error) {
	exemplars := db.head.exemplars
	return exemplars.ExemplarQuerier(ctx)
}
2021-10-22 01:06:44 -07:00
// rangeForTimestamp returns the end of the width-sized range that t falls
// into: t is brought down to a multiple of width (integer division truncates
// towards zero) and one full width is added on top.
func rangeForTimestamp(t, width int64) (maxt int64) {
	// t - t%width equals (t/width)*width under Go's truncated division.
	aligned := t - t%width
	return aligned + width
}
2017-08-28 15:39:17 -07:00
// Delete implements deletion of metrics. It only has atomicity guarantees on a per-block basis.
2019-11-18 11:53:33 -08:00
func ( db * DB ) Delete ( mint , maxt int64 , ms ... * labels . Matcher ) error {
2017-05-20 00:51:10 -07:00
db . cmtx . Lock ( )
defer db . cmtx . Unlock ( )
2017-07-14 00:00:22 -07:00
2017-05-19 12:05:50 -07:00
var g errgroup . Group
2017-10-23 11:30:03 -07:00
db . mtx . RLock ( )
defer db . mtx . RUnlock ( )
for _ , b := range db . blocks {
2018-07-02 01:23:36 -07:00
if b . OverlapsClosedInterval ( mint , maxt ) {
2017-10-09 06:21:46 -07:00
g . Go ( func ( b * Block ) func ( ) error {
2017-08-28 15:39:17 -07:00
return func ( ) error { return b . Delete ( mint , maxt , ms ... ) }
} ( b ) )
}
}
2021-09-15 23:50:03 -07:00
if db . head . OverlapsClosedInterval ( mint , maxt ) {
g . Go ( func ( ) error {
return db . head . Delete ( mint , maxt , ms ... )
} )
}
2018-03-21 14:23:47 -07:00
return g . Wait ( )
2017-05-19 12:05:50 -07:00
}
2017-11-22 04:34:50 -08:00
// CleanTombstones re-writes any blocks with tombstones.
2018-05-30 19:09:30 -07:00
func ( db * DB ) CleanTombstones ( ) ( err error ) {
2017-11-22 04:34:50 -08:00
db . cmtx . Lock ( )
defer db . cmtx . Unlock ( )
start := time . Now ( )
2022-10-25 15:26:12 -07:00
defer func ( ) {
db . metrics . tombCleanTimer . Observe ( time . Since ( start ) . Seconds ( ) )
} ( )
2017-11-22 04:34:50 -08:00
2021-02-16 21:32:43 -08:00
cleanUpCompleted := false
// Repeat cleanup until there is no tombstones left.
for ! cleanUpCompleted {
cleanUpCompleted = true
for _ , pb := range db . Blocks ( ) {
uid , safeToDelete , cleanErr := pb . CleanTombstones ( db . Dir ( ) , db . compactor )
if cleanErr != nil {
return errors . Wrapf ( cleanErr , "clean tombstones: %s" , pb . Dir ( ) )
}
if ! safeToDelete {
// There was nothing to clean.
continue
}
// In case tombstones of the old block covers the whole block,
// then there would be no resultant block to tell the parent.
// The lock protects against race conditions when deleting blocks
// during an already running reload.
db . mtx . Lock ( )
pb . meta . Compaction . Deletable = safeToDelete
db . mtx . Unlock ( )
cleanUpCompleted = false
if err = db . reloadBlocks ( ) ; err == nil { // Will try to delete old block.
// Successful reload will change the existing blocks.
// We need to loop over the new set of blocks.
break
}
// Delete new block if it was created.
if uid != nil && * uid != ( ulid . ULID { } ) {
2018-05-30 19:09:30 -07:00
dir := filepath . Join ( db . Dir ( ) , uid . String ( ) )
if err := os . RemoveAll ( dir ) ; err != nil {
level . Error ( db . logger ) . Log ( "msg" , "failed to delete block after failed `CleanTombstones`" , "dir" , dir , "err" , err )
}
}
2021-02-16 21:32:43 -08:00
return errors . Wrap ( err , "reload blocks" )
2018-05-30 19:09:30 -07:00
}
2020-10-19 08:27:08 -07:00
}
return nil
2017-11-22 04:34:50 -08:00
}
2022-04-27 02:24:36 -07:00
func isBlockDir ( fi fs . DirEntry ) bool {
2017-01-19 02:22:47 -08:00
if ! fi . IsDir ( ) {
return false
}
2019-03-18 07:14:10 -07:00
_ , err := ulid . ParseStrict ( fi . Name ( ) )
2017-05-18 07:09:30 -07:00
return err == nil
2017-01-19 02:22:47 -08:00
}
2022-03-24 03:44:14 -07:00
// isTmpDir returns true if the given file-info contains a block ULID or checkpoint prefix and a tmp extension.
2022-04-27 02:24:36 -07:00
func isTmpDir ( fi fs . DirEntry ) bool {
2020-08-10 22:56:08 -07:00
if ! fi . IsDir ( ) {
return false
}
fn := fi . Name ( )
ext := filepath . Ext ( fn )
2021-01-09 01:02:26 -08:00
if ext == tmpForDeletionBlockDirSuffix || ext == tmpForCreationBlockDirSuffix || ext == tmpLegacy {
2022-03-24 03:44:14 -07:00
if strings . HasPrefix ( fn , "checkpoint." ) {
return true
}
2020-08-10 22:56:08 -07:00
if _ , err := ulid . ParseStrict ( fn [ : len ( fn ) - len ( ext ) ] ) ; err == nil {
return true
}
}
return false
}
2017-01-19 02:22:47 -08:00
func blockDirs ( dir string ) ( [ ] string , error ) {
2022-04-27 02:24:36 -07:00
files , err := os . ReadDir ( dir )
2017-01-19 02:22:47 -08:00
if err != nil {
return nil , err
}
var dirs [ ] string
2022-04-27 02:24:36 -07:00
for _ , f := range files {
if isBlockDir ( f ) {
dirs = append ( dirs , filepath . Join ( dir , f . Name ( ) ) )
2017-01-19 02:22:47 -08:00
}
}
return dirs , nil
2017-01-03 06:43:26 -08:00
}
2016-12-09 04:41:38 -08:00
2017-08-30 09:34:54 -07:00
// sequenceFiles returns the paths of all entries directly inside dir whose name
// parses as an unsigned base 10 integer. Each path is dir joined with the entry
// name; entries with any other name are ignored.
func sequenceFiles(dir string) ([]string, error) {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return nil, err
	}

	var res []string
	for _, entry := range entries {
		name := entry.Name()
		if _, parseErr := strconv.ParseUint(name, 10, 64); parseErr == nil {
			res = append(res, filepath.Join(dir, name))
		}
	}
	return res, nil
}
2017-08-30 09:34:54 -07:00
// nextSequenceFile returns the path and the number of the next sequence file
// in dir. The number is one more than the largest number found among the
// entries of dir whose name parses as an unsigned base 10 integer, or 1 if
// there is no such entry. The file name is the number zero-padded to at least
// six digits.
func nextSequenceFile(dir string) (string, int, error) {
	files, err := os.ReadDir(dir)
	if err != nil {
		return "", 0, err
	}

	i := uint64(0)
	for _, f := range files {
		j, err := strconv.ParseUint(f.Name(), 10, 64)
		if err != nil {
			continue
		}
		// os.ReadDir sorts by file name, which is not the numeric order once
		// names differ in length (e.g. "1000000" sorts before "999999").
		// Track the maximum instead of relying on the last entry, otherwise
		// an already existing sequence number could be handed out again.
		if j > i {
			i = j
		}
	}
	return filepath.Join(dir, fmt.Sprintf("%0.6d", i+1)), int(i + 1), nil
}
2016-12-09 04:41:38 -08:00
2017-08-28 15:39:17 -07:00
// exponential doubles d and clamps the result: values below min are raised to
// min, and anything above max is then cut down to max.
func exponential(d, min, max time.Duration) time.Duration {
	doubled := 2 * d
	if doubled < min {
		doubled = min
	}
	if doubled > max {
		return max
	}
	return doubled
}