2020-03-19 09:33:44 -07:00
// Copyright 2020 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package chunks
import (
"bufio"
"bytes"
"encoding/binary"
"hash"
"io"
"io/ioutil"
"os"
"path/filepath"
"sort"
"strconv"
"sync"
2022-01-10 05:36:45 -08:00
"github.com/dennwc/varint"
2020-03-19 09:33:44 -07:00
"github.com/pkg/errors"
2022-01-10 05:36:45 -08:00
"github.com/prometheus/client_golang/prometheus"
2020-10-22 02:00:08 -07:00
"go.uber.org/atomic"
2020-03-19 09:33:44 -07:00
"github.com/prometheus/prometheus/tsdb/chunkenc"
tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
"github.com/prometheus/prometheus/tsdb/fileutil"
)
// Head chunk file header fields constants.
const (
// MagicHeadChunks is 4 bytes at the beginning of a head chunk file.
MagicHeadChunks = 0x0130BC91
headChunksFormatV1 = 1
)
2021-10-22 01:06:44 -07:00
// ErrChunkDiskMapperClosed returned by any method indicates
// that the ChunkDiskMapper was closed.
var ErrChunkDiskMapperClosed = errors . New ( "ChunkDiskMapper closed" )
2020-03-19 09:33:44 -07:00
const (
// MintMaxtSize is the size of the mint/maxt for head chunk file and chunks.
MintMaxtSize = 8
// SeriesRefSize is the size of series reference on disk.
SeriesRefSize = 8
2021-11-17 02:21:27 -08:00
// HeadChunkFileHeaderSize is the total size of the header for a head chunk file.
2020-03-19 09:33:44 -07:00
HeadChunkFileHeaderSize = SegmentHeaderSize
// MaxHeadChunkFileSize is the max size of a head chunk file.
2020-05-29 07:38:41 -07:00
MaxHeadChunkFileSize = 128 * 1024 * 1024 // 128 MiB.
2020-03-19 09:33:44 -07:00
// CRCSize is the size of crc32 sum on disk.
CRCSize = 4
// MaxHeadChunkMetaSize is the max size of an mmapped chunks minus the chunks data.
// Max because the uvarint size can be smaller.
2022-01-19 06:50:35 -08:00
MaxHeadChunkMetaSize = SeriesRefSize + 2 * MintMaxtSize + ChunkEncodingSize + MaxChunkLengthFieldSize + CRCSize
2020-11-19 05:00:47 -08:00
// MinWriteBufferSize is the minimum write buffer size allowed.
MinWriteBufferSize = 64 * 1024 // 64KB.
// MaxWriteBufferSize is the maximum write buffer size allowed.
MaxWriteBufferSize = 8 * 1024 * 1024 // 8 MiB.
// DefaultWriteBufferSize is the default write buffer size.
DefaultWriteBufferSize = 4 * 1024 * 1024 // 4 MiB.
2022-01-10 05:36:45 -08:00
// DefaultWriteQueueSize is the default size of the in-memory queue used before flushing chunks to the disk.
DefaultWriteQueueSize = 1000
2020-03-19 09:33:44 -07:00
)
2021-10-13 05:14:32 -07:00
// ChunkDiskMapperRef represents the location of a head chunk on disk.
// The upper 4 bytes hold the index of the head chunk file and
// the lower 4 bytes hold the byte offset in the head chunk file where the chunk starts.
type ChunkDiskMapperRef uint64
func newChunkDiskMapperRef ( seq , offset uint64 ) ChunkDiskMapperRef {
return ChunkDiskMapperRef ( ( seq << 32 ) | offset )
}
2022-01-10 05:36:45 -08:00
func ( ref ChunkDiskMapperRef ) Unpack ( ) ( seq , offset int ) {
seq = int ( ref >> 32 )
offset = int ( ( ref << 32 ) >> 32 )
return seq , offset
2021-10-13 05:14:32 -07:00
}
2021-01-05 23:54:35 -08:00
// CorruptionErr is an error that's returned when corruption is encountered.
2020-05-06 08:30:00 -07:00
type CorruptionErr struct {
2020-03-19 09:33:44 -07:00
Dir string
FileIndex int
Err error
}
2020-05-06 08:30:00 -07:00
func ( e * CorruptionErr ) Error ( ) string {
2020-03-19 09:33:44 -07:00
return errors . Wrapf ( e . Err , "corruption in head chunk file %s" , segmentFile ( e . Dir , e . FileIndex ) ) . Error ( )
}
2022-01-10 05:36:45 -08:00
// chunkPos keeps track of the position in the head chunk files.
// chunkPos is not thread-safe, a lock must be used to protect it.
type chunkPos struct {
seq uint64 // Index of chunk file.
offset uint64 // Offset within chunk file.
cutFile bool // When true then the next chunk will be written to a new file.
}
// getNextChunkRef takes a chunk and returns the chunk reference which will refer to it once it has been written.
// getNextChunkRef also decides whether a new file should be cut before writing this chunk, and it returns the decision via the second return value.
// The order of calling getNextChunkRef must be the order in which chunks are written to the disk.
func ( f * chunkPos ) getNextChunkRef ( chk chunkenc . Chunk ) ( chkRef ChunkDiskMapperRef , cutFile bool ) {
chkLen := uint64 ( len ( chk . Bytes ( ) ) )
bytesToWrite := f . bytesToWriteForChunk ( chkLen )
2022-01-19 06:50:35 -08:00
if f . shouldCutNewFile ( bytesToWrite ) {
2022-01-10 05:36:45 -08:00
f . toNewFile ( )
f . cutFile = false
cutFile = true
}
chkOffset := f . offset
f . offset += bytesToWrite
return newChunkDiskMapperRef ( f . seq , chkOffset ) , cutFile
}
// toNewFile updates the seq/offset position to point to the beginning of a new chunk file.
func ( f * chunkPos ) toNewFile ( ) {
f . seq ++
f . offset = SegmentHeaderSize
}
// cutFileOnNextChunk triggers that the next chunk will be written in to a new file.
// Not thread safe, a lock must be held when calling this.
func ( f * chunkPos ) cutFileOnNextChunk ( ) {
f . cutFile = true
}
// initSeq sets the sequence number of the head chunk file.
// Should only be used for initialization, after that the sequence number will be managed by chunkPos.
func ( f * chunkPos ) initSeq ( seq uint64 ) {
f . seq = seq
}
// shouldCutNewFile returns whether a new file should be cut based on the file size.
2022-01-19 06:50:35 -08:00
// Not thread safe, a lock must be held when calling this.
func ( f * chunkPos ) shouldCutNewFile ( bytesToWrite uint64 ) bool {
2022-01-10 05:36:45 -08:00
if f . cutFile {
return true
}
return f . offset == 0 || // First head chunk file.
2022-01-19 06:50:35 -08:00
f . offset + bytesToWrite > MaxHeadChunkFileSize // Exceeds the max head chunk file size.
2022-01-10 05:36:45 -08:00
}
// bytesToWriteForChunk returns the number of bytes that will need to be written for the given chunk size,
// including all meta data before and after the chunk data.
// Head chunk format: https://github.com/prometheus/prometheus/blob/main/tsdb/docs/format/head_chunks.md#chunk
func ( f * chunkPos ) bytesToWriteForChunk ( chkLen uint64 ) uint64 {
// Headers.
bytes := uint64 ( SeriesRefSize ) + 2 * MintMaxtSize + ChunkEncodingSize
// Size of chunk length encoded as uvarint.
bytes += uint64 ( varint . UvarintSize ( chkLen ) )
// Chunk length.
bytes += chkLen
// crc32.
bytes += CRCSize
return bytes
}
2020-03-19 09:33:44 -07:00
// ChunkDiskMapper is for writing the Head block chunks to the disk
// and access chunks via mmapped file.
type ChunkDiskMapper struct {
/// Writer.
2020-11-19 05:00:47 -08:00
dir * os . File
writeBufferSize int
2020-03-19 09:33:44 -07:00
2022-01-10 05:36:45 -08:00
curFile * os . File // File being written to.
curFileSequence int // Index of current open file being appended to.
curFileOffset atomic . Uint64 // Bytes written in current open file.
curFileMaxt int64 // Used for the size retention.
// The values in evtlPos represent the file position which will eventually be
// reached once the content of the write queue has been fully processed.
evtlPosMtx sync . Mutex
evtlPos chunkPos
2020-03-19 09:33:44 -07:00
byteBuf [ MaxHeadChunkMetaSize ] byte // Buffer used to write the header of the chunk.
chkWriter * bufio . Writer // Writer for the current open file.
crc32 hash . Hash
2020-05-29 07:38:41 -07:00
writePathMtx sync . Mutex
2020-03-19 09:33:44 -07:00
/// Reader.
// The int key in the map is the file number on the disk.
mmappedChunkFiles map [ int ] * mmappedChunkFile // Contains the m-mapped files for each chunk file mapped with its index.
closers map [ int ] io . Closer // Closers for resources behind the byte slices.
readPathMtx sync . RWMutex // Mutex used to protect the above 2 maps.
pool chunkenc . Pool // This is used when fetching a chunk from the disk to allocate a chunk.
// Writer and Reader.
// We flush chunks to disk in batches. Hence, we store them in this buffer
// from which chunks are served till they are flushed and are ready for m-mapping.
chunkBuffer * chunkBuffer
2021-11-17 02:21:27 -08:00
// Whether the maxt field is set for all mmapped chunk files tracked within the mmappedChunkFiles map.
// This is done after iterating through all the chunks in those files using the IterateAllChunks method.
2020-03-19 09:33:44 -07:00
fileMaxtSet bool
2022-01-10 05:36:45 -08:00
writeQueue * chunkWriteQueue
2020-03-19 09:33:44 -07:00
closed bool
}
2021-11-17 02:21:27 -08:00
// mmappedChunkFile provides mmapp access to an entire head chunks file that holds many chunks.
2020-03-19 09:33:44 -07:00
type mmappedChunkFile struct {
byteSlice ByteSlice
2021-11-17 02:21:27 -08:00
maxt int64 // Max timestamp among all of this file's chunks.
2020-03-19 09:33:44 -07:00
}
2021-11-17 02:21:27 -08:00
// NewChunkDiskMapper returns a new ChunkDiskMapper against the given directory
2020-03-19 09:33:44 -07:00
// using the default head chunk file duration.
// NOTE: 'IterateAllChunks' method needs to be called at least once after creating ChunkDiskMapper
// to set the maxt of all the file.
2022-01-10 05:36:45 -08:00
func NewChunkDiskMapper ( reg prometheus . Registerer , dir string , pool chunkenc . Pool , writeBufferSize , writeQueueSize int ) ( * ChunkDiskMapper , error ) {
2020-11-19 05:00:47 -08:00
// Validate write buffer size.
if writeBufferSize < MinWriteBufferSize || writeBufferSize > MaxWriteBufferSize {
2021-11-10 13:46:35 -08:00
return nil , errors . Errorf ( "ChunkDiskMapper write buffer size should be between %d and %d (actual: %d)" , MinWriteBufferSize , MaxWriteBufferSize , writeBufferSize )
2020-11-19 05:00:47 -08:00
}
if writeBufferSize % 1024 != 0 {
return nil , errors . Errorf ( "ChunkDiskMapper write buffer size should be a multiple of 1024 (actual: %d)" , writeBufferSize )
}
2021-10-22 01:06:44 -07:00
if err := os . MkdirAll ( dir , 0 o777 ) ; err != nil {
2020-03-19 09:33:44 -07:00
return nil , err
}
dirFile , err := fileutil . OpenDir ( dir )
if err != nil {
return nil , err
}
m := & ChunkDiskMapper {
2020-11-19 05:00:47 -08:00
dir : dirFile ,
pool : pool ,
writeBufferSize : writeBufferSize ,
crc32 : newCRC32 ( ) ,
chunkBuffer : newChunkBuffer ( ) ,
2020-03-19 09:33:44 -07:00
}
2022-01-10 05:36:45 -08:00
m . writeQueue = newChunkWriteQueue ( reg , writeQueueSize , m . writeChunk )
2020-03-19 09:33:44 -07:00
if m . pool == nil {
m . pool = chunkenc . NewPool ( )
}
return m , m . openMMapFiles ( )
}
2021-11-17 02:21:27 -08:00
// openMMapFiles opens all files within dir for mmapping.
2020-03-19 09:33:44 -07:00
func ( cdm * ChunkDiskMapper ) openMMapFiles ( ) ( returnErr error ) {
cdm . mmappedChunkFiles = map [ int ] * mmappedChunkFile { }
cdm . closers = map [ int ] io . Closer { }
defer func ( ) {
if returnErr != nil {
2020-10-28 08:24:58 -07:00
returnErr = tsdb_errors . NewMulti ( returnErr , closeAllFromMap ( cdm . closers ) ) . Err ( )
2020-03-19 09:33:44 -07:00
cdm . mmappedChunkFiles = nil
cdm . closers = nil
}
} ( )
files , err := listChunkFiles ( cdm . dir . Name ( ) )
if err != nil {
return err
}
2020-10-21 05:57:13 -07:00
files , err = repairLastChunkFile ( files )
if err != nil {
return err
}
2020-03-19 09:33:44 -07:00
chkFileIndices := make ( [ ] int , 0 , len ( files ) )
for seq , fn := range files {
f , err := fileutil . OpenMmapFile ( fn )
if err != nil {
2020-05-28 07:54:19 -07:00
return errors . Wrapf ( err , "mmap files, file: %s" , fn )
2020-03-19 09:33:44 -07:00
}
cdm . closers [ seq ] = f
cdm . mmappedChunkFiles [ seq ] = & mmappedChunkFile { byteSlice : realByteSlice ( f . Bytes ( ) ) }
chkFileIndices = append ( chkFileIndices , seq )
}
// Check for gaps in the files.
sort . Ints ( chkFileIndices )
if len ( chkFileIndices ) == 0 {
return nil
}
lastSeq := chkFileIndices [ 0 ]
for _ , seq := range chkFileIndices [ 1 : ] {
if seq != lastSeq + 1 {
2020-06-17 07:40:00 -07:00
return errors . Errorf ( "found unsequential head chunk files %s (index: %d) and %s (index: %d)" , files [ lastSeq ] , lastSeq , files [ seq ] , seq )
2020-03-19 09:33:44 -07:00
}
lastSeq = seq
}
for i , b := range cdm . mmappedChunkFiles {
if b . byteSlice . Len ( ) < HeadChunkFileHeaderSize {
2020-06-17 07:40:00 -07:00
return errors . Wrapf ( errInvalidSize , "%s: invalid head chunk file header" , files [ i ] )
2020-03-19 09:33:44 -07:00
}
// Verify magic number.
if m := binary . BigEndian . Uint32 ( b . byteSlice . Range ( 0 , MagicChunksSize ) ) ; m != MagicHeadChunks {
2020-06-17 07:40:00 -07:00
return errors . Errorf ( "%s: invalid magic number %x" , files [ i ] , m )
2020-03-19 09:33:44 -07:00
}
// Verify chunk format version.
if v := int ( b . byteSlice . Range ( MagicChunksSize , MagicChunksSize + ChunksFormatVersionSize ) [ 0 ] ) ; v != chunksFormatV1 {
2020-06-17 07:40:00 -07:00
return errors . Errorf ( "%s: invalid chunk format version %d" , files [ i ] , v )
2020-03-19 09:33:44 -07:00
}
}
2022-01-10 05:36:45 -08:00
cdm . evtlPos . initSeq ( uint64 ( lastSeq ) )
2020-03-19 09:33:44 -07:00
return nil
}
func listChunkFiles ( dir string ) ( map [ int ] string , error ) {
files , err := ioutil . ReadDir ( dir )
if err != nil {
return nil , err
}
res := map [ int ] string { }
for _ , fi := range files {
seq , err := strconv . ParseUint ( fi . Name ( ) , 10 , 64 )
if err != nil {
continue
}
res [ int ( seq ) ] = filepath . Join ( dir , fi . Name ( ) )
}
2020-10-21 05:57:13 -07:00
2020-03-19 09:33:44 -07:00
return res , nil
}
2020-10-21 05:57:13 -07:00
// repairLastChunkFile deletes the last file if it's empty.
2021-11-17 02:21:27 -08:00
// Because we don't fsync when creating these files, we could end
2020-10-21 05:57:13 -07:00
// up with an empty file at the end during an abrupt shutdown.
func repairLastChunkFile ( files map [ int ] string ) ( _ map [ int ] string , returnErr error ) {
lastFile := - 1
for seq := range files {
if seq > lastFile {
lastFile = seq
}
}
if lastFile <= 0 {
return files , nil
}
info , err := os . Stat ( files [ lastFile ] )
if err != nil {
return files , errors . Wrap ( err , "file stat during last head chunk file repair" )
}
if info . Size ( ) == 0 {
// Corrupt file, hence remove it.
if err := os . RemoveAll ( files [ lastFile ] ) ; err != nil {
return files , errors . Wrap ( err , "delete corrupted, empty head chunk file during last file repair" )
}
delete ( files , lastFile )
}
return files , nil
}
2020-03-19 09:33:44 -07:00
// WriteChunk writes the chunk to the disk.
// The returned chunk ref is the reference from where the chunk encoding starts for the chunk.
2022-01-10 05:36:45 -08:00
func ( cdm * ChunkDiskMapper ) WriteChunk ( seriesRef HeadSeriesRef , mint , maxt int64 , chk chunkenc . Chunk , callback func ( err error ) ) ( chkRef ChunkDiskMapperRef ) {
var err error
defer func ( ) {
if err != nil && callback != nil {
callback ( err )
}
} ( )
// cdm.evtlPosMtx must be held to serialize the calls to .getNextChunkRef() and .addJob().
cdm . evtlPosMtx . Lock ( )
defer cdm . evtlPosMtx . Unlock ( )
ref , cutFile := cdm . evtlPos . getNextChunkRef ( chk )
err = cdm . writeQueue . addJob ( chunkWriteJob {
cutFile : cutFile ,
seriesRef : seriesRef ,
mint : mint ,
maxt : maxt ,
chk : chk ,
ref : ref ,
callback : callback ,
} )
return ref
}
func ( cdm * ChunkDiskMapper ) writeChunk ( seriesRef HeadSeriesRef , mint , maxt int64 , chk chunkenc . Chunk , ref ChunkDiskMapperRef , cutFile bool ) ( err error ) {
2020-03-19 09:33:44 -07:00
cdm . writePathMtx . Lock ( )
defer cdm . writePathMtx . Unlock ( )
if cdm . closed {
2022-01-10 05:36:45 -08:00
return ErrChunkDiskMapperClosed
2020-03-19 09:33:44 -07:00
}
2022-01-10 05:36:45 -08:00
if cutFile {
err := cdm . cutAndExpectRef ( ref )
if err != nil {
return err
2020-03-19 09:33:44 -07:00
}
}
// if len(chk.Bytes())+MaxHeadChunkMetaSize >= writeBufferSize, it means that chunk >= the buffer size;
// so no need to flush here, as we have to flush at the end (to not keep partial chunks in buffer).
2020-11-19 05:00:47 -08:00
if len ( chk . Bytes ( ) ) + MaxHeadChunkMetaSize < cdm . writeBufferSize && cdm . chkWriter . Available ( ) < MaxHeadChunkMetaSize + len ( chk . Bytes ( ) ) {
2020-03-19 09:33:44 -07:00
if err := cdm . flushBuffer ( ) ; err != nil {
2022-01-10 05:36:45 -08:00
return err
2020-03-19 09:33:44 -07:00
}
}
cdm . crc32 . Reset ( )
bytesWritten := 0
2021-11-06 03:10:04 -07:00
binary . BigEndian . PutUint64 ( cdm . byteBuf [ bytesWritten : ] , uint64 ( seriesRef ) )
2020-03-19 09:33:44 -07:00
bytesWritten += SeriesRefSize
binary . BigEndian . PutUint64 ( cdm . byteBuf [ bytesWritten : ] , uint64 ( mint ) )
bytesWritten += MintMaxtSize
binary . BigEndian . PutUint64 ( cdm . byteBuf [ bytesWritten : ] , uint64 ( maxt ) )
bytesWritten += MintMaxtSize
cdm . byteBuf [ bytesWritten ] = byte ( chk . Encoding ( ) )
bytesWritten += ChunkEncodingSize
n := binary . PutUvarint ( cdm . byteBuf [ bytesWritten : ] , uint64 ( len ( chk . Bytes ( ) ) ) )
bytesWritten += n
if err := cdm . writeAndAppendToCRC32 ( cdm . byteBuf [ : bytesWritten ] ) ; err != nil {
2022-01-10 05:36:45 -08:00
return err
2020-03-19 09:33:44 -07:00
}
if err := cdm . writeAndAppendToCRC32 ( chk . Bytes ( ) ) ; err != nil {
2022-01-10 05:36:45 -08:00
return err
2020-03-19 09:33:44 -07:00
}
if err := cdm . writeCRC32 ( ) ; err != nil {
2022-01-10 05:36:45 -08:00
return err
2020-03-19 09:33:44 -07:00
}
if maxt > cdm . curFileMaxt {
cdm . curFileMaxt = maxt
}
2022-01-10 05:36:45 -08:00
cdm . chunkBuffer . put ( ref , chk )
2020-03-19 09:33:44 -07:00
2020-11-19 05:00:47 -08:00
if len ( chk . Bytes ( ) ) + MaxHeadChunkMetaSize >= cdm . writeBufferSize {
2020-03-19 09:33:44 -07:00
// The chunk was bigger than the buffer itself.
// Flushing to not keep partial chunks in buffer.
if err := cdm . flushBuffer ( ) ; err != nil {
2022-01-10 05:36:45 -08:00
return err
2020-03-19 09:33:44 -07:00
}
}
2022-01-10 05:36:45 -08:00
return nil
2020-03-19 09:33:44 -07:00
}
2022-01-10 05:36:45 -08:00
// CutNewFile makes that a new file will be created the next time a chunk is written.
func ( cdm * ChunkDiskMapper ) CutNewFile ( ) {
cdm . evtlPosMtx . Lock ( )
defer cdm . evtlPosMtx . Unlock ( )
cdm . evtlPos . cutFileOnNextChunk ( )
2020-03-19 09:33:44 -07:00
}
2022-01-11 03:54:03 -08:00
func ( cdm * ChunkDiskMapper ) IsQueueEmpty ( ) bool {
return cdm . writeQueue . queueIsEmpty ( )
}
2022-01-10 05:36:45 -08:00
// cutAndExpectRef creates a new m-mapped file.
// The write lock should be held before calling this.
// It ensures that the position in the new file matches the given chunk reference, if not then it errors.
func ( cdm * ChunkDiskMapper ) cutAndExpectRef ( chkRef ChunkDiskMapperRef ) ( err error ) {
seq , offset , err := cdm . cut ( )
if err != nil {
return err
}
if expSeq , expOffset := chkRef . Unpack ( ) ; seq != expSeq || offset != expOffset {
return errors . Errorf ( "expected newly cut file to have sequence:offset %d:%d, got %d:%d" , expSeq , expOffset , seq , offset )
}
2020-05-29 07:38:41 -07:00
2022-01-10 05:36:45 -08:00
return nil
2020-05-29 07:38:41 -07:00
}
// cut creates a new m-mapped file. The write lock should be held before calling this.
2022-01-10 05:36:45 -08:00
// It returns the file sequence and the offset in that file to start writing chunks.
func ( cdm * ChunkDiskMapper ) cut ( ) ( seq , offset int , returnErr error ) {
2020-03-19 09:33:44 -07:00
// Sync current tail to disk and close.
if err := cdm . finalizeCurFile ( ) ; err != nil {
2022-01-10 05:36:45 -08:00
return 0 , 0 , err
2020-03-19 09:33:44 -07:00
}
2022-01-10 05:36:45 -08:00
offset , newFile , seq , err := cutSegmentFile ( cdm . dir , MagicHeadChunks , headChunksFormatV1 , HeadChunkFilePreallocationSize )
2020-03-19 09:33:44 -07:00
if err != nil {
2022-01-10 05:36:45 -08:00
return 0 , 0 , err
2020-03-19 09:33:44 -07:00
}
2022-01-10 05:36:45 -08:00
2020-03-19 09:33:44 -07:00
defer func ( ) {
// The file should not be closed if there is no error,
// its kept open in the ChunkDiskMapper.
if returnErr != nil {
2020-10-28 08:24:58 -07:00
returnErr = tsdb_errors . NewMulti ( returnErr , newFile . Close ( ) ) . Err ( )
2020-03-19 09:33:44 -07:00
}
} ( )
2022-01-10 05:36:45 -08:00
cdm . curFileOffset . Store ( uint64 ( offset ) )
2020-03-19 09:33:44 -07:00
if cdm . curFile != nil {
cdm . readPathMtx . Lock ( )
cdm . mmappedChunkFiles [ cdm . curFileSequence ] . maxt = cdm . curFileMaxt
cdm . readPathMtx . Unlock ( )
}
2021-07-29 05:56:57 -07:00
mmapFile , err := fileutil . OpenMmapFileWithSize ( newFile . Name ( ) , MaxHeadChunkFileSize )
2020-03-19 09:33:44 -07:00
if err != nil {
2022-01-10 05:36:45 -08:00
return 0 , 0 , err
2020-03-19 09:33:44 -07:00
}
2020-08-26 01:52:48 -07:00
cdm . readPathMtx . Lock ( )
2020-03-19 09:33:44 -07:00
cdm . curFileSequence = seq
cdm . curFile = newFile
if cdm . chkWriter != nil {
cdm . chkWriter . Reset ( newFile )
} else {
2020-11-19 05:00:47 -08:00
cdm . chkWriter = bufio . NewWriterSize ( newFile , cdm . writeBufferSize )
2020-03-19 09:33:44 -07:00
}
cdm . closers [ cdm . curFileSequence ] = mmapFile
cdm . mmappedChunkFiles [ cdm . curFileSequence ] = & mmappedChunkFile { byteSlice : realByteSlice ( mmapFile . Bytes ( ) ) }
cdm . readPathMtx . Unlock ( )
cdm . curFileMaxt = 0
2022-01-10 05:36:45 -08:00
return seq , offset , nil
2020-03-19 09:33:44 -07:00
}
// finalizeCurFile writes all pending data to the current tail file,
// truncates its size, and closes it.
func ( cdm * ChunkDiskMapper ) finalizeCurFile ( ) error {
if cdm . curFile == nil {
return nil
}
if err := cdm . flushBuffer ( ) ; err != nil {
return err
}
if err := cdm . curFile . Sync ( ) ; err != nil {
return err
}
return cdm . curFile . Close ( )
}
func ( cdm * ChunkDiskMapper ) write ( b [ ] byte ) error {
n , err := cdm . chkWriter . Write ( b )
2022-01-10 05:36:45 -08:00
cdm . curFileOffset . Add ( uint64 ( n ) )
2020-03-19 09:33:44 -07:00
return err
}
func ( cdm * ChunkDiskMapper ) writeAndAppendToCRC32 ( b [ ] byte ) error {
if err := cdm . write ( b ) ; err != nil {
return err
}
_ , err := cdm . crc32 . Write ( b )
return err
}
func ( cdm * ChunkDiskMapper ) writeCRC32 ( ) error {
return cdm . write ( cdm . crc32 . Sum ( cdm . byteBuf [ : 0 ] ) )
}
// flushBuffer flushes the current in-memory chunks.
// Assumes that writePathMtx is _write_ locked before calling this method.
func ( cdm * ChunkDiskMapper ) flushBuffer ( ) error {
if err := cdm . chkWriter . Flush ( ) ; err != nil {
return err
}
cdm . chunkBuffer . clear ( )
return nil
}
// Chunk returns a chunk from a given reference.
2021-10-13 05:14:32 -07:00
func ( cdm * ChunkDiskMapper ) Chunk ( ref ChunkDiskMapperRef ) ( chunkenc . Chunk , error ) {
2020-03-19 09:33:44 -07:00
cdm . readPathMtx . RLock ( )
2021-11-17 02:21:27 -08:00
// We hold this read lock for the entire duration because if Close()
2020-03-19 09:33:44 -07:00
// is called, the data in the byte slice will get corrupted as the mmapped
// file will be closed.
defer cdm . readPathMtx . RUnlock ( )
if cdm . closed {
return nil , ErrChunkDiskMapperClosed
}
2022-01-10 05:36:45 -08:00
chunk := cdm . writeQueue . get ( ref )
if chunk != nil {
return chunk , nil
}
2021-10-13 05:14:32 -07:00
sgmIndex , chkStart := ref . Unpack ( )
// We skip the series ref and the mint/maxt beforehand.
chkStart += SeriesRefSize + ( 2 * MintMaxtSize )
chkCRC32 := newCRC32 ( )
2020-03-19 09:33:44 -07:00
// If it is the current open file, then the chunks can be in the buffer too.
if sgmIndex == cdm . curFileSequence {
chunk := cdm . chunkBuffer . get ( ref )
if chunk != nil {
return chunk , nil
}
}
mmapFile , ok := cdm . mmappedChunkFiles [ sgmIndex ]
if ! ok {
if sgmIndex > cdm . curFileSequence {
2020-05-22 02:03:23 -07:00
return nil , & CorruptionErr {
Dir : cdm . dir . Name ( ) ,
FileIndex : - 1 ,
Err : errors . Errorf ( "head chunk file index %d more than current open file" , sgmIndex ) ,
}
}
return nil , & CorruptionErr {
Dir : cdm . dir . Name ( ) ,
FileIndex : sgmIndex ,
Err : errors . New ( "head chunk file index %d does not exist on disk" ) ,
2020-03-19 09:33:44 -07:00
}
}
if chkStart + MaxChunkLengthFieldSize > mmapFile . byteSlice . Len ( ) {
2020-05-22 02:03:23 -07:00
return nil , & CorruptionErr {
Dir : cdm . dir . Name ( ) ,
FileIndex : sgmIndex ,
Err : errors . Errorf ( "head chunk file doesn't include enough bytes to read the chunk size data field - required:%v, available:%v" , chkStart + MaxChunkLengthFieldSize , mmapFile . byteSlice . Len ( ) ) ,
}
2020-03-19 09:33:44 -07:00
}
// Encoding.
chkEnc := mmapFile . byteSlice . Range ( chkStart , chkStart + ChunkEncodingSize ) [ 0 ]
// Data length.
// With the minimum chunk length this should never cause us reading
// over the end of the slice.
chkDataLenStart := chkStart + ChunkEncodingSize
c := mmapFile . byteSlice . Range ( chkDataLenStart , chkDataLenStart + MaxChunkLengthFieldSize )
chkDataLen , n := binary . Uvarint ( c )
if n <= 0 {
2020-05-22 02:03:23 -07:00
return nil , & CorruptionErr {
Dir : cdm . dir . Name ( ) ,
FileIndex : sgmIndex ,
Err : errors . Errorf ( "reading chunk length failed with %d" , n ) ,
}
2020-03-19 09:33:44 -07:00
}
// Verify the chunk data end.
chkDataEnd := chkDataLenStart + n + int ( chkDataLen )
if chkDataEnd > mmapFile . byteSlice . Len ( ) {
2020-05-22 02:03:23 -07:00
return nil , & CorruptionErr {
Dir : cdm . dir . Name ( ) ,
FileIndex : sgmIndex ,
Err : errors . Errorf ( "head chunk file doesn't include enough bytes to read the chunk - required:%v, available:%v" , chkDataEnd , mmapFile . byteSlice . Len ( ) ) ,
}
2020-03-19 09:33:44 -07:00
}
// Check the CRC.
sum := mmapFile . byteSlice . Range ( chkDataEnd , chkDataEnd + CRCSize )
if _ , err := chkCRC32 . Write ( mmapFile . byteSlice . Range ( chkStart - ( SeriesRefSize + 2 * MintMaxtSize ) , chkDataEnd ) ) ; err != nil {
2020-05-22 02:03:23 -07:00
return nil , & CorruptionErr {
Dir : cdm . dir . Name ( ) ,
FileIndex : sgmIndex ,
Err : err ,
}
2020-03-19 09:33:44 -07:00
}
if act := chkCRC32 . Sum ( nil ) ; ! bytes . Equal ( act , sum ) {
2020-05-22 02:03:23 -07:00
return nil , & CorruptionErr {
Dir : cdm . dir . Name ( ) ,
FileIndex : sgmIndex ,
Err : errors . Errorf ( "checksum mismatch expected:%x, actual:%x" , sum , act ) ,
}
2020-03-19 09:33:44 -07:00
}
// The chunk data itself.
chkData := mmapFile . byteSlice . Range ( chkDataEnd - int ( chkDataLen ) , chkDataEnd )
2021-05-06 13:18:59 -07:00
// Make a copy of the chunk data to prevent a panic occurring because the returned
// chunk data slice references an mmap-ed file which could be closed after the
// function returns but while the chunk is still in use.
chkDataCopy := make ( [ ] byte , len ( chkData ) )
copy ( chkDataCopy , chkData )
chk , err := cdm . pool . Get ( chunkenc . Encoding ( chkEnc ) , chkDataCopy )
2020-05-22 02:03:23 -07:00
if err != nil {
return nil , & CorruptionErr {
Dir : cdm . dir . Name ( ) ,
FileIndex : sgmIndex ,
Err : err ,
}
}
return chk , nil
2020-03-19 09:33:44 -07:00
}
2021-11-17 02:21:27 -08:00
// IterateAllChunks iterates all mmappedChunkFiles (in order of head chunk file name/number) and all the chunks within it
// and runs the provided function with information about each chunk. It returns on the first error encountered.
2020-03-19 09:33:44 -07:00
// NOTE: This method needs to be called at least once after creating ChunkDiskMapper
// to set the maxt of all the file.
2021-11-06 03:10:04 -07:00
func ( cdm * ChunkDiskMapper ) IterateAllChunks ( f func ( seriesRef HeadSeriesRef , chunkRef ChunkDiskMapperRef , mint , maxt int64 , numSamples uint16 ) error ) ( err error ) {
2020-03-19 09:33:44 -07:00
cdm . writePathMtx . Lock ( )
defer cdm . writePathMtx . Unlock ( )
defer func ( ) {
2020-08-26 10:59:18 -07:00
cdm . fileMaxtSet = true
2020-03-19 09:33:44 -07:00
} ( )
chkCRC32 := newCRC32 ( )
// Iterate files in ascending order.
segIDs := make ( [ ] int , 0 , len ( cdm . mmappedChunkFiles ) )
for seg := range cdm . mmappedChunkFiles {
segIDs = append ( segIDs , seg )
}
sort . Ints ( segIDs )
for _ , segID := range segIDs {
mmapFile := cdm . mmappedChunkFiles [ segID ]
fileEnd := mmapFile . byteSlice . Len ( )
if segID == cdm . curFileSequence {
2020-07-23 05:35:19 -07:00
fileEnd = int ( cdm . curFileSize ( ) )
2020-03-19 09:33:44 -07:00
}
idx := HeadChunkFileHeaderSize
for idx < fileEnd {
if fileEnd - idx < MaxHeadChunkMetaSize {
// Check for all 0s which marks the end of the file.
allZeros := true
for _ , b := range mmapFile . byteSlice . Range ( idx , fileEnd ) {
if b != byte ( 0 ) {
allZeros = false
break
}
}
if allZeros {
2021-02-25 01:08:12 -08:00
// End of segment chunk file content.
2020-03-19 09:33:44 -07:00
break
}
2020-05-06 08:30:00 -07:00
return & CorruptionErr {
2020-03-19 09:33:44 -07:00
Dir : cdm . dir . Name ( ) ,
FileIndex : segID ,
2021-02-25 01:08:12 -08:00
Err : errors . Errorf ( "head chunk file has some unread data, but doesn't include enough bytes to read the chunk header" +
" - required:%v, available:%v, file:%d" , idx + MaxHeadChunkMetaSize , fileEnd , segID ) ,
2020-03-19 09:33:44 -07:00
}
}
chkCRC32 . Reset ( )
2021-10-13 05:14:32 -07:00
chunkRef := newChunkDiskMapperRef ( uint64 ( segID ) , uint64 ( idx ) )
2020-03-19 09:33:44 -07:00
startIdx := idx
2021-11-06 03:10:04 -07:00
seriesRef := HeadSeriesRef ( binary . BigEndian . Uint64 ( mmapFile . byteSlice . Range ( idx , idx + SeriesRefSize ) ) )
2020-03-19 09:33:44 -07:00
idx += SeriesRefSize
mint := int64 ( binary . BigEndian . Uint64 ( mmapFile . byteSlice . Range ( idx , idx + MintMaxtSize ) ) )
idx += MintMaxtSize
maxt := int64 ( binary . BigEndian . Uint64 ( mmapFile . byteSlice . Range ( idx , idx + MintMaxtSize ) ) )
idx += MintMaxtSize
// We preallocate file to help with m-mapping (especially windows systems).
// As series ref always starts from 1, we assume it being 0 to be the end of the actual file data.
// We are not considering possible file corruption that can cause it to be 0.
// Additionally we are checking mint and maxt just to be sure.
if seriesRef == 0 && mint == 0 && maxt == 0 {
break
}
idx += ChunkEncodingSize // Skip encoding.
dataLen , n := binary . Uvarint ( mmapFile . byteSlice . Range ( idx , idx + MaxChunkLengthFieldSize ) )
2020-05-06 08:30:00 -07:00
idx += n
numSamples := binary . BigEndian . Uint16 ( mmapFile . byteSlice . Range ( idx , idx + 2 ) )
idx += int ( dataLen ) // Skip the data.
2020-03-19 09:33:44 -07:00
// In the beginning we only checked for the chunk meta size.
// Now that we have added the chunk data length, we check for sufficient bytes again.
if idx + CRCSize > fileEnd {
2020-05-06 08:30:00 -07:00
return & CorruptionErr {
2020-03-19 09:33:44 -07:00
Dir : cdm . dir . Name ( ) ,
FileIndex : segID ,
Err : errors . Errorf ( "head chunk file doesn't include enough bytes to read the chunk header - required:%v, available:%v, file:%d" , idx + CRCSize , fileEnd , segID ) ,
}
}
// Check CRC.
sum := mmapFile . byteSlice . Range ( idx , idx + CRCSize )
if _ , err := chkCRC32 . Write ( mmapFile . byteSlice . Range ( startIdx , idx ) ) ; err != nil {
return err
}
if act := chkCRC32 . Sum ( nil ) ; ! bytes . Equal ( act , sum ) {
2020-05-06 08:30:00 -07:00
return & CorruptionErr {
2020-03-19 09:33:44 -07:00
Dir : cdm . dir . Name ( ) ,
FileIndex : segID ,
Err : errors . Errorf ( "checksum mismatch expected:%x, actual:%x" , sum , act ) ,
}
}
idx += CRCSize
if maxt > mmapFile . maxt {
mmapFile . maxt = maxt
}
2020-05-06 08:30:00 -07:00
if err := f ( seriesRef , chunkRef , mint , maxt , numSamples ) ; err != nil {
2020-08-26 08:06:27 -07:00
if cerr , ok := err . ( * CorruptionErr ) ; ok {
cerr . Dir = cdm . dir . Name ( )
cerr . FileIndex = segID
return cerr
}
2020-03-19 09:33:44 -07:00
return err
}
}
if idx > fileEnd {
// It should be equal to the slice length.
2020-05-06 08:30:00 -07:00
return & CorruptionErr {
2020-03-19 09:33:44 -07:00
Dir : cdm . dir . Name ( ) ,
FileIndex : segID ,
Err : errors . Errorf ( "head chunk file doesn't include enough bytes to read the last chunk data - required:%v, available:%v, file:%d" , idx , fileEnd , segID ) ,
}
}
}
return nil
}
// Truncate deletes the head chunk files which are strictly below the mint.
// mint should be in milliseconds.
func ( cdm * ChunkDiskMapper ) Truncate ( mint int64 ) error {
if ! cdm . fileMaxtSet {
return errors . New ( "maxt of the files are not set" )
}
cdm . readPathMtx . RLock ( )
// Sort the file indices, else if files deletion fails in between,
// it can lead to unsequential files as the map is not sorted.
chkFileIndices := make ( [ ] int , 0 , len ( cdm . mmappedChunkFiles ) )
for seq := range cdm . mmappedChunkFiles {
chkFileIndices = append ( chkFileIndices , seq )
}
sort . Ints ( chkFileIndices )
var removedFiles [ ] int
for _ , seq := range chkFileIndices {
2020-06-18 06:54:58 -07:00
if seq == cdm . curFileSequence || cdm . mmappedChunkFiles [ seq ] . maxt >= mint {
break
2020-03-19 09:33:44 -07:00
}
if cdm . mmappedChunkFiles [ seq ] . maxt < mint {
removedFiles = append ( removedFiles , seq )
}
}
cdm . readPathMtx . RUnlock ( )
2020-10-28 08:24:58 -07:00
errs := tsdb_errors . NewMulti ( )
2020-06-18 06:54:58 -07:00
// Cut a new file only if the current file has some chunks.
2020-07-23 05:35:19 -07:00
if cdm . curFileSize ( ) > HeadChunkFileHeaderSize {
2022-01-10 05:36:45 -08:00
// There is a known race condition here because between the check of curFileSize() and the call to CutNewFile()
// a new file could already be cut, this is acceptable because it will simply result in an empty file which
// won't do any harm.
cdm . CutNewFile ( )
2020-06-18 06:54:58 -07:00
}
2020-10-28 08:24:58 -07:00
errs . Add ( cdm . deleteFiles ( removedFiles ) )
return errs . Err ( )
2020-03-19 09:33:44 -07:00
}
func ( cdm * ChunkDiskMapper ) deleteFiles ( removedFiles [ ] int ) error {
cdm . readPathMtx . Lock ( )
for _ , seq := range removedFiles {
if err := cdm . closers [ seq ] . Close ( ) ; err != nil {
cdm . readPathMtx . Unlock ( )
return err
}
delete ( cdm . mmappedChunkFiles , seq )
delete ( cdm . closers , seq )
}
cdm . readPathMtx . Unlock ( )
// We actually delete the files separately to not block the readPathMtx for long.
for _ , seq := range removedFiles {
if err := os . Remove ( segmentFile ( cdm . dir . Name ( ) , seq ) ) ; err != nil {
return err
}
}
return nil
}
2020-05-06 08:30:00 -07:00
// DeleteCorrupted deletes all the head chunk files after the one which had the corruption
2020-03-19 09:33:44 -07:00
// (including the corrupt file).
2020-05-06 08:30:00 -07:00
func ( cdm * ChunkDiskMapper ) DeleteCorrupted ( originalErr error ) error {
2020-03-19 09:33:44 -07:00
err := errors . Cause ( originalErr ) // So that we can pick up errors even if wrapped.
2020-05-06 08:30:00 -07:00
cerr , ok := err . ( * CorruptionErr )
2020-03-19 09:33:44 -07:00
if ! ok {
return errors . Wrap ( originalErr , "cannot handle error" )
}
// Delete all the head chunk files following the corrupt head chunk file.
segs := [ ] int { }
2020-08-26 01:52:48 -07:00
cdm . readPathMtx . RLock ( )
2020-03-19 09:33:44 -07:00
for seg := range cdm . mmappedChunkFiles {
if seg >= cerr . FileIndex {
segs = append ( segs , seg )
}
}
2020-08-26 01:52:48 -07:00
cdm . readPathMtx . RUnlock ( )
2020-03-19 09:33:44 -07:00
return cdm . deleteFiles ( segs )
}
// Size returns the size of the chunk files.
2020-11-03 02:04:59 -08:00
func ( cdm * ChunkDiskMapper ) Size ( ) ( int64 , error ) {
return fileutil . DirSize ( cdm . dir . Name ( ) )
2020-07-23 05:35:19 -07:00
}
2022-01-10 05:36:45 -08:00
func ( cdm * ChunkDiskMapper ) curFileSize ( ) uint64 {
return cdm . curFileOffset . Load ( )
2020-03-19 09:33:44 -07:00
}
// Close closes all the open files in ChunkDiskMapper.
// It is not longer safe to access chunks from this struct after calling Close.
func ( cdm * ChunkDiskMapper ) Close ( ) error {
2022-01-10 05:36:45 -08:00
// Locking the eventual position lock blocks WriteChunk()
cdm . evtlPosMtx . Lock ( )
defer cdm . evtlPosMtx . Unlock ( )
cdm . writeQueue . stop ( )
2020-03-19 09:33:44 -07:00
// 'WriteChunk' locks writePathMtx first and then readPathMtx for cutting head chunk file.
// The lock order should not be reversed here else it can cause deadlocks.
cdm . writePathMtx . Lock ( )
defer cdm . writePathMtx . Unlock ( )
cdm . readPathMtx . Lock ( )
defer cdm . readPathMtx . Unlock ( )
if cdm . closed {
return nil
}
cdm . closed = true
2020-10-28 08:24:58 -07:00
errs := tsdb_errors . NewMulti (
closeAllFromMap ( cdm . closers ) ,
cdm . finalizeCurFile ( ) ,
cdm . dir . Close ( ) ,
)
2020-03-19 09:33:44 -07:00
cdm . mmappedChunkFiles = map [ int ] * mmappedChunkFile { }
cdm . closers = map [ int ] io . Closer { }
2020-10-28 08:24:58 -07:00
return errs . Err ( )
2020-03-19 09:33:44 -07:00
}
func closeAllFromMap ( cs map [ int ] io . Closer ) error {
2020-10-28 08:24:58 -07:00
errs := tsdb_errors . NewMulti ( )
2020-03-19 09:33:44 -07:00
for _ , c := range cs {
2020-10-28 08:24:58 -07:00
errs . Add ( c . Close ( ) )
2020-03-19 09:33:44 -07:00
}
2020-10-28 08:24:58 -07:00
return errs . Err ( )
2020-03-19 09:33:44 -07:00
}
const inBufferShards = 128 // 128 is a randomly chosen number.
2021-11-17 02:21:27 -08:00
// chunkBuffer is a thread safe lookup table for chunks by their ref.
2020-03-19 09:33:44 -07:00
type chunkBuffer struct {
2021-10-13 05:14:32 -07:00
inBufferChunks [ inBufferShards ] map [ ChunkDiskMapperRef ] chunkenc . Chunk
2020-03-19 09:33:44 -07:00
inBufferChunksMtxs [ inBufferShards ] sync . RWMutex
}
func newChunkBuffer ( ) * chunkBuffer {
cb := & chunkBuffer { }
for i := 0 ; i < inBufferShards ; i ++ {
2021-10-13 05:14:32 -07:00
cb . inBufferChunks [ i ] = make ( map [ ChunkDiskMapperRef ] chunkenc . Chunk )
2020-03-19 09:33:44 -07:00
}
return cb
}
2021-10-13 05:14:32 -07:00
func ( cb * chunkBuffer ) put ( ref ChunkDiskMapperRef , chk chunkenc . Chunk ) {
2020-03-19 09:33:44 -07:00
shardIdx := ref % inBufferShards
cb . inBufferChunksMtxs [ shardIdx ] . Lock ( )
cb . inBufferChunks [ shardIdx ] [ ref ] = chk
cb . inBufferChunksMtxs [ shardIdx ] . Unlock ( )
}
2021-10-13 05:14:32 -07:00
func ( cb * chunkBuffer ) get ( ref ChunkDiskMapperRef ) chunkenc . Chunk {
2020-03-19 09:33:44 -07:00
shardIdx := ref % inBufferShards
cb . inBufferChunksMtxs [ shardIdx ] . RLock ( )
defer cb . inBufferChunksMtxs [ shardIdx ] . RUnlock ( )
return cb . inBufferChunks [ shardIdx ] [ ref ]
}
func ( cb * chunkBuffer ) clear ( ) {
for i := 0 ; i < inBufferShards ; i ++ {
cb . inBufferChunksMtxs [ i ] . Lock ( )
2021-10-13 05:14:32 -07:00
cb . inBufferChunks [ i ] = make ( map [ ChunkDiskMapperRef ] chunkenc . Chunk )
2020-03-19 09:33:44 -07:00
cb . inBufferChunksMtxs [ i ] . Unlock ( )
}
}