2015-01-21 11:07:45 -08:00
// Copyright 2014 The Prometheus Authors
2014-09-19 09:18:44 -07:00
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
2014-09-16 06:47:24 -07:00
package local
2014-06-06 02:55:53 -07:00
import (
"bufio"
"encoding/binary"
"fmt"
"io"
2015-03-03 09:59:39 -08:00
"io/ioutil"
2014-06-06 02:55:53 -07:00
"os"
"path"
2015-01-14 07:52:09 -08:00
"path/filepath"
2015-03-02 11:02:37 -08:00
"strconv"
"strings"
2014-10-07 10:11:24 -07:00
"sync"
2015-03-08 18:33:10 -07:00
"sync/atomic"
2014-09-23 10:21:10 -07:00
"time"
2014-06-06 02:55:53 -07:00
2014-09-24 07:51:18 -07:00
"github.com/prometheus/client_golang/prometheus"
2015-10-03 01:21:43 -07:00
"github.com/prometheus/common/log"
2015-08-20 08:18:46 -07:00
"github.com/prometheus/common/model"
2014-06-06 02:55:53 -07:00
2014-09-23 10:21:10 -07:00
"github.com/prometheus/prometheus/storage/local/codable"
2014-08-21 13:06:11 -07:00
"github.com/prometheus/prometheus/storage/local/index"
2015-05-29 04:30:30 -07:00
"github.com/prometheus/prometheus/util/flock"
2014-06-06 02:55:53 -07:00
)
const (
2015-03-03 09:59:39 -08:00
// Version of the storage as it can be found in the version file.
// Increment to protect against incompatible changes.
2015-03-02 11:02:37 -08:00
Version = 1
versionFileName = "VERSION"
2014-10-15 08:07:12 -07:00
seriesFileSuffix = ".db"
seriesTempFileSuffix = ".db.tmp"
2014-11-20 12:03:51 -08:00
seriesDirNameLen = 2 // How many bytes of the fingerprint in dir name.
2014-09-10 09:41:52 -07:00
2015-03-08 18:33:10 -07:00
headsFileName = "heads.db"
headsTempFileName = "heads.db.tmp"
headsFormatVersion = 2
headsFormatLegacyVersion = 1 // Can read, but will never write.
headsMagicString = "PrometheusHeads"
2014-09-10 09:41:52 -07:00
2015-05-06 07:53:12 -07:00
mappingsFileName = "mappings.db"
mappingsTempFileName = "mappings.db.tmp"
mappingsFormatVersion = 1
mappingsMagicString = "PrometheusMappings"
2014-11-20 12:03:51 -08:00
dirtyFileName = "DIRTY"
2014-11-04 07:27:14 -08:00
fileBufSize = 1 << 16 // 64kiB.
2014-08-12 08:46:46 -07:00
2014-06-06 02:55:53 -07:00
chunkHeaderLen = 17
chunkHeaderTypeOffset = 0
chunkHeaderFirstTimeOffset = 1
chunkHeaderLastTimeOffset = 9
2015-04-13 11:20:26 -07:00
chunkLenWithHeader = chunkLen + chunkHeaderLen
chunkMaxBatchSize = 64 // How many chunks to load at most in one batch.
2014-09-23 10:21:10 -07:00
2014-10-06 06:58:12 -07:00
indexingMaxBatchSize = 1024 * 1024
indexingBatchTimeout = 500 * time . Millisecond // Commit batch when idle for that long.
2014-11-05 11:02:45 -08:00
indexingQueueCapacity = 1024 * 16
2014-06-06 02:55:53 -07:00
)
2015-08-20 08:18:46 -07:00
var fpLen = len ( model . Fingerprint ( 0 ) . String ( ) ) // Length of a fingerprint as string.
2014-11-20 12:03:51 -08:00
2014-09-16 06:47:24 -07:00
const (
2014-10-27 12:40:48 -07:00
flagHeadChunkPersisted byte = 1 << iota
// Add more flags here like:
// flagFoo
// flagBar
2014-09-16 06:47:24 -07:00
)
2014-09-23 10:21:10 -07:00
type indexingOpType byte
const (
add indexingOpType = iota
remove
)
type indexingOp struct {
2015-08-20 08:18:46 -07:00
fingerprint model . Fingerprint
metric model . Metric
2014-09-23 10:21:10 -07:00
opType indexingOpType
}
2014-10-07 10:11:24 -07:00
// A Persistence is used by a Storage implementation to store samples
// persistently across restarts. The methods are only goroutine-safe if
Improve persisting chunks to disk.
This is done by bucketing chunks by fingerprint. If the persisting to
disk falls behind, more and more chunks are in the queue. As soon as
there are "double hits", we will now persist both chunks in one go,
doubling the disk throughput (assuming it is limited by disk
seeks). Should even more pile up so that we end wit "triple hits", we
will persist those first, and so on.
Even if we have millions of time series, this will still help,
assuming not all of them are growing with the same speed. Series that
get many samples and/or are not very compressable will accumulate
chunks faster, and they will soon get double- or triple-writes.
To improve the chance of double writes,
-storage.local.persistence-queue-capacity could be set to a higher
value. However, that will slow down shutdown a lot (as the queue has
to be worked through). So we leave it to the user to set it to a
really high value. A more fundamental solution would be to checkpoint
not only head chunks, but also chunks still in the persist queue. That
would be quite complicated for a rather limited use-case (running many
time series with high ingestion rate on slow spinning disks).
2015-02-13 11:08:52 -08:00
// explicitly marked as such below. The chunk-related methods persistChunks,
Fix a bug handling freshly unarchived series.
Usually, if you unarchive a series, it is to add something to it,
which will create a new head chunk. However, if a series in
unarchived, and before anything is added to it, it is handled by the
maintenance loop, it will be archived again. In that case, we have to
load the chunkDescs to know the lastTime of the series to be
archived. Usually, this case will happen only rarely (as a race, has
never happened so far, possibly because the locking around unarchiving
and the subsequent sample append is smart enough). However, during
crash recovery, we sometimes treat series as "freshly unarchived"
without directly appending a sample. We might add more cases of that
type later, so better deal with archiving properly and load chunkDescs
if required.
2015-01-08 07:10:31 -08:00
// dropChunks, loadChunks, and loadChunkDescs can be called concurrently with
2014-10-07 10:11:24 -07:00
// each other if each call refers to a different fingerprint.
type persistence struct {
2015-03-13 07:49:07 -07:00
basePath string
2014-09-10 09:41:52 -07:00
archivedFingerprintToMetrics * index . FingerprintMetricIndex
archivedFingerprintToTimeRange * index . FingerprintTimeRangeIndex
labelPairToFingerprints * index . LabelPairFingerprintIndex
labelNameToLabelValues * index . LabelNameLabelValuesIndex
2014-09-23 10:21:10 -07:00
indexingQueue chan indexingOp
indexingStopped chan struct { }
2014-09-24 05:41:38 -07:00
indexingFlush chan chan int
2014-09-24 07:51:18 -07:00
indexingQueueLength prometheus . Gauge
indexingQueueCapacity prometheus . Metric
indexingBatchSizes prometheus . Summary
2015-03-19 09:06:16 -07:00
indexingBatchDuration prometheus . Summary
2014-10-24 11:27:27 -07:00
checkpointDuration prometheus . Gauge
2015-05-21 08:50:06 -07:00
dirtyCounter prometheus . Counter
2014-11-05 11:02:45 -08:00
2015-03-19 04:03:09 -07:00
dirtyMtx sync . Mutex // Protects dirty and becameDirty.
dirty bool // true if persistence was started in dirty state.
becameDirty bool // true if an inconsistency came up during runtime.
pedanticChecks bool // true if crash recovery should check each series.
dirtyFileName string // The file used for locking and to mark dirty state.
fLock flock . Releaser // The file lock to protect against concurrent usage.
2015-03-19 07:41:50 -07:00
shouldSync syncStrategy
2015-04-14 01:43:09 -07:00
2016-01-11 07:42:10 -08:00
minShrinkRatio float64 // How much a series file has to shrink to justify dropping chunks.
2015-04-14 01:43:09 -07:00
bufPool sync . Pool
2014-06-06 02:55:53 -07:00
}
2014-10-07 10:11:24 -07:00
// newPersistence returns a newly allocated persistence backed by local disk storage, ready to use.
2016-01-11 07:42:10 -08:00
func newPersistence (
basePath string ,
dirty , pedanticChecks bool ,
shouldSync syncStrategy ,
minShrinkRatio float64 ,
) ( * persistence , error ) {
2015-03-02 11:02:37 -08:00
dirtyPath := filepath . Join ( basePath , dirtyFileName )
versionPath := filepath . Join ( basePath , versionFileName )
2015-03-03 09:59:39 -08:00
if versionData , err := ioutil . ReadFile ( versionPath ) ; err == nil {
if persistedVersion , err := strconv . Atoi ( strings . TrimSpace ( string ( versionData ) ) ) ; err != nil {
return nil , fmt . Errorf ( "cannot parse content of %s: %s" , versionPath , versionData )
2015-03-02 11:02:37 -08:00
} else if persistedVersion != Version {
return nil , fmt . Errorf ( "found storage version %d on disk, need version %d - please wipe storage or run a version of Prometheus compatible with storage version %d" , persistedVersion , Version , persistedVersion )
}
} else if os . IsNotExist ( err ) {
// No version file found. Let's create the directory (in case
// it's not there yet) and then check if it is actually
// empty. If not, we have found an old storage directory without
// version file, so we have to bail out.
if err := os . MkdirAll ( basePath , 0700 ) ; err != nil {
return nil , err
}
2015-03-03 09:59:39 -08:00
fis , err := ioutil . ReadDir ( basePath )
2015-03-02 11:02:37 -08:00
if err != nil {
return nil , err
}
2016-01-11 09:05:36 -08:00
if len ( fis ) > 0 && ! ( len ( fis ) == 1 && fis [ 0 ] . Name ( ) == "lost+found" && fis [ 0 ] . IsDir ( ) ) {
2015-03-02 11:02:37 -08:00
return nil , fmt . Errorf ( "could not detect storage version on disk, assuming version 0, need version %d - please wipe storage or run a version of Prometheus compatible with storage version 0" , Version )
}
// Finally we can write our own version into a new version file.
file , err := os . Create ( versionPath )
if err != nil {
return nil , err
}
defer file . Close ( )
if _ , err := fmt . Fprintf ( file , "%d\n" , Version ) ; err != nil {
return nil , err
}
} else {
2014-09-10 09:41:52 -07:00
return nil , err
}
2015-01-14 07:52:09 -08:00
fLock , dirtyfileExisted , err := flock . New ( dirtyPath )
if err != nil {
2015-05-20 09:10:29 -07:00
log . Errorf ( "Could not lock %s, Prometheus already running?" , dirtyPath )
2015-01-14 07:52:09 -08:00
return nil , err
}
if dirtyfileExisted {
dirty = true
}
2014-09-23 10:21:10 -07:00
archivedFingerprintToMetrics , err := index . NewFingerprintMetricIndex ( basePath )
2014-08-21 13:06:11 -07:00
if err != nil {
return nil , err
}
2014-09-23 10:21:10 -07:00
archivedFingerprintToTimeRange , err := index . NewFingerprintTimeRangeIndex ( basePath )
2014-09-10 09:41:52 -07:00
if err != nil {
return nil , err
2014-06-06 02:55:53 -07:00
}
2014-10-07 10:11:24 -07:00
p := & persistence {
2015-03-13 07:49:07 -07:00
basePath : basePath ,
2014-09-23 10:21:10 -07:00
archivedFingerprintToMetrics : archivedFingerprintToMetrics ,
archivedFingerprintToTimeRange : archivedFingerprintToTimeRange ,
indexingQueue : make ( chan indexingOp , indexingQueueCapacity ) ,
indexingStopped : make ( chan struct { } ) ,
2014-09-24 05:41:38 -07:00
indexingFlush : make ( chan chan int ) ,
2014-09-24 07:51:18 -07:00
indexingQueueLength : prometheus . NewGauge ( prometheus . GaugeOpts {
Namespace : namespace ,
Subsystem : subsystem ,
Name : "indexing_queue_length" ,
Help : "The number of metrics waiting to be indexed." ,
} ) ,
indexingQueueCapacity : prometheus . MustNewConstMetric (
prometheus . NewDesc (
prometheus . BuildFQName ( namespace , subsystem , "indexing_queue_capacity" ) ,
"The capacity of the indexing queue." ,
nil , nil ,
) ,
prometheus . GaugeValue ,
float64 ( indexingQueueCapacity ) ,
) ,
indexingBatchSizes : prometheus . NewSummary (
prometheus . SummaryOpts {
Namespace : namespace ,
Subsystem : subsystem ,
Name : "indexing_batch_sizes" ,
Help : "Quantiles for indexing batch sizes (number of metrics per batch)." ,
} ,
) ,
2015-03-19 09:06:16 -07:00
indexingBatchDuration : prometheus . NewSummary (
2014-09-24 07:51:18 -07:00
prometheus . SummaryOpts {
Namespace : namespace ,
Subsystem : subsystem ,
2015-03-19 09:06:16 -07:00
Name : "indexing_batch_duration_milliseconds" ,
Help : "Quantiles for batch indexing duration in milliseconds." ,
2014-09-24 07:51:18 -07:00
} ,
) ,
2014-10-24 11:27:27 -07:00
checkpointDuration : prometheus . NewGauge ( prometheus . GaugeOpts {
Namespace : namespace ,
Subsystem : subsystem ,
Name : "checkpoint_duration_milliseconds" ,
Help : "The duration (in milliseconds) it took to checkpoint in-memory metrics and head chunks." ,
} ) ,
2015-05-21 08:50:06 -07:00
dirtyCounter : prometheus . NewCounter ( prometheus . CounterOpts {
Namespace : namespace ,
Subsystem : subsystem ,
Name : "inconsistencies_total" ,
Help : "A counter incremented each time an inconsistency in the local storage is detected. If this is greater zero, restart the server as soon as possible." ,
} ) ,
2015-03-19 04:03:09 -07:00
dirty : dirty ,
pedanticChecks : pedanticChecks ,
dirtyFileName : dirtyPath ,
fLock : fLock ,
2015-03-19 07:41:50 -07:00
shouldSync : shouldSync ,
2015-04-14 01:43:09 -07:00
// Create buffers of length 3*chunkLenWithHeader by default because that is still reasonably small
// and at the same time enough for many uses. The contract is to never return buffer smaller than
// that to the pool so that callers can rely on a minimum buffer size.
bufPool : sync . Pool { New : func ( ) interface { } { return make ( [ ] byte , 0 , 3 * chunkLenWithHeader ) } } ,
2014-11-05 11:02:45 -08:00
}
if p . dirty {
// Blow away the label indexes. We'll rebuild them later.
if err := index . DeleteLabelPairFingerprintIndex ( basePath ) ; err != nil {
return nil , err
}
if err := index . DeleteLabelNameLabelValuesIndex ( basePath ) ; err != nil {
return nil , err
}
}
labelPairToFingerprints , err := index . NewLabelPairFingerprintIndex ( basePath )
if err != nil {
return nil , err
}
labelNameToLabelValues , err := index . NewLabelNameLabelValuesIndex ( basePath )
if err != nil {
return nil , err
2014-09-23 10:21:10 -07:00
}
2014-11-05 11:02:45 -08:00
p . labelPairToFingerprints = labelPairToFingerprints
p . labelNameToLabelValues = labelNameToLabelValues
2014-09-23 10:21:10 -07:00
return p , nil
2014-06-06 02:55:53 -07:00
}
2015-05-18 10:26:28 -07:00
func ( p * persistence ) run ( ) {
p . processIndexingQueue ( )
}
2014-09-24 07:51:18 -07:00
// Describe implements prometheus.Collector.
2014-10-07 10:11:24 -07:00
func ( p * persistence ) Describe ( ch chan <- * prometheus . Desc ) {
2014-09-24 07:51:18 -07:00
ch <- p . indexingQueueLength . Desc ( )
ch <- p . indexingQueueCapacity . Desc ( )
p . indexingBatchSizes . Describe ( ch )
2015-03-19 09:06:16 -07:00
p . indexingBatchDuration . Describe ( ch )
2014-10-24 11:27:27 -07:00
ch <- p . checkpointDuration . Desc ( )
2015-05-21 08:50:06 -07:00
ch <- p . dirtyCounter . Desc ( )
2014-09-24 07:51:18 -07:00
}
// Collect implements prometheus.Collector.
2014-10-07 10:11:24 -07:00
func ( p * persistence ) Collect ( ch chan <- prometheus . Metric ) {
2014-09-24 07:51:18 -07:00
p . indexingQueueLength . Set ( float64 ( len ( p . indexingQueue ) ) )
ch <- p . indexingQueueLength
ch <- p . indexingQueueCapacity
p . indexingBatchSizes . Collect ( ch )
2015-03-19 09:06:16 -07:00
p . indexingBatchDuration . Collect ( ch )
2014-10-24 11:27:27 -07:00
ch <- p . checkpointDuration
2015-05-21 08:50:06 -07:00
ch <- p . dirtyCounter
2014-09-24 07:51:18 -07:00
}
2014-11-05 11:02:45 -08:00
// isDirty returns the dirty flag in a goroutine-safe way.
func ( p * persistence ) isDirty ( ) bool {
p . dirtyMtx . Lock ( )
defer p . dirtyMtx . Unlock ( )
return p . dirty
}
// setDirty sets the dirty flag in a goroutine-safe way. Once the dirty flag was
// set to true with this method, it cannot be set to false again. (If we became
// dirty during our runtime, there is no way back. If we were dirty from the
// start, a clean-up might make us clean again.)
func ( p * persistence ) setDirty ( dirty bool ) {
2015-06-22 09:12:55 -07:00
if dirty {
p . dirtyCounter . Inc ( )
}
2014-11-05 11:02:45 -08:00
p . dirtyMtx . Lock ( )
defer p . dirtyMtx . Unlock ( )
if p . becameDirty {
return
}
p . dirty = dirty
if dirty {
p . becameDirty = true
2015-05-20 09:10:29 -07:00
log . Error ( "The storage is now inconsistent. Restart Prometheus ASAP to initiate recovery." )
2014-11-05 11:02:45 -08:00
}
}
2015-05-20 10:13:06 -07:00
// fingerprintsForLabelPair returns the fingerprints for the given label
2014-10-07 10:11:24 -07:00
// pair. This method is goroutine-safe but take into account that metrics queued
2014-11-20 12:03:51 -08:00
// for indexing with IndexMetric might not have made it into the index
// yet. (Same applies correspondingly to UnindexMetric.)
2015-08-22 04:32:13 -07:00
func ( p * persistence ) fingerprintsForLabelPair ( lp model . LabelPair ) ( model . Fingerprints , error ) {
2014-09-10 09:41:52 -07:00
fps , _ , err := p . labelPairToFingerprints . Lookup ( lp )
2014-06-06 02:55:53 -07:00
if err != nil {
return nil , err
}
2014-09-10 09:41:52 -07:00
return fps , nil
2014-06-06 02:55:53 -07:00
}
2015-05-20 10:13:06 -07:00
// labelValuesForLabelName returns the label values for the given label
2014-10-07 10:11:24 -07:00
// name. This method is goroutine-safe but take into account that metrics queued
2014-11-20 12:03:51 -08:00
// for indexing with IndexMetric might not have made it into the index
// yet. (Same applies correspondingly to UnindexMetric.)
2015-08-20 08:18:46 -07:00
func ( p * persistence ) labelValuesForLabelName ( ln model . LabelName ) ( model . LabelValues , error ) {
2014-09-10 09:41:52 -07:00
lvs , _ , err := p . labelNameToLabelValues . Lookup ( ln )
if err != nil {
return nil , err
}
return lvs , nil
2014-06-06 02:55:53 -07:00
}
Improve persisting chunks to disk.
This is done by bucketing chunks by fingerprint. If the persisting to
disk falls behind, more and more chunks are in the queue. As soon as
there are "double hits", we will now persist both chunks in one go,
doubling the disk throughput (assuming it is limited by disk
seeks). Should even more pile up so that we end wit "triple hits", we
will persist those first, and so on.
Even if we have millions of time series, this will still help,
assuming not all of them are growing with the same speed. Series that
get many samples and/or are not very compressable will accumulate
chunks faster, and they will soon get double- or triple-writes.
To improve the chance of double writes,
-storage.local.persistence-queue-capacity could be set to a higher
value. However, that will slow down shutdown a lot (as the queue has
to be worked through). So we leave it to the user to set it to a
really high value. A more fundamental solution would be to checkpoint
not only head chunks, but also chunks still in the persist queue. That
would be quite complicated for a rather limited use-case (running many
time series with high ingestion rate on slow spinning disks).
2015-02-13 11:08:52 -08:00
// persistChunks persists a number of consecutive chunks of a series. It is the
// caller's responsibility to not modify the chunks concurrently and to not
// persist or drop anything for the same fingerprint concurrently. It returns
// the (zero-based) index of the first persisted chunk within the series
// file. In case of an error, the returned index is -1 (to avoid the
// misconception that the chunk was written at position 0).
2015-08-20 08:18:46 -07:00
func ( p * persistence ) persistChunks ( fp model . Fingerprint , chunks [ ] chunk ) ( index int , err error ) {
2015-03-08 18:33:10 -07:00
defer func ( ) {
if err != nil {
2015-05-20 09:10:29 -07:00
log . Error ( "Error persisting chunks: " , err )
2015-03-08 18:33:10 -07:00
p . setDirty ( true )
}
} ( )
Improve persisting chunks to disk.
This is done by bucketing chunks by fingerprint. If the persisting to
disk falls behind, more and more chunks are in the queue. As soon as
there are "double hits", we will now persist both chunks in one go,
doubling the disk throughput (assuming it is limited by disk
seeks). Should even more pile up so that we end wit "triple hits", we
will persist those first, and so on.
Even if we have millions of time series, this will still help,
assuming not all of them are growing with the same speed. Series that
get many samples and/or are not very compressable will accumulate
chunks faster, and they will soon get double- or triple-writes.
To improve the chance of double writes,
-storage.local.persistence-queue-capacity could be set to a higher
value. However, that will slow down shutdown a lot (as the queue has
to be worked through). So we leave it to the user to set it to a
really high value. A more fundamental solution would be to checkpoint
not only head chunks, but also chunks still in the persist queue. That
would be quite complicated for a rather limited use-case (running many
time series with high ingestion rate on slow spinning disks).
2015-02-13 11:08:52 -08:00
2014-06-06 02:55:53 -07:00
f , err := p . openChunkFileForWriting ( fp )
if err != nil {
2014-10-27 12:40:48 -07:00
return - 1 , err
2014-06-06 02:55:53 -07:00
}
2015-03-19 07:41:50 -07:00
defer p . closeChunkFile ( f )
2014-06-06 02:55:53 -07:00
2015-03-08 18:33:10 -07:00
if err := writeChunks ( f , chunks ) ; err != nil {
return - 1 , err
2014-10-27 12:40:48 -07:00
}
Improve persisting chunks to disk.
This is done by bucketing chunks by fingerprint. If the persisting to
disk falls behind, more and more chunks are in the queue. As soon as
there are "double hits", we will now persist both chunks in one go,
doubling the disk throughput (assuming it is limited by disk
seeks). Should even more pile up so that we end wit "triple hits", we
will persist those first, and so on.
Even if we have millions of time series, this will still help,
assuming not all of them are growing with the same speed. Series that
get many samples and/or are not very compressable will accumulate
chunks faster, and they will soon get double- or triple-writes.
To improve the chance of double writes,
-storage.local.persistence-queue-capacity could be set to a higher
value. However, that will slow down shutdown a lot (as the queue has
to be worked through). So we leave it to the user to set it to a
really high value. A more fundamental solution would be to checkpoint
not only head chunks, but also chunks still in the persist queue. That
would be quite complicated for a rather limited use-case (running many
time series with high ingestion rate on slow spinning disks).
2015-02-13 11:08:52 -08:00
// Determine index within the file.
2014-10-27 12:40:48 -07:00
offset , err := f . Seek ( 0 , os . SEEK_CUR )
if err != nil {
return - 1 , err
}
2015-03-08 18:33:10 -07:00
index , err = chunkIndexForOffset ( offset )
2014-10-27 12:40:48 -07:00
if err != nil {
return - 1 , err
}
Improve persisting chunks to disk.
This is done by bucketing chunks by fingerprint. If the persisting to
disk falls behind, more and more chunks are in the queue. As soon as
there are "double hits", we will now persist both chunks in one go,
doubling the disk throughput (assuming it is limited by disk
seeks). Should even more pile up so that we end wit "triple hits", we
will persist those first, and so on.
Even if we have millions of time series, this will still help,
assuming not all of them are growing with the same speed. Series that
get many samples and/or are not very compressable will accumulate
chunks faster, and they will soon get double- or triple-writes.
To improve the chance of double writes,
-storage.local.persistence-queue-capacity could be set to a higher
value. However, that will slow down shutdown a lot (as the queue has
to be worked through). So we leave it to the user to set it to a
really high value. A more fundamental solution would be to checkpoint
not only head chunks, but also chunks still in the persist queue. That
would be quite complicated for a rather limited use-case (running many
time series with high ingestion rate on slow spinning disks).
2015-02-13 11:08:52 -08:00
return index - len ( chunks ) , err
2014-06-06 02:55:53 -07:00
}
2014-10-07 10:11:24 -07:00
// loadChunks loads a group of chunks of a timeseries by their index. The chunk
// with the earliest time will have index 0, the following ones will have
2014-10-27 12:40:48 -07:00
// incrementally larger indexes. The indexOffset denotes the offset to be added to
// each index in indexes. It is the caller's responsibility to not persist or
// drop anything for the same fingerprint concurrently.
2015-08-20 08:18:46 -07:00
func ( p * persistence ) loadChunks ( fp model . Fingerprint , indexes [ ] int , indexOffset int ) ( [ ] chunk , error ) {
2014-06-06 02:55:53 -07:00
f , err := p . openChunkFileForReading ( fp )
if err != nil {
return nil , err
}
defer f . Close ( )
2014-10-15 06:53:05 -07:00
chunks := make ( [ ] chunk , 0 , len ( indexes ) )
2015-04-14 01:43:09 -07:00
buf := p . bufPool . Get ( ) . ( [ ] byte )
2015-04-13 11:20:26 -07:00
defer func ( ) {
2015-04-14 01:49:43 -07:00
// buf may change below, so wrap returning to the pool in a function.
// A simple 'defer p.bufPool.Put(buf)' would only return the original buf.
2015-04-14 01:43:09 -07:00
p . bufPool . Put ( buf )
2015-04-13 11:20:26 -07:00
} ( )
for i := 0 ; i < len ( indexes ) ; i ++ {
// This loads chunks in batches. A batch is a streak of
// consecutive chunks, read from disk in one go.
batchSize := 1
if _ , err := f . Seek ( offsetForChunkIndex ( indexes [ i ] + indexOffset ) , os . SEEK_SET ) ; err != nil {
2014-06-06 02:55:53 -07:00
return nil , err
}
2015-04-13 11:20:26 -07:00
for ; batchSize < chunkMaxBatchSize &&
i + 1 < len ( indexes ) &&
indexes [ i ] + 1 == indexes [ i + 1 ] ; i , batchSize = i + 1 , batchSize + 1 {
2014-06-06 02:55:53 -07:00
}
2015-04-13 11:20:26 -07:00
readSize := batchSize * chunkLenWithHeader
if cap ( buf ) < readSize {
buf = make ( [ ] byte , readSize )
2014-06-06 02:55:53 -07:00
}
2015-04-13 11:20:26 -07:00
buf = buf [ : readSize ]
2014-06-06 02:55:53 -07:00
2015-04-13 11:20:26 -07:00
if _ , err := io . ReadFull ( f , buf ) ; err != nil {
2014-06-06 02:55:53 -07:00
return nil , err
}
2015-04-13 11:20:26 -07:00
for c := 0 ; c < batchSize ; c ++ {
chunk := newChunkForEncoding ( chunkEncoding ( buf [ c * chunkLenWithHeader + chunkHeaderTypeOffset ] ) )
chunk . unmarshalFromBuf ( buf [ c * chunkLenWithHeader + chunkHeaderLen : ] )
chunks = append ( chunks , chunk )
}
2014-06-06 02:55:53 -07:00
}
2015-03-08 18:33:10 -07:00
chunkOps . WithLabelValues ( load ) . Add ( float64 ( len ( chunks ) ) )
atomic . AddInt64 ( & numMemChunks , int64 ( len ( chunks ) ) )
2014-06-06 02:55:53 -07:00
return chunks , nil
}
2015-07-06 16:10:14 -07:00
// loadChunkDescs loads the chunkDescs for a series from disk. offsetFromEnd is
// the number of chunkDescs to skip from the end of the series file. It is the
// caller's responsibility to not persist or drop anything for the same
2014-10-07 10:11:24 -07:00
// fingerprint concurrently.
2015-08-20 08:18:46 -07:00
func ( p * persistence ) loadChunkDescs ( fp model . Fingerprint , offsetFromEnd int ) ( [ ] * chunkDesc , error ) {
2014-06-06 02:55:53 -07:00
f , err := p . openChunkFileForReading ( fp )
if os . IsNotExist ( err ) {
return nil , nil
}
if err != nil {
return nil , err
}
defer f . Close ( )
fi , err := f . Stat ( )
if err != nil {
return nil , err
}
2015-04-13 11:20:26 -07:00
if fi . Size ( ) % int64 ( chunkLenWithHeader ) != 0 {
2014-11-27 11:46:45 -08:00
p . setDirty ( true )
return nil , fmt . Errorf (
"size of series file for fingerprint %v is %d, which is not a multiple of the chunk length %d" ,
2015-04-13 11:20:26 -07:00
fp , fi . Size ( ) , chunkLenWithHeader ,
2014-11-27 11:46:45 -08:00
)
2014-06-06 02:55:53 -07:00
}
2015-07-06 16:10:14 -07:00
numChunks := int ( fi . Size ( ) ) / chunkLenWithHeader - offsetFromEnd
cds := make ( [ ] * chunkDesc , numChunks )
2015-04-13 11:20:26 -07:00
chunkTimesBuf := make ( [ ] byte , 16 )
2014-06-06 02:55:53 -07:00
for i := 0 ; i < numChunks ; i ++ {
2015-03-08 18:33:10 -07:00
_ , err := f . Seek ( offsetForChunkIndex ( i ) + chunkHeaderFirstTimeOffset , os . SEEK_SET )
2014-06-06 02:55:53 -07:00
if err != nil {
return nil , err
}
_ , err = io . ReadAtLeast ( f , chunkTimesBuf , 16 )
if err != nil {
return nil , err
}
2015-07-06 16:10:14 -07:00
cds [ i ] = & chunkDesc {
2015-08-20 08:18:46 -07:00
chunkFirstTime : model . Time ( binary . LittleEndian . Uint64 ( chunkTimesBuf ) ) ,
chunkLastTime : model . Time ( binary . LittleEndian . Uint64 ( chunkTimesBuf [ 8 : ] ) ) ,
2014-06-06 02:55:53 -07:00
}
}
2014-10-22 10:21:23 -07:00
chunkDescOps . WithLabelValues ( load ) . Add ( float64 ( len ( cds ) ) )
2014-11-27 09:25:03 -08:00
numMemChunkDescs . Add ( float64 ( len ( cds ) ) )
2014-06-06 02:55:53 -07:00
return cds , nil
}
2014-10-24 11:27:27 -07:00
// checkpointSeriesMapAndHeads persists the fingerprint to memory-series mapping
2015-03-08 18:33:10 -07:00
// and all non persisted chunks. Do not call concurrently with
// loadSeriesMapAndHeads. This method will only write heads format v2, but
// loadSeriesMapAndHeads can also understand v1.
2014-11-27 11:46:45 -08:00
//
2015-03-08 18:33:10 -07:00
// Description of the file format (for both, v1 and v2):
2014-11-27 11:46:45 -08:00
//
// (1) Magic string (const headsMagicString).
//
// (2) Varint-encoded format version (const headsFormatVersion).
//
// (3) Number of series in checkpoint as big-endian uint64.
//
// (4) Repeated once per series:
//
2015-03-08 18:33:10 -07:00
// (4.1) A flag byte, see flag constants above. (Present but unused in v2.)
2014-11-27 11:46:45 -08:00
//
// (4.2) The fingerprint as big-endian uint64.
//
// (4.3) The metric as defined by codable.Metric.
//
2015-03-08 18:33:10 -07:00
// (4.4) The varint-encoded persistWatermark. (Missing in v1.)
2014-11-27 11:46:45 -08:00
//
2015-03-19 09:54:59 -07:00
// (4.5) The modification time of the series file as nanoseconds elapsed since
2015-03-19 04:59:26 -07:00
// January 1, 1970 UTC. -1 if the modification time is unknown or no series file
// exists yet. (Missing in v1.)
//
// (4.6) The varint-encoded chunkDescsOffset.
2014-11-27 11:46:45 -08:00
//
2015-03-08 18:33:10 -07:00
// (4.6) The varint-encoded savedFirstTime.
2014-11-27 11:46:45 -08:00
//
2015-03-08 18:33:10 -07:00
// (4.7) The varint-encoded number of chunk descriptors.
2014-11-27 11:46:45 -08:00
//
2015-03-08 18:33:10 -07:00
// (4.8) Repeated once per chunk descriptor, oldest to most recent, either
// variant 4.8.1 (if index < persistWatermark) or variant 4.8.2 (if index >=
// persistWatermark). In v1, everything is variant 4.8.1 except for a
// non-persisted head-chunk (determined by the flags).
2014-11-27 11:46:45 -08:00
//
2015-03-08 18:33:10 -07:00
// (4.8.1.1) The varint-encoded first time.
// (4.8.1.2) The varint-encoded last time.
2014-11-27 11:46:45 -08:00
//
2015-03-08 18:33:10 -07:00
// (4.8.2.1) A byte defining the chunk type.
// (4.8.2.2) The chunk itself, marshaled with the marshal() method.
2014-11-27 11:46:45 -08:00
//
2014-10-24 11:27:27 -07:00
func ( p * persistence ) checkpointSeriesMapAndHeads ( fingerprintToSeries * seriesMap , fpLocker * fingerprintLocker ) ( err error ) {
2015-05-20 09:10:29 -07:00
log . Info ( "Checkpointing in-memory metrics and chunks..." )
2014-10-24 11:27:27 -07:00
begin := time . Now ( )
f , err := os . OpenFile ( p . headsTempFileName ( ) , os . O_WRONLY | os . O_TRUNC | os . O_CREATE , 0640 )
2014-06-06 02:55:53 -07:00
if err != nil {
2015-09-07 09:08:23 -07:00
return err
2014-06-06 02:55:53 -07:00
}
2014-10-24 11:27:27 -07:00
defer func ( ) {
2015-09-07 09:08:23 -07:00
syncErr := f . Sync ( )
2014-10-24 11:27:27 -07:00
closeErr := f . Close ( )
if err != nil {
return
}
2015-09-07 09:08:23 -07:00
err = syncErr
if err != nil {
return
}
2014-10-24 11:27:27 -07:00
err = closeErr
if err != nil {
return
}
err = os . Rename ( p . headsTempFileName ( ) , p . headsFileName ( ) )
duration := time . Since ( begin )
p . checkpointDuration . Set ( float64 ( duration ) / float64 ( time . Millisecond ) )
2015-05-20 09:10:29 -07:00
log . Infof ( "Done checkpointing in-memory metrics and chunks in %v." , duration )
2014-10-24 11:27:27 -07:00
} ( )
2014-09-10 09:41:52 -07:00
w := bufio . NewWriterSize ( f , fileBufSize )
2014-06-06 02:55:53 -07:00
2014-10-24 11:27:27 -07:00
if _ , err = w . WriteString ( headsMagicString ) ; err != nil {
2015-09-07 09:08:23 -07:00
return err
2014-09-10 09:41:52 -07:00
}
2014-10-24 11:27:27 -07:00
var numberOfSeriesOffset int
if numberOfSeriesOffset , err = codable . EncodeVarint ( w , headsFormatVersion ) ; err != nil {
2015-09-07 09:08:23 -07:00
return err
2014-09-10 09:41:52 -07:00
}
2014-10-24 11:27:27 -07:00
numberOfSeriesOffset += len ( headsMagicString )
numberOfSeriesInHeader := uint64 ( fingerprintToSeries . length ( ) )
// We have to write the number of series as uint64 because we might need
// to overwrite it later, and a varint might change byte width then.
if err = codable . EncodeUint64 ( w , numberOfSeriesInHeader ) ; err != nil {
2015-09-07 09:08:23 -07:00
return err
2014-09-10 09:41:52 -07:00
}
2014-10-07 10:11:24 -07:00
iter := fingerprintToSeries . iter ( )
defer func ( ) {
// Consume the iterator in any case to not leak goroutines.
2014-12-26 04:37:30 -08:00
for range iter {
2014-10-07 10:11:24 -07:00
}
} ( )
2014-10-24 11:27:27 -07:00
var realNumberOfSeries uint64
2014-10-07 10:11:24 -07:00
for m := range iter {
2014-10-24 11:27:27 -07:00
func ( ) { // Wrapped in function to use defer for unlocking the fp.
fpLocker . Lock ( m . fp )
defer fpLocker . Unlock ( m . fp )
if len ( m . series . chunkDescs ) == 0 {
// This series was completely purged or archived in the meantime. Ignore.
return
}
realNumberOfSeries ++
2015-03-08 18:33:10 -07:00
// seriesFlags left empty in v2.
if err = w . WriteByte ( 0 ) ; err != nil {
2014-10-24 11:27:27 -07:00
return
}
if err = codable . EncodeUint64 ( w , uint64 ( m . fp ) ) ; err != nil {
return
}
var buf [ ] byte
buf , err = codable . Metric ( m . series . metric ) . MarshalBinary ( )
if err != nil {
return
}
2015-09-07 09:08:23 -07:00
if _ , err = w . Write ( buf ) ; err != nil {
return
}
2015-03-08 18:33:10 -07:00
if _ , err = codable . EncodeVarint ( w , int64 ( m . series . persistWatermark ) ) ; err != nil {
return
}
2015-03-19 04:59:26 -07:00
if m . series . modTime . IsZero ( ) {
if _ , err = codable . EncodeVarint ( w , - 1 ) ; err != nil {
return
}
} else {
if _ , err = codable . EncodeVarint ( w , m . series . modTime . UnixNano ( ) ) ; err != nil {
return
}
}
2014-10-27 12:40:48 -07:00
if _ , err = codable . EncodeVarint ( w , int64 ( m . series . chunkDescsOffset ) ) ; err != nil {
return
}
2014-11-07 16:01:34 -08:00
if _ , err = codable . EncodeVarint ( w , int64 ( m . series . savedFirstTime ) ) ; err != nil {
return
}
2014-10-24 11:27:27 -07:00
if _ , err = codable . EncodeVarint ( w , int64 ( len ( m . series . chunkDescs ) ) ) ; err != nil {
return
}
for i , chunkDesc := range m . series . chunkDescs {
2015-03-08 18:33:10 -07:00
if i < m . series . persistWatermark {
2014-10-24 11:27:27 -07:00
if _ , err = codable . EncodeVarint ( w , int64 ( chunkDesc . firstTime ( ) ) ) ; err != nil {
return
}
if _ , err = codable . EncodeVarint ( w , int64 ( chunkDesc . lastTime ( ) ) ) ; err != nil {
return
}
} else {
2015-07-13 12:12:27 -07:00
// This is a non-persisted chunk. Fully marshal it.
2015-05-20 10:13:06 -07:00
if err = w . WriteByte ( byte ( chunkDesc . c . encoding ( ) ) ) ; err != nil {
2014-10-24 11:27:27 -07:00
return
}
2015-05-20 10:13:06 -07:00
if err = chunkDesc . c . marshal ( w ) ; err != nil {
2014-10-24 11:27:27 -07:00
return
}
}
}
2015-09-07 09:08:23 -07:00
// Series is checkpointed now, so declare it clean. In case the entire
// checkpoint fails later on, this is fine, as the storage's series
// maintenance will mark these series newly dirty again, continuously
// increasing the total number of dirty series as seen by the storage.
// This has the effect of triggering a new checkpoint attempt even
// earlier than if we hadn't incorrectly set "dirty" to "false" here
// already.
2015-03-08 18:33:10 -07:00
m . series . dirty = false
2014-10-24 11:27:27 -07:00
} ( )
2014-06-06 02:55:53 -07:00
if err != nil {
2015-09-07 09:08:23 -07:00
return err
2014-06-06 02:55:53 -07:00
}
2014-10-24 11:27:27 -07:00
}
if err = w . Flush ( ) ; err != nil {
2015-09-07 09:08:23 -07:00
return err
2014-10-24 11:27:27 -07:00
}
if realNumberOfSeries != numberOfSeriesInHeader {
// The number of series has changed in the meantime.
// Rewrite it in the header.
if _ , err = f . Seek ( int64 ( numberOfSeriesOffset ) , os . SEEK_SET ) ; err != nil {
2015-09-07 09:08:23 -07:00
return err
2014-09-10 09:41:52 -07:00
}
2014-10-24 11:27:27 -07:00
if err = codable . EncodeUint64 ( f , realNumberOfSeries ) ; err != nil {
2015-09-07 09:08:23 -07:00
return err
2014-09-10 09:41:52 -07:00
}
2014-06-06 02:55:53 -07:00
}
2015-09-07 09:08:23 -07:00
return err
2014-06-06 02:55:53 -07:00
}
2014-10-07 10:11:24 -07:00
// loadSeriesMapAndHeads loads the fingerprint to memory-series mapping and all
2015-03-08 18:33:10 -07:00
// the chunks contained in the checkpoint (and thus not yet persisted to series
// files). The method is capable of loading the checkpoint format v1 and v2. If
// recoverable corruption is detected, or if the dirty flag was set from the
// beginning, crash recovery is run, which might take a while. If an
// unrecoverable error is encountered, it is returned. Call this method during
// start-up while nothing else is running in storage land. This method is
// utterly goroutine-unsafe.
2015-03-18 11:36:41 -07:00
func ( p * persistence ) loadSeriesMapAndHeads ( ) ( sm * seriesMap , chunksToPersist int64 , err error ) {
2015-02-26 15:06:16 -08:00
var chunkDescsTotal int64
2015-08-20 08:18:46 -07:00
fingerprintToSeries := make ( map [ model . Fingerprint ] * memorySeries )
2014-11-05 11:02:45 -08:00
sm = & seriesMap { m : fingerprintToSeries }
defer func ( ) {
if sm != nil && p . dirty {
2015-05-20 09:10:29 -07:00
log . Warn ( "Persistence layer appears dirty." )
2014-11-20 12:03:51 -08:00
err = p . recoverFromCrash ( fingerprintToSeries )
2014-11-05 11:02:45 -08:00
if err != nil {
sm = nil
}
}
if err == nil {
2014-11-27 09:25:03 -08:00
numMemChunkDescs . Add ( float64 ( chunkDescsTotal ) )
2014-11-05 11:02:45 -08:00
}
} ( )
2014-10-23 06:18:32 -07:00
2014-10-24 11:27:27 -07:00
f , err := os . Open ( p . headsFileName ( ) )
2014-09-10 09:41:52 -07:00
if os . IsNotExist ( err ) {
2015-03-08 18:33:10 -07:00
return sm , 0 , nil
2014-09-10 09:41:52 -07:00
}
if err != nil {
2015-05-20 09:10:29 -07:00
log . Warn ( "Could not open heads file:" , err )
2014-11-05 11:02:45 -08:00
p . dirty = true
return
2014-09-10 09:41:52 -07:00
}
defer f . Close ( )
r := bufio . NewReaderSize ( f , fileBufSize )
buf := make ( [ ] byte , len ( headsMagicString ) )
if _ , err := io . ReadFull ( r , buf ) ; err != nil {
2015-05-20 09:10:29 -07:00
log . Warn ( "Could not read from heads file:" , err )
2014-11-05 11:02:45 -08:00
p . dirty = true
2015-03-08 18:33:10 -07:00
return sm , 0 , nil
2014-09-10 09:41:52 -07:00
}
magic := string ( buf )
if magic != headsMagicString {
2015-05-20 09:10:29 -07:00
log . Warnf (
2014-09-10 09:41:52 -07:00
"unexpected magic string, want %q, got %q" ,
headsMagicString , magic ,
)
2014-11-05 11:02:45 -08:00
p . dirty = true
return
2014-09-10 09:41:52 -07:00
}
2015-03-08 18:33:10 -07:00
version , err := binary . ReadVarint ( r )
if ( version != headsFormatVersion && version != headsFormatLegacyVersion ) || err != nil {
2015-05-20 09:10:29 -07:00
log . Warnf ( "unknown heads format version, want %d" , headsFormatVersion )
2014-11-05 11:02:45 -08:00
p . dirty = true
2015-03-08 18:33:10 -07:00
return sm , 0 , nil
2014-09-10 09:41:52 -07:00
}
2014-10-24 11:27:27 -07:00
numSeries , err := codable . DecodeUint64 ( r )
2014-09-10 09:41:52 -07:00
if err != nil {
2015-05-20 09:10:29 -07:00
log . Warn ( "Could not decode number of series:" , err )
2014-11-05 11:02:45 -08:00
p . dirty = true
2015-03-08 18:33:10 -07:00
return sm , 0 , nil
2014-09-10 09:41:52 -07:00
}
for ; numSeries > 0 ; numSeries -- {
2014-09-16 06:47:24 -07:00
seriesFlags , err := r . ReadByte ( )
if err != nil {
2015-05-20 09:10:29 -07:00
log . Warn ( "Could not read series flags:" , err )
2014-11-05 11:02:45 -08:00
p . dirty = true
2015-03-18 11:36:41 -07:00
return sm , chunksToPersist , nil
2014-09-16 06:47:24 -07:00
}
headChunkPersisted := seriesFlags & flagHeadChunkPersisted != 0
2014-09-23 10:21:10 -07:00
fp , err := codable . DecodeUint64 ( r )
2014-09-10 09:41:52 -07:00
if err != nil {
2015-05-20 09:10:29 -07:00
log . Warn ( "Could not decode fingerprint:" , err )
2014-11-05 11:02:45 -08:00
p . dirty = true
2015-03-18 11:36:41 -07:00
return sm , chunksToPersist , nil
2014-09-10 09:41:52 -07:00
}
2014-09-23 10:21:10 -07:00
var metric codable . Metric
2014-09-10 09:41:52 -07:00
if err := metric . UnmarshalFromReader ( r ) ; err != nil {
2015-05-20 09:10:29 -07:00
log . Warn ( "Could not decode metric:" , err )
2014-11-05 11:02:45 -08:00
p . dirty = true
2015-03-18 11:36:41 -07:00
return sm , chunksToPersist , nil
2015-03-08 18:33:10 -07:00
}
var persistWatermark int64
2015-03-19 04:59:26 -07:00
var modTime time . Time
2015-03-08 18:33:10 -07:00
if version != headsFormatLegacyVersion {
// persistWatermark only present in v2.
persistWatermark , err = binary . ReadVarint ( r )
if err != nil {
2015-05-20 09:10:29 -07:00
log . Warn ( "Could not decode persist watermark:" , err )
2015-03-08 18:33:10 -07:00
p . dirty = true
2015-03-18 11:36:41 -07:00
return sm , chunksToPersist , nil
2015-03-08 18:33:10 -07:00
}
2015-03-19 04:59:26 -07:00
modTimeNano , err := binary . ReadVarint ( r )
if err != nil {
2015-05-20 09:10:29 -07:00
log . Warn ( "Could not decode modification time:" , err )
2015-03-19 04:59:26 -07:00
p . dirty = true
return sm , chunksToPersist , nil
}
if modTimeNano != - 1 {
modTime = time . Unix ( 0 , modTimeNano )
}
2014-09-10 09:41:52 -07:00
}
2014-10-27 12:40:48 -07:00
chunkDescsOffset , err := binary . ReadVarint ( r )
if err != nil {
2015-05-20 09:10:29 -07:00
log . Warn ( "Could not decode chunk descriptor offset:" , err )
2014-11-05 11:02:45 -08:00
p . dirty = true
2015-03-18 11:36:41 -07:00
return sm , chunksToPersist , nil
2014-10-27 12:40:48 -07:00
}
2014-11-07 16:01:34 -08:00
savedFirstTime , err := binary . ReadVarint ( r )
if err != nil {
2015-05-20 09:10:29 -07:00
log . Warn ( "Could not decode saved first time:" , err )
2014-11-07 16:01:34 -08:00
p . dirty = true
2015-03-18 11:36:41 -07:00
return sm , chunksToPersist , nil
2014-11-07 16:01:34 -08:00
}
2014-09-10 09:41:52 -07:00
numChunkDescs , err := binary . ReadVarint ( r )
if err != nil {
2015-05-20 09:10:29 -07:00
log . Warn ( "Could not decode number of chunk descriptors:" , err )
2014-11-05 11:02:45 -08:00
p . dirty = true
2015-03-18 11:36:41 -07:00
return sm , chunksToPersist , nil
2014-09-10 09:41:52 -07:00
}
2014-10-15 06:53:05 -07:00
chunkDescs := make ( [ ] * chunkDesc , numChunkDescs )
2015-03-08 18:33:10 -07:00
if version == headsFormatLegacyVersion {
if headChunkPersisted {
persistWatermark = numChunkDescs
} else {
persistWatermark = numChunkDescs - 1
}
}
2014-09-10 09:41:52 -07:00
2014-09-16 06:47:24 -07:00
for i := int64 ( 0 ) ; i < numChunkDescs ; i ++ {
2015-03-08 18:33:10 -07:00
if i < persistWatermark {
2014-09-16 06:47:24 -07:00
firstTime , err := binary . ReadVarint ( r )
if err != nil {
2015-05-20 09:10:29 -07:00
log . Warn ( "Could not decode first time:" , err )
2014-11-05 11:02:45 -08:00
p . dirty = true
2015-03-18 11:36:41 -07:00
return sm , chunksToPersist , nil
2014-09-16 06:47:24 -07:00
}
lastTime , err := binary . ReadVarint ( r )
if err != nil {
2015-05-20 09:10:29 -07:00
log . Warn ( "Could not decode last time:" , err )
2014-11-05 11:02:45 -08:00
p . dirty = true
2015-03-18 11:36:41 -07:00
return sm , chunksToPersist , nil
2014-09-16 06:47:24 -07:00
}
chunkDescs [ i ] = & chunkDesc {
2015-08-20 08:18:46 -07:00
chunkFirstTime : model . Time ( firstTime ) ,
chunkLastTime : model . Time ( lastTime ) ,
2014-09-16 06:47:24 -07:00
}
2015-03-08 18:33:10 -07:00
chunkDescsTotal ++
2014-09-16 06:47:24 -07:00
} else {
2015-03-08 18:33:10 -07:00
// Non-persisted chunk.
2015-03-13 07:49:07 -07:00
encoding , err := r . ReadByte ( )
2014-09-16 06:47:24 -07:00
if err != nil {
2015-05-20 09:10:29 -07:00
log . Warn ( "Could not decode chunk type:" , err )
2014-11-05 11:02:45 -08:00
p . dirty = true
2015-03-18 11:36:41 -07:00
return sm , chunksToPersist , nil
2014-09-16 06:47:24 -07:00
}
2015-03-13 07:49:07 -07:00
chunk := newChunkForEncoding ( chunkEncoding ( encoding ) )
2014-09-16 06:47:24 -07:00
if err := chunk . unmarshal ( r ) ; err != nil {
2015-05-20 09:10:29 -07:00
log . Warn ( "Could not decode chunk:" , err )
2014-11-05 11:02:45 -08:00
p . dirty = true
2015-03-18 11:36:41 -07:00
return sm , chunksToPersist , nil
2014-09-16 06:47:24 -07:00
}
2014-10-22 10:21:23 -07:00
chunkDescs [ i ] = newChunkDesc ( chunk )
2015-03-18 11:36:41 -07:00
chunksToPersist ++
2014-09-10 09:41:52 -07:00
}
}
2015-08-20 08:18:46 -07:00
fingerprintToSeries [ model . Fingerprint ( fp ) ] = & memorySeries {
metric : model . Metric ( metric ) ,
2015-03-08 18:33:10 -07:00
chunkDescs : chunkDescs ,
persistWatermark : int ( persistWatermark ) ,
2015-03-19 04:59:26 -07:00
modTime : modTime ,
2015-03-08 18:33:10 -07:00
chunkDescsOffset : int ( chunkDescsOffset ) ,
2015-08-20 08:18:46 -07:00
savedFirstTime : model . Time ( savedFirstTime ) ,
2015-07-13 12:12:27 -07:00
lastTime : chunkDescs [ len ( chunkDescs ) - 1 ] . lastTime ( ) ,
2015-03-08 18:33:10 -07:00
headChunkClosed : persistWatermark >= numChunkDescs ,
2014-09-10 09:41:52 -07:00
}
}
2015-03-18 11:36:41 -07:00
return sm , chunksToPersist , nil
2014-09-10 09:41:52 -07:00
}
2015-03-08 18:33:10 -07:00
// dropAndPersistChunks deletes all chunks from a series file whose last sample
// time is before beforeTime, and then appends the provided chunks, leaving out
// those whose last sample time is before beforeTime. It returns the timestamp
// of the first sample in the oldest chunk _not_ dropped, the offset within the
// series file of the first chunk persisted (out of the provided chunks), the
// number of deleted chunks, and true if all chunks of the series have been
// deleted (in which case the returned timestamp will be 0 and must be ignored).
// It is the caller's responsibility to make sure nothing is persisted or loaded
// for the same fingerprint concurrently.
func ( p * persistence ) dropAndPersistChunks (
2015-08-20 08:18:46 -07:00
fp model . Fingerprint , beforeTime model . Time , chunks [ ] chunk ,
2015-03-08 18:33:10 -07:00
) (
2015-08-20 08:18:46 -07:00
firstTimeNotDropped model . Time ,
2015-03-08 18:33:10 -07:00
offset int ,
2014-11-10 09:22:08 -08:00
numDropped int ,
allDropped bool ,
err error ,
) {
2015-03-08 18:33:10 -07:00
// Style note: With the many return values, it was decided to use naked
// returns in this method. They make the method more readable, but
// please handle with care!
2014-11-10 09:22:08 -08:00
defer func ( ) {
if err != nil {
2015-05-20 09:10:29 -07:00
log . Error ( "Error dropping and/or persisting chunks: " , err )
2014-11-10 09:22:08 -08:00
p . setDirty ( true )
}
} ( )
2015-03-08 18:33:10 -07:00
if len ( chunks ) > 0 {
// We have chunks to persist. First check if those are already
// too old. If that's the case, the chunks in the series file
// are all too old, too.
i := 0
2015-05-20 10:13:06 -07:00
for ; i < len ( chunks ) && chunks [ i ] . newIterator ( ) . lastTimestamp ( ) . Before ( beforeTime ) ; i ++ {
2015-03-08 18:33:10 -07:00
}
if i < len ( chunks ) {
firstTimeNotDropped = chunks [ i ] . firstTime ( )
}
if i > 0 || firstTimeNotDropped . Before ( beforeTime ) {
// Series file has to go.
if numDropped , err = p . deleteSeriesFile ( fp ) ; err != nil {
return
}
numDropped += i
if i == len ( chunks ) {
allDropped = true
return
}
// Now simply persist what has to be persisted to a new file.
_ , err = p . persistChunks ( fp , chunks [ i : ] )
return
}
}
// If we are here, we have to check the series file itself.
2014-06-06 02:55:53 -07:00
f , err := p . openChunkFileForReading ( fp )
if os . IsNotExist ( err ) {
2015-03-08 18:33:10 -07:00
// No series file. Only need to create new file with chunks to
// persist, if there are any.
if len ( chunks ) == 0 {
allDropped = true
err = nil // Do not report not-exist err.
return
}
offset , err = p . persistChunks ( fp , chunks )
return
2014-06-06 02:55:53 -07:00
}
if err != nil {
2015-03-08 18:33:10 -07:00
return
2014-06-06 02:55:53 -07:00
}
defer f . Close ( )
2016-01-11 07:42:10 -08:00
headerBuf := make ( [ ] byte , chunkHeaderLen )
var firstTimeInFile model . Time
2015-03-08 18:33:10 -07:00
// Find the first chunk in the file that should be kept.
for ; ; numDropped ++ {
_ , err = f . Seek ( offsetForChunkIndex ( numDropped ) , os . SEEK_SET )
2014-06-06 02:55:53 -07:00
if err != nil {
2015-03-08 18:33:10 -07:00
return
2014-06-06 02:55:53 -07:00
}
2015-04-13 11:20:26 -07:00
_ , err = io . ReadFull ( f , headerBuf )
2014-06-06 02:55:53 -07:00
if err == io . EOF {
// We ran into the end of the file without finding any chunks that should
// be kept. Remove the whole file.
2015-03-08 18:33:10 -07:00
if numDropped , err = p . deleteSeriesFile ( fp ) ; err != nil {
return
2014-06-06 02:55:53 -07:00
}
2015-03-08 18:33:10 -07:00
if len ( chunks ) == 0 {
allDropped = true
return
}
offset , err = p . persistChunks ( fp , chunks )
return
2014-06-06 02:55:53 -07:00
}
if err != nil {
2015-03-08 18:33:10 -07:00
return
2014-06-06 02:55:53 -07:00
}
2016-01-11 07:42:10 -08:00
if numDropped == 0 {
firstTimeInFile = model . Time (
binary . LittleEndian . Uint64 ( headerBuf [ chunkHeaderFirstTimeOffset : ] ) ,
)
}
2015-08-20 08:18:46 -07:00
lastTime := model . Time (
2015-03-08 18:33:10 -07:00
binary . LittleEndian . Uint64 ( headerBuf [ chunkHeaderLastTimeOffset : ] ) ,
)
2014-06-06 02:55:53 -07:00
if ! lastTime . Before ( beforeTime ) {
break
}
}
2016-01-11 07:42:10 -08:00
// We've found the first chunk that should be kept.
// First check if the shrink ratio is good enough to perform the the
// actual drop or leave it for next time if it is not worth the effort.
fi , err := f . Stat ( )
if err != nil {
return
}
totalChunks := int ( fi . Size ( ) ) / chunkLenWithHeader + len ( chunks )
if numDropped == 0 || float64 ( numDropped ) / float64 ( totalChunks ) < p . minShrinkRatio {
// Nothing to drop. Just adjust the return values and append the chunks (if any).
numDropped = 0
firstTimeNotDropped = firstTimeInFile
2015-03-08 18:33:10 -07:00
if len ( chunks ) > 0 {
offset , err = p . persistChunks ( fp , chunks )
}
return
}
2016-01-11 07:42:10 -08:00
// If we are here, we have to drop some chunks for real. So we need to
// record firstTimeNotDropped from the last read header, seek backwards
// to the beginning of its header, and start copying everything from
// there into a new file. Then append the chunks to the new file.
firstTimeNotDropped = model . Time (
binary . LittleEndian . Uint64 ( headerBuf [ chunkHeaderFirstTimeOffset : ] ) ,
)
chunkOps . WithLabelValues ( drop ) . Add ( float64 ( numDropped ) )
2015-03-18 11:09:07 -07:00
_ , err = f . Seek ( - chunkHeaderLen , os . SEEK_CUR )
2014-06-06 02:55:53 -07:00
if err != nil {
2015-03-08 18:33:10 -07:00
return
2014-06-06 02:55:53 -07:00
}
2014-10-15 08:07:12 -07:00
temp , err := os . OpenFile ( p . tempFileNameForFingerprint ( fp ) , os . O_WRONLY | os . O_CREATE , 0640 )
2014-06-06 02:55:53 -07:00
if err != nil {
2015-03-08 18:33:10 -07:00
return
2014-06-06 02:55:53 -07:00
}
2015-03-08 18:33:10 -07:00
defer func ( ) {
2015-03-19 07:41:50 -07:00
p . closeChunkFile ( temp )
2015-03-08 18:33:10 -07:00
if err == nil {
err = os . Rename ( p . tempFileNameForFingerprint ( fp ) , p . fileNameForFingerprint ( fp ) )
}
} ( )
2014-06-06 02:55:53 -07:00
2015-03-08 18:33:10 -07:00
written , err := io . Copy ( temp , f )
if err != nil {
return
2014-06-06 02:55:53 -07:00
}
2015-04-13 11:20:26 -07:00
offset = int ( written / chunkLenWithHeader )
2014-06-06 02:55:53 -07:00
2015-03-08 18:33:10 -07:00
if len ( chunks ) > 0 {
if err = writeChunks ( temp , chunks ) ; err != nil {
return
}
2014-10-27 12:40:48 -07:00
}
2015-03-08 18:33:10 -07:00
return
}
// deleteSeriesFile deletes a series file belonging to the provided
// fingerprint. It returns the number of chunks that were contained in the
// deleted file.
2015-08-20 08:18:46 -07:00
func ( p * persistence ) deleteSeriesFile ( fp model . Fingerprint ) ( int , error ) {
2015-03-08 18:33:10 -07:00
fname := p . fileNameForFingerprint ( fp )
fi , err := os . Stat ( fname )
if os . IsNotExist ( err ) {
// Great. The file is already gone.
return 0 , nil
}
if err != nil {
return - 1 , err
}
2015-04-13 11:20:26 -07:00
numChunks := int ( fi . Size ( ) / chunkLenWithHeader )
2015-03-08 18:33:10 -07:00
if err := os . Remove ( fname ) ; err != nil {
return - 1 , err
}
chunkOps . WithLabelValues ( drop ) . Add ( float64 ( numChunks ) )
return numChunks , nil
2014-09-10 09:41:52 -07:00
}
2015-05-20 10:13:06 -07:00
// seriesFileModTime returns the modification time of the series file belonging
// to the provided fingerprint. In case of an error, the zero value of time.Time
// is returned.
2015-08-20 08:18:46 -07:00
func ( p * persistence ) seriesFileModTime ( fp model . Fingerprint ) time . Time {
2015-03-19 04:59:26 -07:00
var modTime time . Time
if fi , err := os . Stat ( p . fileNameForFingerprint ( fp ) ) ; err == nil {
return fi . ModTime ( )
}
return modTime
}
2014-10-07 10:11:24 -07:00
// indexMetric queues the given metric for addition to the indexes needed by
2015-05-20 10:13:06 -07:00
// fingerprintsForLabelPair, labelValuesForLabelName, and
// fingerprintsModifiedBefore. If the queue is full, this method blocks until
// the metric can be queued. This method is goroutine-safe.
2015-08-20 08:18:46 -07:00
func ( p * persistence ) indexMetric ( fp model . Fingerprint , m model . Metric ) {
2014-09-23 10:21:10 -07:00
p . indexingQueue <- indexingOp { fp , m , add }
2014-06-06 02:55:53 -07:00
}
2014-10-07 10:11:24 -07:00
// unindexMetric queues references to the given metric for removal from the
2015-05-20 10:13:06 -07:00
// indexes used for fingerprintsForLabelPair, labelValuesForLabelName, and
// fingerprintsModifiedBefore. The index of fingerprints to archived metrics is
// not affected by this removal. (In fact, never call this method for an
2015-09-11 06:47:23 -07:00
// archived metric. To purge an archived metric, call purgeArchivedMetric.)
2014-10-07 10:11:24 -07:00
// If the queue is full, this method blocks until the metric can be queued. This
// method is goroutine-safe.
2015-08-20 08:18:46 -07:00
func ( p * persistence ) unindexMetric ( fp model . Fingerprint , m model . Metric ) {
2014-09-23 10:21:10 -07:00
p . indexingQueue <- indexingOp { fp , m , remove }
2014-09-10 09:41:52 -07:00
}
2014-10-07 10:11:24 -07:00
// waitForIndexing waits until all items in the indexing queue are processed. If
// queue processing is currently on hold (to gather more ops for batching), this
// method will trigger an immediate start of processing. This method is
// goroutine-safe.
func ( p * persistence ) waitForIndexing ( ) {
2014-09-24 07:32:07 -07:00
wait := make ( chan int )
for {
p . indexingFlush <- wait
if <- wait == 0 {
break
}
}
}
2014-10-07 10:11:24 -07:00
// archiveMetric persists the mapping of the given fingerprint to the given
// metric, together with the first and last timestamp of the series belonging to
2014-11-10 09:33:31 -08:00
// the metric. The caller must have locked the fingerprint.
2014-10-07 10:11:24 -07:00
func ( p * persistence ) archiveMetric (
2015-08-20 08:18:46 -07:00
fp model . Fingerprint , m model . Metric , first , last model . Time ,
2014-09-10 09:41:52 -07:00
) error {
2014-09-23 10:21:10 -07:00
if err := p . archivedFingerprintToMetrics . Put ( codable . Fingerprint ( fp ) , codable . Metric ( m ) ) ; err != nil {
2014-11-10 09:22:08 -08:00
p . setDirty ( true )
2014-09-14 06:33:56 -07:00
return err
}
2014-09-23 10:21:10 -07:00
if err := p . archivedFingerprintToTimeRange . Put ( codable . Fingerprint ( fp ) , codable . TimeRange { First : first , Last : last } ) ; err != nil {
2014-11-10 09:22:08 -08:00
p . setDirty ( true )
2014-09-14 06:33:56 -07:00
return err
}
2014-09-10 09:41:52 -07:00
return nil
}
2014-10-07 10:11:24 -07:00
// hasArchivedMetric returns whether the archived metric for the given
// fingerprint exists and if yes, what the first and last timestamp in the
// corresponding series is. This method is goroutine-safe.
2015-08-20 08:18:46 -07:00
func ( p * persistence ) hasArchivedMetric ( fp model . Fingerprint ) (
hasMetric bool , firstTime , lastTime model . Time , err error ,
2014-09-10 09:41:52 -07:00
) {
2014-09-16 06:47:24 -07:00
firstTime , lastTime , hasMetric , err = p . archivedFingerprintToTimeRange . Lookup ( fp )
2014-09-10 09:41:52 -07:00
return
}
2014-11-10 09:22:08 -08:00
// updateArchivedTimeRange updates an archived time range. The caller must make
// sure that the fingerprint is currently archived (the time range will
// otherwise be added without the corresponding metric in the archive).
func ( p * persistence ) updateArchivedTimeRange (
2015-08-20 08:18:46 -07:00
fp model . Fingerprint , first , last model . Time ,
2014-11-10 09:22:08 -08:00
) error {
return p . archivedFingerprintToTimeRange . Put ( codable . Fingerprint ( fp ) , codable . TimeRange { First : first , Last : last } )
}
2015-05-20 10:13:06 -07:00
// fingerprintsModifiedBefore returns the fingerprints of archived timeseries
2014-10-07 10:11:24 -07:00
// that have live samples before the provided timestamp. This method is
// goroutine-safe.
2015-08-20 08:18:46 -07:00
func ( p * persistence ) fingerprintsModifiedBefore ( beforeTime model . Time ) ( [ ] model . Fingerprint , error ) {
2014-09-24 07:32:07 -07:00
var fp codable . Fingerprint
var tr codable . TimeRange
2015-08-20 08:18:46 -07:00
fps := [ ] model . Fingerprint { }
2015-09-12 16:06:40 -07:00
err := p . archivedFingerprintToTimeRange . ForEach ( func ( kv index . KeyValueAccessor ) error {
2014-09-24 07:32:07 -07:00
if err := kv . Value ( & tr ) ; err != nil {
return err
}
if tr . First . Before ( beforeTime ) {
if err := kv . Key ( & fp ) ; err != nil {
return err
}
2015-08-20 08:18:46 -07:00
fps = append ( fps , model . Fingerprint ( fp ) )
2014-09-24 07:32:07 -07:00
}
return nil
} )
2015-09-12 16:06:40 -07:00
return fps , err
2014-09-24 07:32:07 -07:00
}
2015-05-20 10:13:06 -07:00
// archivedMetric retrieves the archived metric with the given fingerprint. This
// method is goroutine-safe.
2015-08-20 08:18:46 -07:00
func ( p * persistence ) archivedMetric ( fp model . Fingerprint ) ( model . Metric , error ) {
2014-09-16 06:47:24 -07:00
metric , _ , err := p . archivedFingerprintToMetrics . Lookup ( fp )
2014-09-14 06:33:56 -07:00
return metric , err
2014-09-10 09:41:52 -07:00
}
2015-02-26 06:19:44 -08:00
// purgeArchivedMetric deletes an archived fingerprint and its corresponding
2014-10-07 10:11:24 -07:00
// metric entirely. It also queues the metric for un-indexing (no need to call
2015-02-26 06:19:44 -08:00
// unindexMetric for the deleted metric.) It does not touch the series file,
// though. The caller must have locked the fingerprint.
2015-08-20 08:18:46 -07:00
func ( p * persistence ) purgeArchivedMetric ( fp model . Fingerprint ) ( err error ) {
2014-11-10 09:22:08 -08:00
defer func ( ) {
if err != nil {
p . setDirty ( true )
}
} ( )
2015-05-20 10:13:06 -07:00
metric , err := p . archivedMetric ( fp )
2014-09-14 06:33:56 -07:00
if err != nil || metric == nil {
return err
}
2015-01-29 03:57:50 -08:00
deleted , err := p . archivedFingerprintToMetrics . Delete ( codable . Fingerprint ( fp ) )
if err != nil {
2014-09-14 06:33:56 -07:00
return err
}
2015-01-29 03:57:50 -08:00
if ! deleted {
2015-05-20 09:10:29 -07:00
log . Errorf ( "Tried to delete non-archived fingerprint %s from archivedFingerprintToMetrics index. This should never happen." , fp )
2015-01-29 03:57:50 -08:00
}
deleted , err = p . archivedFingerprintToTimeRange . Delete ( codable . Fingerprint ( fp ) )
if err != nil {
2014-09-14 06:33:56 -07:00
return err
}
2015-01-29 03:57:50 -08:00
if ! deleted {
2015-05-20 09:10:29 -07:00
log . Errorf ( "Tried to delete non-archived fingerprint %s from archivedFingerprintToTimeRange index. This should never happen." , fp )
2015-01-29 03:57:50 -08:00
}
2014-10-28 11:01:41 -07:00
p . unindexMetric ( fp , metric )
2014-09-23 10:21:10 -07:00
return nil
2014-06-06 02:55:53 -07:00
}
2014-08-21 13:06:11 -07:00
2014-10-07 10:11:24 -07:00
// unarchiveMetric deletes an archived fingerprint and its metric, but (in
2015-02-26 06:19:44 -08:00
// contrast to purgeArchivedMetric) does not un-index the metric. If a metric
2015-07-13 12:12:27 -07:00
// was actually deleted, the method returns true and the first time and last
// time of the deleted metric. The caller must have locked the fingerprint.
2015-08-20 08:18:46 -07:00
func ( p * persistence ) unarchiveMetric ( fp model . Fingerprint ) ( deletedAnything bool , err error ) {
2014-11-10 09:33:31 -08:00
defer func ( ) {
if err != nil {
p . setDirty ( true )
}
} ( )
2014-10-07 10:11:24 -07:00
2015-01-29 03:57:50 -08:00
deleted , err := p . archivedFingerprintToMetrics . Delete ( codable . Fingerprint ( fp ) )
2015-07-13 12:12:27 -07:00
if err != nil || ! deleted {
return false , err
2015-01-29 03:57:50 -08:00
}
deleted , err = p . archivedFingerprintToTimeRange . Delete ( codable . Fingerprint ( fp ) )
if err != nil {
2015-07-13 12:12:27 -07:00
return false , err
2014-09-14 06:33:56 -07:00
}
2015-01-29 03:57:50 -08:00
if ! deleted {
2015-05-20 09:10:29 -07:00
log . Errorf ( "Tried to delete non-archived fingerprint %s from archivedFingerprintToTimeRange index. This should never happen." , fp )
2015-01-29 03:57:50 -08:00
}
2015-07-13 12:12:27 -07:00
return true , nil
2014-09-10 09:41:52 -07:00
}
2014-10-07 10:11:24 -07:00
// close flushes the indexing queue and other buffered data and releases any
2014-11-05 11:02:45 -08:00
// held resources. It also removes the dirty marker file if successful and if
// the persistence is currently not marked as dirty.
2014-10-07 10:11:24 -07:00
func ( p * persistence ) close ( ) error {
2014-09-23 10:21:10 -07:00
close ( p . indexingQueue )
<- p . indexingStopped
2015-01-14 07:52:09 -08:00
var lastError , dirtyFileRemoveError error
2014-09-16 06:47:24 -07:00
if err := p . archivedFingerprintToMetrics . Close ( ) ; err != nil {
2014-09-10 09:41:52 -07:00
lastError = err
2015-05-20 09:10:29 -07:00
log . Error ( "Error closing archivedFingerprintToMetric index DB: " , err )
2014-09-10 09:41:52 -07:00
}
2014-09-16 06:47:24 -07:00
if err := p . archivedFingerprintToTimeRange . Close ( ) ; err != nil {
2014-09-10 09:41:52 -07:00
lastError = err
2015-05-20 09:10:29 -07:00
log . Error ( "Error closing archivedFingerprintToTimeRange index DB: " , err )
2014-09-10 09:41:52 -07:00
}
2014-09-16 06:47:24 -07:00
if err := p . labelPairToFingerprints . Close ( ) ; err != nil {
2014-09-10 09:41:52 -07:00
lastError = err
2015-05-20 09:10:29 -07:00
log . Error ( "Error closing labelPairToFingerprints index DB: " , err )
2014-09-10 09:41:52 -07:00
}
2014-09-16 06:47:24 -07:00
if err := p . labelNameToLabelValues . Close ( ) ; err != nil {
2014-09-10 09:41:52 -07:00
lastError = err
2015-05-20 09:10:29 -07:00
log . Error ( "Error closing labelNameToLabelValues index DB: " , err )
2014-09-10 09:41:52 -07:00
}
2014-11-05 11:02:45 -08:00
if lastError == nil && ! p . isDirty ( ) {
2015-01-14 07:52:09 -08:00
dirtyFileRemoveError = os . Remove ( p . dirtyFileName )
}
if err := p . fLock . Release ( ) ; err != nil {
lastError = err
2015-05-20 09:10:29 -07:00
log . Error ( "Error releasing file lock: " , err )
2015-01-14 07:52:09 -08:00
}
if dirtyFileRemoveError != nil {
// On Windows, removing the dirty file before unlocking is not
// possible. So remove it here if it failed above.
lastError = os . Remove ( p . dirtyFileName )
2014-11-05 11:02:45 -08:00
}
2014-09-10 09:41:52 -07:00
return lastError
}
2015-08-20 08:18:46 -07:00
func ( p * persistence ) dirNameForFingerprint ( fp model . Fingerprint ) string {
2014-09-10 09:41:52 -07:00
fpStr := fp . String ( )
2014-11-20 12:03:51 -08:00
return path . Join ( p . basePath , fpStr [ 0 : seriesDirNameLen ] )
2014-10-15 08:07:12 -07:00
}
2015-08-20 08:18:46 -07:00
func ( p * persistence ) fileNameForFingerprint ( fp model . Fingerprint ) string {
2014-10-15 08:07:12 -07:00
fpStr := fp . String ( )
2014-11-20 12:03:51 -08:00
return path . Join ( p . basePath , fpStr [ 0 : seriesDirNameLen ] , fpStr [ seriesDirNameLen : ] + seriesFileSuffix )
2014-10-15 08:07:12 -07:00
}
2015-08-20 08:18:46 -07:00
func ( p * persistence ) tempFileNameForFingerprint ( fp model . Fingerprint ) string {
2014-10-15 08:07:12 -07:00
fpStr := fp . String ( )
2014-11-20 12:03:51 -08:00
return path . Join ( p . basePath , fpStr [ 0 : seriesDirNameLen ] , fpStr [ seriesDirNameLen : ] + seriesTempFileSuffix )
2014-09-10 09:41:52 -07:00
}
2015-08-20 08:18:46 -07:00
func ( p * persistence ) openChunkFileForWriting ( fp model . Fingerprint ) ( * os . File , error ) {
2014-10-15 08:07:12 -07:00
if err := os . MkdirAll ( p . dirNameForFingerprint ( fp ) , 0700 ) ; err != nil {
2014-09-10 09:41:52 -07:00
return nil , err
}
2015-01-26 04:48:24 -08:00
return os . OpenFile ( p . fileNameForFingerprint ( fp ) , os . O_WRONLY | os . O_APPEND | os . O_CREATE , 0640 )
// NOTE: Although the file was opened for append,
// f.Seek(0, os.SEEK_CUR)
// would now return '0, nil', so we cannot check for a consistent file length right now.
2015-03-08 18:33:10 -07:00
// However, the chunkIndexForOffset function is doing that check, so a wrong file length
2015-01-26 04:48:24 -08:00
// would still be detected.
2014-09-10 09:41:52 -07:00
}
2015-03-19 09:54:59 -07:00
// closeChunkFile first syncs the provided file if mandated so by the sync
2015-03-19 07:41:50 -07:00
// strategy. Then it closes the file. Errors are logged.
func ( p * persistence ) closeChunkFile ( f * os . File ) {
if p . shouldSync ( ) {
if err := f . Sync ( ) ; err != nil {
2015-05-20 09:10:29 -07:00
log . Error ( "Error syncing file:" , err )
2015-03-19 07:41:50 -07:00
}
}
if err := f . Close ( ) ; err != nil {
2015-05-20 09:10:29 -07:00
log . Error ( "Error closing chunk file:" , err )
2015-03-19 07:41:50 -07:00
}
}
2015-08-20 08:18:46 -07:00
func ( p * persistence ) openChunkFileForReading ( fp model . Fingerprint ) ( * os . File , error ) {
2014-10-15 08:07:12 -07:00
return os . Open ( p . fileNameForFingerprint ( fp ) )
2014-09-10 09:41:52 -07:00
}
2014-10-24 11:27:27 -07:00
func ( p * persistence ) headsFileName ( ) string {
2014-09-10 09:41:52 -07:00
return path . Join ( p . basePath , headsFileName )
}
2014-10-24 11:27:27 -07:00
func ( p * persistence ) headsTempFileName ( ) string {
return path . Join ( p . basePath , headsTempFileName )
}
2015-05-06 07:53:12 -07:00
func ( p * persistence ) mappingsFileName ( ) string {
return path . Join ( p . basePath , mappingsFileName )
}
func ( p * persistence ) mappingsTempFileName ( ) string {
return path . Join ( p . basePath , mappingsTempFileName )
}
2014-10-07 10:11:24 -07:00
func ( p * persistence ) processIndexingQueue ( ) {
2014-09-23 10:21:10 -07:00
batchSize := 0
nameToValues := index . LabelNameLabelValuesMapping { }
pairToFPs := index . LabelPairFingerprintsMapping { }
batchTimeout := time . NewTimer ( indexingBatchTimeout )
defer batchTimeout . Stop ( )
commitBatch := func ( ) {
2014-09-24 07:51:18 -07:00
p . indexingBatchSizes . Observe ( float64 ( batchSize ) )
2014-09-24 08:18:48 -07:00
defer func ( begin time . Time ) {
2015-03-19 09:06:16 -07:00
p . indexingBatchDuration . Observe (
float64 ( time . Since ( begin ) ) / float64 ( time . Millisecond ) ,
)
2014-09-24 08:18:48 -07:00
} ( time . Now ( ) )
2014-09-24 07:51:18 -07:00
2014-09-23 10:21:10 -07:00
if err := p . labelPairToFingerprints . IndexBatch ( pairToFPs ) ; err != nil {
2015-05-20 09:10:29 -07:00
log . Error ( "Error indexing label pair to fingerprints batch: " , err )
2014-09-23 10:21:10 -07:00
}
if err := p . labelNameToLabelValues . IndexBatch ( nameToValues ) ; err != nil {
2015-05-20 09:10:29 -07:00
log . Error ( "Error indexing label name to label values batch: " , err )
2014-09-23 10:21:10 -07:00
}
batchSize = 0
nameToValues = index . LabelNameLabelValuesMapping { }
pairToFPs = index . LabelPairFingerprintsMapping { }
2014-09-24 05:41:38 -07:00
batchTimeout . Reset ( indexingBatchTimeout )
2014-09-23 10:21:10 -07:00
}
2014-09-24 05:41:38 -07:00
var flush chan chan int
2014-09-23 10:21:10 -07:00
loop :
for {
2014-09-24 05:41:38 -07:00
// Only process flush requests if the queue is currently empty.
if len ( p . indexingQueue ) == 0 {
flush = p . indexingFlush
} else {
flush = nil
}
2014-09-23 10:21:10 -07:00
select {
case <- batchTimeout . C :
2014-10-17 04:55:54 -07:00
// Only commit if we have something to commit _and_
// nothing is waiting in the queue to be picked up. That
// prevents a death spiral if the LookupSet calls below
// are slow for some reason.
if batchSize > 0 && len ( p . indexingQueue ) == 0 {
2014-09-24 05:41:38 -07:00
commitBatch ( )
} else {
batchTimeout . Reset ( indexingBatchTimeout )
}
case r := <- flush :
2014-09-23 10:21:10 -07:00
if batchSize > 0 {
commitBatch ( )
}
2014-09-24 05:41:38 -07:00
r <- len ( p . indexingQueue )
2014-09-23 10:21:10 -07:00
case op , ok := <- p . indexingQueue :
if ! ok {
if batchSize > 0 {
commitBatch ( )
}
break loop
}
batchSize ++
for ln , lv := range op . metric {
2015-08-22 04:32:13 -07:00
lp := model . LabelPair { Name : ln , Value : lv }
2014-09-23 10:21:10 -07:00
baseFPs , ok := pairToFPs [ lp ]
if ! ok {
var err error
baseFPs , _ , err = p . labelPairToFingerprints . LookupSet ( lp )
if err != nil {
2015-05-20 09:10:29 -07:00
log . Errorf ( "Error looking up label pair %v: %s" , lp , err )
2014-09-23 10:21:10 -07:00
continue
}
pairToFPs [ lp ] = baseFPs
}
baseValues , ok := nameToValues [ ln ]
if ! ok {
var err error
baseValues , _ , err = p . labelNameToLabelValues . LookupSet ( ln )
if err != nil {
2015-05-20 09:10:29 -07:00
log . Errorf ( "Error looking up label name %v: %s" , ln , err )
2014-09-23 10:21:10 -07:00
continue
}
nameToValues [ ln ] = baseValues
}
switch op . opType {
case add :
baseFPs [ op . fingerprint ] = struct { } { }
baseValues [ lv ] = struct { } { }
case remove :
delete ( baseFPs , op . fingerprint )
if len ( baseFPs ) == 0 {
delete ( baseValues , lv )
}
default :
panic ( "unknown op type" )
}
}
if batchSize >= indexingMaxBatchSize {
commitBatch ( )
}
}
}
close ( p . indexingStopped )
}
2015-03-08 18:33:10 -07:00
2015-05-06 07:53:12 -07:00
// checkpointFPMappings persists the fingerprint mappings. This method is not
// goroutine-safe.
//
// Description of the file format, v1:
//
// (1) Magic string (const mappingsMagicString).
//
// (2) Uvarint-encoded format version (const mappingsFormatVersion).
//
// (3) Uvarint-encoded number of mappings in fpMappings.
//
// (4) Repeated once per mapping:
//
// (4.1) The raw fingerprint as big-endian uint64.
//
// (4.2) The uvarint-encoded number of sub-mappings for the raw fingerprint.
//
// (4.3) Repeated once per sub-mapping:
//
// (4.3.1) The uvarint-encoded length of the unique metric string.
// (4.3.2) The unique metric string.
// (4.3.3) The mapped fingerprint as big-endian uint64.
2015-05-07 09:58:14 -07:00
func ( p * persistence ) checkpointFPMappings ( fpm fpMappings ) ( err error ) {
2015-05-20 09:10:29 -07:00
log . Info ( "Checkpointing fingerprint mappings..." )
2015-05-06 07:53:12 -07:00
begin := time . Now ( )
f , err := os . OpenFile ( p . mappingsTempFileName ( ) , os . O_WRONLY | os . O_TRUNC | os . O_CREATE , 0640 )
if err != nil {
return
}
defer func ( ) {
2015-09-12 16:06:40 -07:00
syncErr := f . Sync ( )
2015-05-06 07:53:12 -07:00
closeErr := f . Close ( )
2015-09-12 16:06:40 -07:00
if err != nil {
return
}
err = syncErr
2015-05-06 07:53:12 -07:00
if err != nil {
return
}
err = closeErr
if err != nil {
return
}
err = os . Rename ( p . mappingsTempFileName ( ) , p . mappingsFileName ( ) )
duration := time . Since ( begin )
2015-05-20 09:10:29 -07:00
log . Infof ( "Done checkpointing fingerprint mappings in %v." , duration )
2015-05-06 07:53:12 -07:00
} ( )
w := bufio . NewWriterSize ( f , fileBufSize )
if _ , err = w . WriteString ( mappingsMagicString ) ; err != nil {
return
}
if _ , err = codable . EncodeUvarint ( w , mappingsFormatVersion ) ; err != nil {
return
}
2015-05-07 09:58:14 -07:00
if _ , err = codable . EncodeUvarint ( w , uint64 ( len ( fpm ) ) ) ; err != nil {
2015-05-06 07:53:12 -07:00
return
}
2015-05-07 09:58:14 -07:00
for fp , mappings := range fpm {
2015-05-06 07:53:12 -07:00
if err = codable . EncodeUint64 ( w , uint64 ( fp ) ) ; err != nil {
return
}
if _ , err = codable . EncodeUvarint ( w , uint64 ( len ( mappings ) ) ) ; err != nil {
return
}
for ms , mappedFP := range mappings {
if _ , err = codable . EncodeUvarint ( w , uint64 ( len ( ms ) ) ) ; err != nil {
return
}
if _ , err = w . WriteString ( ms ) ; err != nil {
return
}
if err = codable . EncodeUint64 ( w , uint64 ( mappedFP ) ) ; err != nil {
return
}
}
}
err = w . Flush ( )
return
}
// loadFPMappings loads the fingerprint mappings. It also returns the highest
// mapped fingerprint and any error encountered. If p.mappingsFileName is not
// found, the method returns (fpMappings{}, 0, nil). Do not call concurrently
// with checkpointFPMappings.
2015-08-20 08:18:46 -07:00
func ( p * persistence ) loadFPMappings ( ) ( fpMappings , model . Fingerprint , error ) {
2015-05-06 07:53:12 -07:00
fpm := fpMappings { }
2015-08-20 08:18:46 -07:00
var highestMappedFP model . Fingerprint
2015-05-06 07:53:12 -07:00
f , err := os . Open ( p . mappingsFileName ( ) )
if os . IsNotExist ( err ) {
return fpm , 0 , nil
}
if err != nil {
return nil , 0 , err
}
defer f . Close ( )
r := bufio . NewReaderSize ( f , fileBufSize )
buf := make ( [ ] byte , len ( mappingsMagicString ) )
if _ , err := io . ReadFull ( r , buf ) ; err != nil {
return nil , 0 , err
}
magic := string ( buf )
if magic != mappingsMagicString {
return nil , 0 , fmt . Errorf (
"unexpected magic string, want %q, got %q" ,
mappingsMagicString , magic ,
)
}
version , err := binary . ReadUvarint ( r )
if version != mappingsFormatVersion || err != nil {
return nil , 0 , fmt . Errorf ( "unknown fingerprint mappings format version, want %d" , mappingsFormatVersion )
}
numRawFPs , err := binary . ReadUvarint ( r )
if err != nil {
return nil , 0 , err
}
for ; numRawFPs > 0 ; numRawFPs -- {
rawFP , err := codable . DecodeUint64 ( r )
if err != nil {
return nil , 0 , err
}
numMappings , err := binary . ReadUvarint ( r )
if err != nil {
return nil , 0 , err
}
2015-08-20 08:18:46 -07:00
mappings := make ( map [ string ] model . Fingerprint , numMappings )
2015-05-06 07:53:12 -07:00
for ; numMappings > 0 ; numMappings -- {
lenMS , err := binary . ReadUvarint ( r )
if err != nil {
return nil , 0 , err
}
buf := make ( [ ] byte , lenMS )
if _ , err := io . ReadFull ( r , buf ) ; err != nil {
return nil , 0 , err
}
fp , err := codable . DecodeUint64 ( r )
if err != nil {
return nil , 0 , err
}
2015-08-20 08:18:46 -07:00
mappedFP := model . Fingerprint ( fp )
2015-05-06 07:53:12 -07:00
if mappedFP > highestMappedFP {
highestMappedFP = mappedFP
}
mappings [ string ( buf ) ] = mappedFP
}
2015-08-20 08:18:46 -07:00
fpm [ model . Fingerprint ( rawFP ) ] = mappings
2015-05-06 07:53:12 -07:00
}
return fpm , highestMappedFP , nil
}
2015-03-08 18:33:10 -07:00
func offsetForChunkIndex ( i int ) int64 {
2015-04-13 11:20:26 -07:00
return int64 ( i * chunkLenWithHeader )
2015-03-08 18:33:10 -07:00
}
func chunkIndexForOffset ( offset int64 ) ( int , error ) {
2015-04-13 11:20:26 -07:00
if int ( offset ) % chunkLenWithHeader != 0 {
2015-03-08 18:33:10 -07:00
return - 1 , fmt . Errorf (
"offset %d is not a multiple of on-disk chunk length %d" ,
2015-04-13 11:20:26 -07:00
offset , chunkLenWithHeader ,
2015-03-08 18:33:10 -07:00
)
}
2015-04-13 11:20:26 -07:00
return int ( offset ) / chunkLenWithHeader , nil
2015-03-08 18:33:10 -07:00
}
func writeChunkHeader ( w io . Writer , c chunk ) error {
header := make ( [ ] byte , chunkHeaderLen )
header [ chunkHeaderTypeOffset ] = byte ( c . encoding ( ) )
2015-04-14 04:46:38 -07:00
binary . LittleEndian . PutUint64 (
header [ chunkHeaderFirstTimeOffset : ] ,
uint64 ( c . firstTime ( ) ) ,
)
binary . LittleEndian . PutUint64 (
header [ chunkHeaderLastTimeOffset : ] ,
2015-05-20 10:13:06 -07:00
uint64 ( c . newIterator ( ) . lastTimestamp ( ) ) ,
2015-04-14 04:46:38 -07:00
)
2015-03-08 18:33:10 -07:00
_ , err := w . Write ( header )
return err
}
func writeChunks ( w io . Writer , chunks [ ] chunk ) error {
2015-04-13 11:20:26 -07:00
b := bufio . NewWriterSize ( w , len ( chunks ) * chunkLenWithHeader )
2015-03-08 18:33:10 -07:00
for _ , chunk := range chunks {
if err := writeChunkHeader ( b , chunk ) ; err != nil {
return err
}
if err := chunk . marshal ( b ) ; err != nil {
return err
}
}
return b . Flush ( )
}