2013-02-08 09:03:26 -08:00
// Copyright 2013 Prometheus Team
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package metric
import (
"fmt"
2013-03-01 09:51:36 -08:00
"github.com/prometheus/prometheus/coding"
"github.com/prometheus/prometheus/coding/indexable"
2013-02-08 09:03:26 -08:00
"github.com/prometheus/prometheus/model"
2013-03-01 09:51:36 -08:00
dto "github.com/prometheus/prometheus/model/generated"
2013-02-08 09:03:26 -08:00
"github.com/prometheus/prometheus/storage"
2013-03-25 02:24:59 -07:00
"github.com/prometheus/prometheus/storage/raw/leveldb"
2013-03-06 17:16:39 -08:00
"sort"
2013-02-08 09:03:26 -08:00
"sync"
"time"
)
// tieredStorage both persists samples and generates materialized views for
// queries.
type tieredStorage struct {
appendToDiskQueue chan model . Sample
appendToMemoryQueue chan model . Sample
2013-03-01 09:51:36 -08:00
diskFrontier * diskFrontier
2013-02-08 09:03:26 -08:00
diskStorage * LevelDBMetricPersistence
2013-03-21 09:53:57 -07:00
draining chan chan bool
2013-02-08 09:03:26 -08:00
flushMemoryInterval time . Duration
memoryArena memorySeriesStorage
memoryTTL time . Duration
mutex sync . Mutex
viewQueue chan viewJob
writeMemoryInterval time . Duration
}
// viewJob encapsulates a request to extract sample values from the datastore.
// MakeView creates one per request and waits on its channels for the outcome.
type viewJob struct {
	// builder describes the scans to perform.
	builder ViewRequestBuilder
	// output receives the materialized view.
	output chan View
	// err is the channel on which MakeView listens for a failure.
	err chan error
}
2013-03-01 09:51:36 -08:00
// Provides a unified means for batch appending values into the datastore along
// with querying for values in an efficient way.
2013-02-08 09:03:26 -08:00
type Storage interface {
2013-03-01 09:51:36 -08:00
// Enqueues a Sample for storage.
AppendSample ( model . Sample ) error
// Enqueus a ViewRequestBuilder for materialization, subject to a timeout.
MakeView ( request ViewRequestBuilder , timeout time . Duration ) ( View , error )
// Starts serving requests.
2013-02-08 09:03:26 -08:00
Serve ( )
2013-03-01 09:51:36 -08:00
// Stops the storage subsystem, flushing all pending operations.
Drain ( )
2013-03-06 17:16:39 -08:00
Flush ( )
2013-03-11 14:21:25 -07:00
Close ( )
2013-03-21 09:59:42 -07:00
// MetricPersistence proxy methods.
GetAllMetricNames ( ) ( [ ] string , error )
GetFingerprintsForLabelSet ( model . LabelSet ) ( model . Fingerprints , error )
GetMetricForFingerprint ( model . Fingerprint ) ( m * model . Metric , err error )
2013-02-08 09:03:26 -08:00
}
2013-03-06 17:16:39 -08:00
func NewTieredStorage ( appendToMemoryQueueDepth , appendToDiskQueueDepth , viewQueueDepth uint , flushMemoryInterval , writeMemoryInterval , memoryTTL time . Duration , root string ) Storage {
diskStorage , err := NewLevelDBMetricPersistence ( root )
2013-02-08 09:03:26 -08:00
if err != nil {
panic ( err )
}
return & tieredStorage {
appendToDiskQueue : make ( chan model . Sample , appendToDiskQueueDepth ) ,
appendToMemoryQueue : make ( chan model . Sample , appendToMemoryQueueDepth ) ,
diskStorage : diskStorage ,
2013-03-21 09:53:57 -07:00
draining : make ( chan chan bool ) ,
2013-02-08 09:03:26 -08:00
flushMemoryInterval : flushMemoryInterval ,
memoryArena : NewMemorySeriesStorage ( ) ,
memoryTTL : memoryTTL ,
viewQueue : make ( chan viewJob , viewQueueDepth ) ,
writeMemoryInterval : writeMemoryInterval ,
}
}
2013-03-01 09:51:36 -08:00
func ( t * tieredStorage ) AppendSample ( s model . Sample ) ( err error ) {
if len ( t . draining ) > 0 {
return fmt . Errorf ( "Storage is in the process of draining." )
}
2013-02-08 09:03:26 -08:00
t . appendToMemoryQueue <- s
2013-03-01 09:51:36 -08:00
return
}
func ( t * tieredStorage ) Drain ( ) {
2013-03-21 09:53:57 -07:00
drainingDone := make ( chan bool )
2013-03-06 17:16:39 -08:00
if len ( t . draining ) == 0 {
2013-03-21 09:53:57 -07:00
t . draining <- drainingDone
2013-03-06 17:16:39 -08:00
}
2013-03-21 09:53:57 -07:00
<- drainingDone
2013-02-08 09:03:26 -08:00
}
func ( t * tieredStorage ) MakeView ( builder ViewRequestBuilder , deadline time . Duration ) ( view View , err error ) {
2013-03-01 09:51:36 -08:00
if len ( t . draining ) > 0 {
err = fmt . Errorf ( "Storage is in the process of draining." )
return
}
2013-02-08 09:03:26 -08:00
result := make ( chan View )
errChan := make ( chan error )
t . viewQueue <- viewJob {
builder : builder ,
output : result ,
err : errChan ,
}
select {
case value := <- result :
view = value
case err = <- errChan :
return
case <- time . After ( deadline ) :
err = fmt . Errorf ( "MakeView timed out after %s." , deadline )
}
return
}
2013-03-01 09:51:36 -08:00
func ( t * tieredStorage ) rebuildDiskFrontier ( ) ( err error ) {
begin := time . Now ( )
defer func ( ) {
2013-03-11 14:21:25 -07:00
duration := time . Since ( begin )
2013-02-08 09:03:26 -08:00
2013-03-01 09:51:36 -08:00
recordOutcome ( duration , err , map [ string ] string { operation : appendSample , result : success } , map [ string ] string { operation : rebuildDiskFrontier , result : failure } )
} ( )
2013-03-25 02:24:59 -07:00
i := t . diskStorage . metricSamples . NewIterator ( true )
defer i . Close ( )
2013-03-01 09:51:36 -08:00
t . diskFrontier , err = newDiskFrontier ( i )
if err != nil {
panic ( err )
2013-02-08 09:03:26 -08:00
}
2013-03-01 09:51:36 -08:00
return
2013-02-08 09:03:26 -08:00
}
func ( t * tieredStorage ) Serve ( ) {
var (
flushMemoryTicker = time . Tick ( t . flushMemoryInterval )
writeMemoryTicker = time . Tick ( t . writeMemoryInterval )
)
2013-03-01 09:51:36 -08:00
2013-02-08 09:03:26 -08:00
for {
2013-03-06 17:16:39 -08:00
t . reportQueues ( )
2013-02-08 09:03:26 -08:00
select {
case <- writeMemoryTicker :
t . writeMemory ( )
case <- flushMemoryTicker :
t . flushMemory ( )
case viewRequest := <- t . viewQueue :
t . renderView ( viewRequest )
2013-03-21 09:53:57 -07:00
case drainingDone := <- t . draining :
2013-03-01 09:51:36 -08:00
t . flush ( )
2013-03-21 09:53:57 -07:00
drainingDone <- true
2013-03-06 17:16:39 -08:00
break
2013-02-08 09:03:26 -08:00
}
}
}
2013-03-06 17:16:39 -08:00
// reportQueues publishes the current occupancy and capacity of the disk
// append, memory append and view generation queues to the queueSizes metric.
func (t *tieredStorage) reportQueues() {
	report := func(queue string, occupancy, capacity int) {
		queueSizes.Set(map[string]string{"queue": queue, "facet": "occupancy"}, float64(occupancy))
		queueSizes.Set(map[string]string{"queue": queue, "facet": "capacity"}, float64(capacity))
	}

	report("append_to_disk", len(t.appendToDiskQueue), cap(t.appendToDiskQueue))
	report("append_to_memory", len(t.appendToMemoryQueue), cap(t.appendToMemoryQueue))
	report("view_generation", len(t.viewQueue), cap(t.viewQueue))
}
2013-02-08 09:03:26 -08:00
func ( t * tieredStorage ) writeMemory ( ) {
2013-03-01 09:51:36 -08:00
begin := time . Now ( )
defer func ( ) {
2013-03-11 14:21:25 -07:00
duration := time . Since ( begin )
2013-03-01 09:51:36 -08:00
recordOutcome ( duration , nil , map [ string ] string { operation : appendSample , result : success } , map [ string ] string { operation : writeMemory , result : failure } )
} ( )
2013-02-08 09:03:26 -08:00
t . mutex . Lock ( )
defer t . mutex . Unlock ( )
pendingLength := len ( t . appendToMemoryQueue )
for i := 0 ; i < pendingLength ; i ++ {
t . memoryArena . AppendSample ( <- t . appendToMemoryQueue )
}
}
2013-03-06 17:16:39 -08:00
// Flush is the exported entry point to flush; the result of flush is
// discarded.
func (t *tieredStorage) Flush() {
	_ = t.flush()
}
2013-03-11 14:21:25 -07:00
// Close drains the storage via Drain and then closes the disk persistence.
// Because Drain waits for the Serve loop, Close blocks until Serve has
// acknowledged the drain.
func (t *tieredStorage) Close() {
	t.Drain()

	disk := t.diskStorage
	disk.Close()
}
2013-02-08 09:03:26 -08:00
// Write all pending appends.
func ( t * tieredStorage ) flush ( ) ( err error ) {
2013-03-11 14:21:25 -07:00
// Trim any old values to reduce iterative write costs.
2013-03-07 11:01:32 -08:00
t . flushMemory ( )
2013-02-08 09:03:26 -08:00
t . writeMemory ( )
t . flushMemory ( )
return
}
// memoryToDiskFlusher carries the state of one memory-to-disk flush pass.
type memoryToDiskFlusher struct {
	// toDiskQueue stages samples before they are appended to disk in a batch.
	toDiskQueue chan model.Sample
	// disk is the persistence that receives the batches.
	disk MetricPersistence
	// olderThan is the cutoff; the visitor's Filter accepts only records
	// timestamped before it.
	olderThan time.Time
	// valuesAccepted and valuesRejected count the Filter decisions.
	valuesAccepted int
	valuesRejected int
}
// memoryToDiskFlusherVisitor binds a memoryToDiskFlusher to a single stream
// and serves as that stream's record decoder, filter and operator.
type memoryToDiskFlusherVisitor struct {
	// stream is the series being visited.
	stream stream
	// flusher is the shared flush state updated by the visitor.
	flusher *memoryToDiskFlusher
}
// DecodeKey converts a skipListTime key into a time.Time. It panics if in is
// not a skipListTime; the returned error is always nil.
func (f memoryToDiskFlusherVisitor) DecodeKey(in interface{}) (out interface{}, err error) {
	key := in.(skipListTime)

	return time.Time(key), nil
}
// DecodeValue unwraps a stored value through its get method. It panics if in
// is not a value; the returned error is always nil.
func (f memoryToDiskFlusherVisitor) DecodeValue(in interface{}) (out interface{}, err error) {
	wrapped := in.(value)

	return wrapped.get(), nil
}
// Filter accepts records timestamped strictly before the flusher's cutoff and
// answers STOP for the first record at or after it, updating the flusher's
// accepted/rejected counters. key must be a time.Time.
func (f memoryToDiskFlusherVisitor) Filter(key, value interface{}) (filterResult storage.FilterResult) {
	if recordTime := key.(time.Time); !recordTime.Before(f.flusher.olderThan) {
		f.flusher.valuesRejected++

		return storage.STOP
	}

	f.flusher.valuesAccepted++

	return storage.ACCEPT
}
// Operate moves one record from the in-memory stream onto the flusher's disk
// queue and then deletes it from the stream. If the queue is full it is
// flushed to disk first. key must be a time.Time and value a
// model.SampleValue. The returned error is always nil.
func (f memoryToDiskFlusherVisitor) Operate(key, value interface{}) (err *storage.OperatorError) {
	sample := model.Sample{
		Metric:    f.stream.metric,
		Timestamp: key.(time.Time),
		Value:     value.(model.SampleValue),
	}

	// Make room before sending so the send below does not block on a full
	// queue.
	queue := f.flusher.toDiskQueue
	if len(queue) == cap(queue) {
		f.flusher.Flush()
	}
	queue <- sample

	f.stream.values.Delete(skipListTime(sample.Timestamp))

	return nil
}
func ( f * memoryToDiskFlusher ) ForStream ( stream stream ) ( decoder storage . RecordDecoder , filter storage . RecordFilter , operator storage . RecordOperator ) {
visitor := memoryToDiskFlusherVisitor {
stream : stream ,
flusher : f ,
}
2013-03-01 09:51:36 -08:00
// fmt.Printf("fingerprint -> %s\n", model.NewFingerprintFromMetric(stream.metric).ToRowKey())
2013-02-08 09:03:26 -08:00
return visitor , visitor , visitor
}
func ( f * memoryToDiskFlusher ) Flush ( ) {
length := len ( f . toDiskQueue )
samples := model . Samples { }
for i := 0 ; i < length ; i ++ {
samples = append ( samples , <- f . toDiskQueue )
}
2013-03-07 11:01:32 -08:00
start := time . Now ( )
2013-02-08 09:03:26 -08:00
f . disk . AppendSamples ( samples )
2013-03-07 11:01:32 -08:00
if false {
fmt . Printf ( "Took %s to append...\n" , time . Since ( start ) )
}
2013-02-08 09:03:26 -08:00
}
// Close flushes whatever remains in the disk queue. The receiver is a copy,
// but it refers to the same queue and disk as the original flusher.
func (f memoryToDiskFlusher) Close() {
	(&f).Flush()
}
// Persist a whole bunch of samples to the datastore.
func ( t * tieredStorage ) flushMemory ( ) {
2013-03-01 09:51:36 -08:00
begin := time . Now ( )
defer func ( ) {
2013-03-11 14:21:25 -07:00
duration := time . Since ( begin )
2013-03-01 09:51:36 -08:00
recordOutcome ( duration , nil , map [ string ] string { operation : appendSample , result : success } , map [ string ] string { operation : flushMemory , result : failure } )
} ( )
2013-02-08 09:03:26 -08:00
t . mutex . Lock ( )
defer t . mutex . Unlock ( )
flusher := & memoryToDiskFlusher {
disk : t . diskStorage ,
olderThan : time . Now ( ) . Add ( - 1 * t . memoryTTL ) ,
toDiskQueue : t . appendToDiskQueue ,
}
defer flusher . Close ( )
t . memoryArena . ForEachSample ( flusher )
return
}
2013-03-16 01:30:31 -07:00
func ( t * tieredStorage ) renderView ( viewJob viewJob ) {
// Telemetry.
var err error
2013-03-01 09:51:36 -08:00
begin := time . Now ( )
defer func ( ) {
2013-03-11 14:21:25 -07:00
duration := time . Since ( begin )
2013-02-08 09:03:26 -08:00
2013-03-16 01:30:31 -07:00
recordOutcome ( duration , err , map [ string ] string { operation : renderView , result : success } , map [ string ] string { operation : renderView , result : failure } )
2013-03-01 09:51:36 -08:00
} ( )
2013-02-08 09:03:26 -08:00
2013-03-01 09:51:36 -08:00
t . mutex . Lock ( )
defer t . mutex . Unlock ( )
2013-02-08 09:03:26 -08:00
var (
2013-03-01 09:51:36 -08:00
scans = viewJob . builder . ScanJobs ( )
2013-03-16 01:30:31 -07:00
view = newView ( )
2013-02-08 09:03:26 -08:00
)
2013-03-01 09:51:36 -08:00
// Rebuilding of the frontier should happen on a conditional basis if a
// (fingerprint, timestamp) tuple is outside of the current frontier.
err = t . rebuildDiskFrontier ( )
if err != nil {
panic ( err )
}
2013-03-16 01:30:31 -07:00
if t . diskFrontier == nil {
// Storage still empty, return an empty view.
viewJob . output <- view
return
}
2013-02-08 09:03:26 -08:00
2013-03-16 01:30:31 -07:00
// Get a single iterator that will be used for all data extraction below.
2013-03-25 02:24:59 -07:00
iterator := t . diskStorage . metricSamples . NewIterator ( true )
defer iterator . Close ( )
2013-03-01 09:51:36 -08:00
for _ , scanJob := range scans {
2013-03-16 01:30:31 -07:00
seriesFrontier , err := newSeriesFrontier ( scanJob . fingerprint , * t . diskFrontier , iterator )
if err != nil {
panic ( err )
}
if seriesFrontier == nil {
continue
}
standingOps := scanJob . operations
for len ( standingOps ) > 0 {
// Load data value chunk(s) around the first standing op's current time.
highWatermark := * standingOps [ 0 ] . CurrentTime ( )
2013-03-18 13:01:22 -07:00
// XXX: For earnest performance gains analagous to the benchmarking we
// performed, chunk should only be reloaded if it no longer contains
// the values we're looking for.
//
// To better understand this, look at https://github.com/prometheus/prometheus/blob/benchmark/leveldb/iterator-seek-characteristics/leveldb.go#L239 and note the behavior around retrievedValue.
2013-03-16 01:30:31 -07:00
chunk := t . loadChunkAroundTime ( iterator , seriesFrontier , scanJob . fingerprint , highWatermark )
lastChunkTime := chunk [ len ( chunk ) - 1 ] . Timestamp
if lastChunkTime . After ( highWatermark ) {
highWatermark = lastChunkTime
2013-03-01 09:51:36 -08:00
}
2013-02-08 09:03:26 -08:00
2013-03-16 01:30:31 -07:00
// For each op, extract all needed data from the current chunk.
out := [ ] model . SamplePair { }
for _ , op := range standingOps {
if op . CurrentTime ( ) . After ( highWatermark ) {
break
}
for op . CurrentTime ( ) != nil && ! op . CurrentTime ( ) . After ( highWatermark ) {
out = op . ExtractSamples ( chunk )
2013-02-08 09:03:26 -08:00
}
}
2013-03-16 01:30:31 -07:00
// Append the extracted samples to the materialized view.
for _ , sample := range out {
view . appendSample ( scanJob . fingerprint , sample . Timestamp , sample . Value )
}
// Throw away standing ops which are finished.
filteredOps := ops { }
for _ , op := range standingOps {
if op . CurrentTime ( ) != nil {
filteredOps = append ( filteredOps , op )
}
}
standingOps = filteredOps
// Sort ops by start time again, since they might be slightly off now.
// For example, consider a current chunk of values and two interval ops
2013-03-19 06:25:38 -07:00
// with different interval lengths. Their states after the cycle above
2013-03-16 01:30:31 -07:00
// could be:
//
// (C = current op time)
//
// Chunk: [ X X X X X ]
// Op 1: [ X X C . . . ]
// Op 2: [ X X C . . .]
//
// Op 2 now has an earlier current time than Op 1.
2013-03-16 01:41:43 -07:00
sort . Sort ( startsAtSort { standingOps } )
2013-02-08 09:03:26 -08:00
}
2013-03-16 01:30:31 -07:00
}
viewJob . output <- view
return
}
2013-02-08 09:03:26 -08:00
2013-03-25 02:24:59 -07:00
func ( t * tieredStorage ) loadChunkAroundTime ( iterator leveldb . Iterator , frontier * seriesFrontier , fingerprint model . Fingerprint , ts time . Time ) ( chunk [ ] model . SamplePair ) {
2013-03-16 01:30:31 -07:00
var (
targetKey = & dto . SampleKey {
Fingerprint : fingerprint . ToDTO ( ) ,
}
foundKey = & dto . SampleKey { }
foundValue * dto . SampleValueSeries
)
// Limit the target key to be within the series' keyspace.
if ts . After ( frontier . lastSupertime ) {
targetKey . Timestamp = indexable . EncodeTime ( frontier . lastSupertime )
} else {
targetKey . Timestamp = indexable . EncodeTime ( ts )
2013-02-08 09:03:26 -08:00
}
2013-03-01 09:51:36 -08:00
2013-03-16 01:30:31 -07:00
// Try seeking to target key.
rawKey , _ := coding . NewProtocolBufferEncoder ( targetKey ) . Encode ( )
iterator . Seek ( rawKey )
2013-03-01 09:51:36 -08:00
2013-03-16 01:30:31 -07:00
foundKey , err := extractSampleKey ( iterator )
if err != nil {
panic ( err )
}
// Figure out if we need to rewind by one block.
// Imagine the following supertime blocks with time ranges:
//
// Block 1: ft 1000 - lt 1009 <data>
// Block 1: ft 1010 - lt 1019 <data>
//
// If we are aiming to find time 1005, we would first seek to the block with
// supertime 1010, then need to rewind by one block by virtue of LevelDB
// iterator seek behavior.
//
// Only do the rewind if there is another chunk before this one.
rewound := false
firstTime := indexable . DecodeTime ( foundKey . Timestamp )
if ts . Before ( firstTime ) && ! frontier . firstSupertime . After ( ts ) {
2013-03-25 02:24:59 -07:00
iterator . Previous ( )
2013-03-16 01:30:31 -07:00
rewound = true
}
foundValue , err = extractSampleValues ( iterator )
if err != nil {
panic ( err )
}
// If we rewound, but the target time is still past the current block, return
// the last value of the current (rewound) block and the entire next block.
if rewound {
foundKey , err = extractSampleKey ( iterator )
if err != nil {
panic ( err )
}
currentChunkLastTime := time . Unix ( * foundKey . LastTimestamp , 0 )
if ts . After ( currentChunkLastTime ) {
sampleCount := len ( foundValue . Value )
chunk = append ( chunk , model . SamplePair {
Timestamp : time . Unix ( * foundValue . Value [ sampleCount - 1 ] . Timestamp , 0 ) ,
Value : model . SampleValue ( * foundValue . Value [ sampleCount - 1 ] . Value ) ,
} )
// We know there's a next block since we have rewound from it.
iterator . Next ( )
foundValue , err = extractSampleValues ( iterator )
if err != nil {
panic ( err )
}
}
}
// Now append all the samples of the currently seeked block to the output.
for _ , sample := range foundValue . Value {
chunk = append ( chunk , model . SamplePair {
Timestamp : time . Unix ( * sample . Timestamp , 0 ) ,
Value : model . SampleValue ( * sample . Value ) ,
} )
}
2013-03-06 17:16:39 -08:00
2013-03-01 09:51:36 -08:00
return
2013-02-08 09:03:26 -08:00
}
2013-03-21 09:59:42 -07:00
// GetAllMetricNames proxies to the disk persistence; samples that only exist
// in memory are not consulted.
func (t *tieredStorage) GetAllMetricNames() ([]string, error) {
	// TODO: handle memory persistence as well.
	disk := t.diskStorage

	return disk.GetAllMetricNames()
}
// GetFingerprintsForLabelSet proxies to the disk persistence; samples that
// only exist in memory are not consulted.
func (t *tieredStorage) GetFingerprintsForLabelSet(labelSet model.LabelSet) (model.Fingerprints, error) {
	// TODO: handle memory persistence as well.
	disk := t.diskStorage

	return disk.GetFingerprintsForLabelSet(labelSet)
}
// GetMetricForFingerprint proxies to the disk persistence; samples that only
// exist in memory are not consulted.
func (t *tieredStorage) GetMetricForFingerprint(f model.Fingerprint) (m *model.Metric, err error) {
	// TODO: handle memory persistence as well.
	m, err = t.diskStorage.GetMetricForFingerprint(f)

	return
}