// Package tsdb implements a time series storage for float64 sample data.
package tsdb

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"math"
	"os"
	"path/filepath"
	"reflect"
	"strconv"
	"strings"
	"sync"
	"time"
	"unsafe"

	"golang.org/x/sync/errgroup"

	"github.com/coreos/etcd/pkg/fileutil"
	"github.com/fabxc/tsdb/labels"
	"github.com/go-kit/kit/log"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
)

// DefaultOptions used for the DB. They are sane for setups using
// millisecond precision timestamps.
var DefaultOptions = &Options{
	Retention:  15 * 24 * 3600 * 1000, // 15 days
	DisableWAL: false,
}

// Options of the DB storage.
type Options struct {
	Retention        int64
	DisableWAL       bool
	WALFlushInterval time.Duration
}
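
// For illustration only: a caller wanting a 30 day retention could open a
// four-partition storage like this (the directory path is a placeholder):
//
//	opts := &Options{Retention: 30 * 24 * 3600 * 1000}
//	db, err := OpenPartitioned("data", 4, nil, opts)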

// Appender allows appending a batch of data. It must be completed with a
// call to Commit or Rollback and must not be reused afterwards.
type Appender interface {
	// SetSeries ensures that a series with the given label set exists and
	// returns a unique reference number identifying it. Returned reference
	// numbers are ephemeral and may be rejected in calls to Add() at any point.
	// A new reference number can then be requested with another call to
	// SetSeries.
	SetSeries(labels.Labels) (uint64, error)

	// Add adds a sample pair for the referenced series.
	Add(ref uint64, t int64, v float64) error

	// Commit submits the collected samples and purges the batch.
	Commit() error

	// Rollback rolls back all modifications made in the appender so far.
	Rollback() error
}
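
// A sketch of the intended write path (illustrative only; lset stands in
// for some labels.Labels value and error handling is elided). Because
// references are ephemeral, an Add rejected with errNotFound should be
// retried with a fresh reference from another SetSeries call:
//
//	app := db.Appender()
//	ref, _ := app.SetSeries(lset)
//	_ = app.Add(ref, t, v)
//	_ = app.Commit()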

type hashedSample struct {
	hash   uint64
	labels labels.Labels
	ref    uint32

	t int64
	v float64
}

const sep = '\xff'

// DB handles reads and writes of time series falling into
// a hashed partition of a series set.
type DB struct {
	dir     string
	logger  log.Logger
	metrics *dbMetrics

	mtx       sync.RWMutex
	persisted []*persistedBlock
	heads     []*headBlock
	headGen   uint8

	compactor *compactor

	compactc chan struct{}
	cutc     chan struct{}
	donec    chan struct{}
	stopc    chan struct{}
}
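
// mtx guards the persisted and heads slices. Appenders hold the read lock
// for their whole lifetime (acquired in Appender, released in Commit or
// Rollback), so blocks cannot be swapped out under an in-flight batch.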

type dbMetrics struct {
	samplesAppended      prometheus.Counter
	compactionsTriggered prometheus.Counter
}

func newDBMetrics(r prometheus.Registerer) *dbMetrics {
	m := &dbMetrics{}

	m.samplesAppended = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "tsdb_samples_appended_total",
		Help: "Total number of appended samples.",
	})
	m.compactionsTriggered = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "tsdb_compactions_triggered_total",
		Help: "Total number of triggered compactions for the partition.",
	})

	if r != nil {
		r.MustRegister(
			m.samplesAppended,
			m.compactionsTriggered,
		)
	}
	return m
}

// Open returns a new DB in the given directory.
func Open(dir string, logger log.Logger) (db *DB, err error) {
	// Create directory if partition is new.
	if !fileutil.Exist(dir) {
		if err := os.MkdirAll(dir, 0777); err != nil {
			return nil, err
		}
	}
	var r prometheus.Registerer
	// r := prometheus.DefaultRegisterer

	db = &DB{
		dir:      dir,
		logger:   logger,
		metrics:  newDBMetrics(r),
		compactc: make(chan struct{}, 1),
		cutc:     make(chan struct{}, 1),
		donec:    make(chan struct{}),
		stopc:    make(chan struct{}),
	}

	if err := db.initBlocks(); err != nil {
		return nil, err
	}
	if db.compactor, err = newCompactor(db, r); err != nil {
		return nil, err
	}

	go db.run()

	return db, nil
}
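
// run is the DB's background loop. It serializes head cuts and compactions
// and exits when stopc is closed.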
func (db *DB) run() {
	defer close(db.donec)

	for {
		select {
		case <-db.cutc:
			db.mtx.Lock()
			err := db.cut()
			db.mtx.Unlock()

			if err != nil {
				db.logger.Log("msg", "cut failed", "err", err)
			} else {
				select {
				case db.compactc <- struct{}{}:
				default:
				}
			}
			// Drain cut channel so we don't trigger immediately again.
			select {
			case <-db.cutc:
			default:
			}

		case <-db.compactc:
			db.metrics.compactionsTriggered.Inc()

			for {
				blocks := db.compactor.pick()
				if len(blocks) == 0 {
					break
				}
				// TODO(fabxc): pick emits blocks in order. compact acts on
				// inverted order. Put inversion into compactor?
				var bs []Block
				for _, b := range blocks {
					bs = append([]Block{b}, bs...)
				}

				select {
				case <-db.stopc:
					return
				default:
				}
				if err := db.compact(bs); err != nil {
					db.logger.Log("msg", "compaction failed", "err", err)
				}
			}

		case <-db.stopc:
			return
		}
	}
}

func (db *DB) compact(blocks []Block) error {
	if len(blocks) == 0 {
		return nil
	}
	tmpdir := blocks[0].Dir() + ".tmp"

	// TODO(fabxc): find a better place to do this transparently.
	for _, b := range blocks {
		if h, ok := b.(*headBlock); ok {
			h.updateMapping()
		}
	}

	if err := db.compactor.compact(tmpdir, blocks...); err != nil {
		return err
	}

	db.mtx.Lock()
	defer db.mtx.Unlock()

	if err := renameDir(tmpdir, blocks[0].Dir()); err != nil {
		return errors.Wrap(err, "rename dir")
	}
	for _, b := range blocks[1:] {
		if err := os.RemoveAll(b.Dir()); err != nil {
			return errors.Wrap(err, "delete dir")
		}
	}

	var merr MultiError

	for _, b := range blocks {
		merr.Add(errors.Wrapf(db.reinit(b.Dir()), "reinit block at %q", b.Dir()))
	}
	return merr.Err()
}

func isBlockDir(fi os.FileInfo) bool {
	if !fi.IsDir() {
		return false
	}
	if !strings.HasPrefix(fi.Name(), "b-") {
		return false
	}
	if _, err := strconv.ParseUint(fi.Name()[2:], 10, 32); err != nil {
		return false
	}
	return true
}

func (db *DB) initBlocks() error {
	var (
		pbs   []*persistedBlock
		heads []*headBlock
	)

	files, err := ioutil.ReadDir(db.dir)
	if err != nil {
		return err
	}

	for _, fi := range files {
		if !isBlockDir(fi) {
			continue
		}
		dir := filepath.Join(db.dir, fi.Name())

		if fileutil.Exist(filepath.Join(dir, walFileName)) {
			h, err := openHeadBlock(dir, db.logger)
			if err != nil {
				return err
			}
			heads = append(heads, h)
			continue
		}

		b, err := newPersistedBlock(dir)
		if err != nil {
			return err
		}
		pbs = append(pbs, b)
	}

	// Validate that blocks are sequential in time.
	lastTime := int64(math.MinInt64)

	for _, b := range pbs {
		if b.Stats().MinTime < lastTime {
			return errors.Errorf("illegal order for block at %q", b.Dir())
		}
		lastTime = b.Stats().MaxTime
	}
	for _, b := range heads {
		if b.Stats().MinTime < lastTime {
			return errors.Errorf("illegal order for block at %q", b.Dir())
		}
		lastTime = b.Stats().MaxTime
	}

	db.persisted = pbs
	db.heads = heads

	if len(heads) == 0 {
		return db.cut()
	}
	return nil
}

// Close the partition.
func (db *DB) Close() error {
	close(db.stopc)
	<-db.donec

	var merr MultiError

	db.mtx.Lock()
	defer db.mtx.Unlock()

	for _, pb := range db.persisted {
		merr.Add(pb.Close())
	}
	for _, hb := range db.heads {
		merr.Add(hb.Close())
	}

	return merr.Err()
}

func (db *DB) Appender() Appender {
	db.mtx.RLock()

	return &dbAppender{
		db:   db,
		head: db.heads[len(db.heads)-1].Appender().(*headAppender),
		gen:  db.headGen,
	}
}
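
// A dbAppender packs routing information into the series references it hands
// out. Derived from the shifts in this file (not an authoritative spec):
// bits 48-55 carry the partition number (set by partitionedAppender), bits
// 40-47 carry the head generation, and the low bits hold the per-head series
// reference.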
type dbAppender struct {
	db   *DB
	gen  uint8
	head *headAppender
}

func (a *dbAppender) SetSeries(lset labels.Labels) (uint64, error) {
	ref, err := a.head.SetSeries(lset)
	if err != nil {
		return 0, err
	}
	return ref | (uint64(a.gen) << 40), nil
}

func (a *dbAppender) setSeries(hash uint64, lset labels.Labels) (uint64, error) {
	ref, err := a.head.setSeries(hash, lset)
	if err != nil {
		return 0, err
	}
	return ref | (uint64(a.gen) << 40), nil
}

func (a *dbAppender) Add(ref uint64, t int64, v float64) error {
	// We store the head generation in bits 40-47 of the reference and use
	// it to reject stale references.
	gen := uint8((ref << 16) >> 56)

	if gen != a.gen {
		return errNotFound
	}
	return a.head.Add(ref, t, v)
}

func (a *dbAppender) Commit() error {
	defer a.db.mtx.RUnlock()

	err := a.head.Commit()

	if a.head.headBlock.fullness() > 1.0 {
		select {
		case a.db.cutc <- struct{}{}:
		default:
		}
	}
	return err
}

func (a *dbAppender) Rollback() error {
	err := a.head.Rollback()
	a.db.mtx.RUnlock()
	return err
}

func (db *DB) headForDir(dir string) (int, bool) {
	for i, b := range db.heads {
		if b.Dir() == dir {
			return i, true
		}
	}
	return -1, false
}

func (db *DB) persistedForDir(dir string) (int, bool) {
	for i, b := range db.persisted {
		if b.Dir() == dir {
			return i, true
		}
	}
	return -1, false
}

func (db *DB) reinit(dir string) error {
	if !fileutil.Exist(dir) {
		if i, ok := db.headForDir(dir); ok {
			if err := db.heads[i].Close(); err != nil {
				return err
			}
			db.heads = append(db.heads[:i], db.heads[i+1:]...)
		}
		if i, ok := db.persistedForDir(dir); ok {
			if err := db.persisted[i].Close(); err != nil {
				return err
			}
			db.persisted = append(db.persisted[:i], db.persisted[i+1:]...)
		}
		return nil
	}

	// Remove a previous head block.
	if i, ok := db.headForDir(dir); ok {
		if err := db.heads[i].Close(); err != nil {
			return err
		}
		db.heads = append(db.heads[:i], db.heads[i+1:]...)
	}
	// Close an old persisted block.
	i, ok := db.persistedForDir(dir)
	if ok {
		if err := db.persisted[i].Close(); err != nil {
			return err
		}
	}
	pb, err := newPersistedBlock(dir)
	if err != nil {
		return errors.Wrap(err, "open persisted block")
	}
	if i >= 0 {
		db.persisted[i] = pb
	} else {
		db.persisted = append(db.persisted, pb)
	}

	return nil
}

func (db *DB) compactable() []Block {
	db.mtx.RLock()
	defer db.mtx.RUnlock()

	var blocks []Block
	for _, pb := range db.persisted {
		blocks = append([]Block{pb}, blocks...)
	}

	// threshold := db.heads[len(db.heads)-1].bstats.MaxTime - headGracePeriod

	// for _, hb := range db.heads {
	// 	if hb.bstats.MaxTime < threshold {
	// 		blocks = append(blocks, hb)
	// 	}
	// }
	for _, hb := range db.heads[:len(db.heads)-1] {
		blocks = append([]Block{hb}, blocks...)
	}

	return blocks
}
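
// intervalOverlap reports whether the inclusive ranges [amin, amax] and
// [bmin, bmax] intersect.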
func intervalOverlap(amin, amax, bmin, bmax int64) bool {
	if bmin >= amin && bmin <= amax {
		return true
	}
	if amin >= bmin && amin <= bmax {
		return true
	}
	return false
}

func intervalContains(min, max, t int64) bool {
	return t >= min && t <= max
}

// blocksForInterval returns all blocks within the partition that may contain
// data for the given time range.
func (db *DB) blocksForInterval(mint, maxt int64) []Block {
	var bs []Block

	for _, b := range db.persisted {
		s := b.Stats()
		if intervalOverlap(mint, maxt, s.MinTime, s.MaxTime) {
			bs = append(bs, b)
		}
	}
	for _, b := range db.heads {
		s := b.Stats()
		if intervalOverlap(mint, maxt, s.MinTime, s.MaxTime) {
			bs = append(bs, b)
		}
	}

	return bs
}

// TODO(fabxc): make configurable.
const headGracePeriod = 60 * 1000 // 60 seconds for millisecond scale

// cut starts a new head block to append to. The completed head block
// will still be appendable for the configured grace period.
func (db *DB) cut() error {
	dir, err := db.nextBlockDir()
	if err != nil {
		return err
	}
	newHead, err := openHeadBlock(dir, db.logger)
	if err != nil {
		return err
	}
	db.heads = append(db.heads, newHead)
	db.headGen++

	return nil
}

func (db *DB) nextBlockDir() (string, error) {
	names, err := fileutil.ReadDir(db.dir)
	if err != nil {
		return "", err
	}

	i := uint64(0)
	for _, n := range names {
		if !strings.HasPrefix(n, "b-") {
			continue
		}
		j, err := strconv.ParseUint(n[2:], 10, 32)
		if err != nil {
			continue
		}
		i = j
	}
	return filepath.Join(db.dir, fmt.Sprintf("b-%0.6d", i+1)), nil
}

// PartitionedDB is a time series storage.
type PartitionedDB struct {
	logger log.Logger
	opts   *Options
	dir    string

	partitionPow uint
	Partitions   []*DB
}
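
// isPowTwo reports whether x is a positive power of two: x&(x-1) clears the
// lowest set bit, so the result is zero exactly when x has a single bit set.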
func isPowTwo(x int) bool {
	return x > 0 && (x&(x-1)) == 0
}

// OpenPartitioned opens or creates a partitioned DB in the given directory.
func OpenPartitioned(dir string, n int, l log.Logger, opts *Options) (*PartitionedDB, error) {
	if !isPowTwo(n) {
		return nil, errors.Errorf("%d is not a power of two", n)
	}
	if opts == nil {
		opts = DefaultOptions
	}
	if l == nil {
		l = log.NewLogfmtLogger(os.Stdout)
		l = log.NewContext(l).With("ts", log.DefaultTimestampUTC, "caller", log.DefaultCaller)
	}

	if err := os.MkdirAll(dir, 0777); err != nil {
		return nil, err
	}
	c := &PartitionedDB{
		logger:       l,
		opts:         opts,
		dir:          dir,
		partitionPow: uint(math.Log2(float64(n))),
	}

	// Initialize vertical partitions. The partition count was validated
	// above to be a power of two, which is required for the bitshift-modulo
	// when finding the right partition.
	for i := 0; i < n; i++ {
		l := log.NewContext(l).With("partition", i)
		d := partitionDir(dir, i)

		s, err := Open(d, l)
		if err != nil {
			return nil, fmt.Errorf("initializing partition %q failed: %s", d, err)
		}

		c.Partitions = append(c.Partitions, s)
	}

	return c, nil
}

func partitionDir(base string, i int) string {
	return filepath.Join(base, fmt.Sprintf("p-%0.4d", i))
}

// Close the database.
func (db *PartitionedDB) Close() error {
	var g errgroup.Group

	for _, partition := range db.Partitions {
		g.Go(partition.Close)
	}

	return g.Wait()
}

// Appender returns a new appender against the database.
func (db *PartitionedDB) Appender() Appender {
	app := &partitionedAppender{db: db}

	for _, p := range db.Partitions {
		app.partitions = append(app.partitions, p.Appender().(*dbAppender))
	}
	return app
}

type partitionedAppender struct {
	db         *PartitionedDB
	partitions []*dbAppender
}
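
// SetSeries routes a series to a partition by its label hash: the top
// partitionPow bits of the hash select the partition, which is why the
// partition count must be a power of two.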
func (a *partitionedAppender) SetSeries(lset labels.Labels) (uint64, error) {
	h := lset.Hash()
	p := h >> (64 - a.db.partitionPow)

	ref, err := a.partitions[p].setSeries(h, lset)
	if err != nil {
		return 0, err
	}
	return ref | (p << 48), nil
}

func (a *partitionedAppender) Add(ref uint64, t int64, v float64) error {
	p := uint8((ref << 8) >> 56)
	return a.partitions[p].Add(ref, t, v)
}

func (a *partitionedAppender) Commit() error {
	var merr MultiError

	for _, p := range a.partitions {
		merr.Add(p.Commit())
	}
	return merr.Err()
}

func (a *partitionedAppender) Rollback() error {
	var merr MultiError

	for _, p := range a.partitions {
		merr.Add(p.Rollback())
	}
	return merr.Err()
}

// The MultiError type implements the error interface, and contains the
// Errors used to construct it.
type MultiError []error

// Error returns a concatenated string of the contained errors.
func (es MultiError) Error() string {
	var buf bytes.Buffer

	if len(es) > 1 {
		fmt.Fprintf(&buf, "%d errors: ", len(es))
	}

	for i, err := range es {
		if i != 0 {
			buf.WriteString("; ")
		}
		buf.WriteString(err.Error())
	}

	return buf.String()
}

// Add adds the error to the error list if it is not nil.
func (es *MultiError) Add(err error) {
	if err == nil {
		return
	}
	if merr, ok := err.(MultiError); ok {
		*es = append(*es, merr...)
	} else {
		*es = append(*es, err)
	}
}

// Err returns the error list as an error or nil if it is empty.
func (es MultiError) Err() error {
	if len(es) == 0 {
		return nil
	}
	return es
}
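
// Typical use of MultiError (illustrative; closeA and closeB are placeholder
// functions returning error):
//
//	var merr MultiError
//	merr.Add(closeA())
//	merr.Add(closeB())
//	return merr.Err() // nil if nothing was collected

// yoloString and yoloBytes below convert between string and []byte without
// copying, by aliasing the underlying memory through reflect headers. The
// results share memory with their inputs and must be treated as immutable.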
func yoloString(b []byte) string {
	sh := (*reflect.SliceHeader)(unsafe.Pointer(&b))

	h := reflect.StringHeader{
		Data: sh.Data,
		Len:  sh.Len,
	}
	return *((*string)(unsafe.Pointer(&h)))
}

func yoloBytes(s string) []byte {
	sh := (*reflect.StringHeader)(unsafe.Pointer(&s))

	h := reflect.SliceHeader{
		Cap:  sh.Len,
		Len:  sh.Len,
		Data: sh.Data,
	}
	return *((*[]byte)(unsafe.Pointer(&h)))
}